# Data acquisition

## Plain text file handling

In [63]:
# imports
import csv
import numpy as np
import pandas as pd
import openpyxl
import matplotlib.cbook as cbook

In [11]:
# Modes:
# w - write
# r - read
# a - append
# r+ - read and write

f = open('testfile.txt', 'w')
print(f)

<_io.TextIOWrapper name='testfile.txt' mode='w' encoding='cp1252'>


In [12]:
f.write('This is first line.\nThis is second line.\n')
f.write('This is the last line.')

22

In [13]:
f.close()

In [14]:
f = open('testfile.txt', 'a')
f.write('\nThis is appended line.')
f.close()

In [26]:
f = open('testfile.txt', 'r')
print(f.read())
f.close()

This is first line.
This is second line.
This is the last line.
This is appended line.


In [27]:
f = open('testfile.txt', 'r')
print(f.readlines())
f.close()

['This is first line.\n', 'This is second line.\n', 'This is the last line.\n', 'This is appended line.']


In [28]:
f = open('testfile.txt', 'r')
for line in f:
    print(line)
f.close()

This is first line.

This is second line.

This is the last line.

This is appended line.


## Handling CSV files

In [32]:
file = open('test.csv', 'w')
file.close()

In [49]:
file = open('test.csv', 'r')
print(file)
print(file.read())
file.close()

<_io.TextIOWrapper name='test.csv' mode='r' encoding='cp1252'>
Banana,Yellow,250
Orange,Orange,200
Grapes,Green,400
Tomato,Red,100
Spinach,Green,40
Potatoes,Grey,400
Rice,White,300
Rice,Brown,400
Wheat,Brown,500
Barley,Yellow,500



In [54]:
file = open('test.csv', 'r')
csvfile = csv.reader(file, delimiter = ',')
print(csvfile)

<_csv.reader object at 0x00000237C036DCA0>


In [55]:
row_count = 0
for row in csvfile:
    row_count += 1
    print(row)
print(row_count)
file.close()

['Banana', 'Yellow', '250']
['Orange', 'Orange', '200']
['Grapes', 'Green', '400']
['Tomato', 'Red', '100']
['Spinach', 'Green', '40']
['Potatoes', 'Grey', '400']
['Rice', 'White', '300']
['Rice', 'Brown', '400']
['Wheat', 'Brown', '500']
['Barley', 'Yellow', '500']
10


In [62]:
file = open('test.csv', 'r')
csvfile = csv.reader(file, delimiter = ',')
print(file, csvfile)

<_io.TextIOWrapper name='test.csv' mode='r' encoding='cp1252'> <_csv.reader object at 0x00000237C03792E0>


In [63]:
elements_count = 0
for row in csvfile:
    for element in row:
        print(element)
        elements_count += 1
file.close()
print("Total number of elements in csv file: ", elements_count)

Banana
Yellow
250
Orange
Orange
200
Grapes
Green
400
Tomato
Red
100
Spinach
Green
40
Potatoes
Grey
400
Rice
White
300
Rice
Brown
400
Wheat
Brown
500
Barley
Yellow
500
Total number of elements in csv file:  30


## Python and Excel: Convert csv to xlsx

In [6]:
df = pd.read_csv(r'test.csv')
df.to_csv("test.csv", header=["Food item", "Color", "Weight"], index=False)

df.to_excel(r'test.xlsx', index = None, header=True)

In [7]:
import openpyxl

In [25]:
wb = openpyxl.load_workbook('test.xlsx')
print(wb)
print(type(wb))

<openpyxl.workbook.workbook.Workbook object at 0x0000013830334490>
<class 'openpyxl.workbook.workbook.Workbook'>


In [20]:
print(wb.sheetnames) # returns a list of sheets

['Sheet1', 'Sheet2', 'Sheet3']


In [17]:
currSheet = wb['Sheet1']
print(currSheet)
print(type(currSheet))

<Worksheet "Sheet1">
<class 'openpyxl.worksheet.worksheet.Worksheet'>


In [26]:
currSheet = wb[wb.sheetnames[0]]
print(currSheet)
print(type(currSheet))
print(currSheet.title)

<Worksheet "Sheet1">
<class 'openpyxl.worksheet.worksheet.Worksheet'>
Sheet1


In [27]:
var1 = currSheet['A1']
print(var1.value)

Food item


In [28]:
print(currSheet['B1'].value)

Color


In [29]:
var2 = currSheet.cell(row = 2, column = 2)
print(var2.value)

Orange


In [31]:
print(currSheet.max_row)
print(currSheet.max_column)

10
3


In [32]:
var3 = currSheet.cell(row = 10, column = 3)
print(var3.value)

500


In [44]:
for row in range (currSheet.max_row):
    print("---Beginning of row---")
    for column in range(currSheet.max_column):
        var = currSheet.cell(row = row + 1, column = column + 1)
        print(var.value)
    print("---End of row---\n")

---Beginning of row---
Food item
Color
Weight
---End of row---

---Beginning of row---
Orange
Orange
200
---End of row---

---Beginning of row---
Grapes
Green
400
---End of row---

---Beginning of row---
Tomato
Red
100
---End of row---

---Beginning of row---
Spinach
Green
40
---End of row---

---Beginning of row---
Potatoes
Grey
400
---End of row---

---Beginning of row---
Rice
White
300
---End of row---

---Beginning of row---
Rice
Brown
400
---End of row---

---Beginning of row---
Wheat
Brown
500
---End of row---

---Beginning of row---
Barley
Yellow
500
---End of row---



## Numpy writing and reading files

In [47]:
x = np.arange(100)
print(x)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [48]:
np.save('test.npy', x)

In [51]:
data = np.load('test.npy')
print(data)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [59]:
data_csv = np.loadtxt('data.csv', delimiter = ',')
print(data_csv)
print(type(data_csv))

[[  0.   1.  18.   2.]
 [  1.  66.   1.   3.]
 [  2.   3. 154.   6.]
 [  4. 978.   3.   6.]
 [  5.   2.  41.  45.]
 [  6.  67.   2.   3.]
 [  7.   5.  67.   2.]]
<class 'numpy.ndarray'>


In [62]:
data_csv = np.loadtxt('data.csv', delimiter = ',',
                     skiprows = 3, usecols = [1, 3]) # skip first 3 rows, and use 1 and 3 column
print(data_csv)

[[978.   6.]
 [  2.  45.]
 [ 67.   3.]
 [  5.   2.]]


## Matplotlib CBook

In [69]:
# Used for reading built-in data files
datafile = cbook.get_sample_data('aapl.npz')
r = np.load(datafile)

  datafile = cbook.get_sample_data('aapl.npz')


In [65]:
print(r.files)

['price_data']


In [67]:
print(r['price_data'])

[('1984-09-07',  26.5 ,  26.87,  26.25,  26.5 ,  2981600,   3.02)
 ('1984-09-10',  26.5 ,  26.62,  25.87,  26.37,  2346400,   3.01)
 ('1984-09-11',  26.62,  27.37,  26.62,  26.87,  5444000,   3.07) ...
 ('2008-10-10',  85.7 , 100.  ,  85.  ,  96.8 , 79260700,  96.8 )
 ('2008-10-13', 104.55, 110.53, 101.02, 110.26, 54967000, 110.26)
 ('2008-10-14', 116.26, 116.4 , 103.14, 104.08, 70749800, 104.08)]
