# Importing Data in Python

Table of Contents
* [Text Files](#text)
* [Flat files](#flat)
* [Pickled Files](#pickled)
* [Excel Files](#excel)
* [SAS files](#sas)
* [Stata files](#stata)
* [HDF5 files](#hdf5)
* [MATLAB files](#matlab)

## <a name="text"></a>Text Files

In [None]:
filename = 'huck_finn.txt'

# Open the file for reading ('r'); pass 'w' instead to open it for writing.
file = open(filename, 'r')
text = file.read()  # read() with no argument returns the entire contents
file.close()  # a manually opened file has to be closed by hand

Using a context manager (a `with` statement) avoids the need to call `file.close()` explicitly: the file is closed automatically when the block ends.

In [None]:
# Open the file in a context manager so it is closed automatically when the
# block exits, even if reading raises an exception.
with open('huck_finn.txt', 'r') as file:
    # read() returns everything from the current position to the end of file.
    print(file.read())
    # After read() the position is at end-of-file, so readline() would only
    # return ''. Rewind to the start before reading the first line.
    file.seek(0)
    print(file.readline())

## <a name="flat"></a>Flat Files
Text files containing records. Rows represent records, and columns contain attributes.

In [None]:
import pandas as pd

filename = 'winequality-red.csv'

# Load the flat file into a DataFrame, showing some common read_csv options.
data = pd.read_csv(
    filename,
    header=None,          # if no header row is present
    nrows=5,              # read the first five rows only (the keyword is nrows)
    comment='#',          # treat the rest of a line after '#' as a comment
    na_values='Nothing',  # custom string to be recognised as NA/NaN
)

# Print explicitly: a bare expression that is not the last line of a cell
# produces no output.
print(data.head())

data_array = data.values  # creates a numpy array, rather than importing with NumPy

## <a name="pickled"></a>Pickled Files
Pickling an object means serializing it, i.e. converting the object into a sequence of bytes.

In [None]:
import pickle

# Pickles are stored as bytes, so the file is opened in read-binary mode.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('data.pkl', 'rb') as file:
    d = pickle.load(file)

print(d)

## <a name="excel"></a>Excel Files

In [None]:
import pandas as pd

file = 'battledeath.xlsx'
xls = pd.ExcelFile(file)

# List the names of the sheets contained in the workbook.
print(xls.sheet_names)

# A sheet can be loaded into a DataFrame by its name or by its position.
df1 = xls.parse('2004')  # by name
df2 = xls.parse(0)  # by index

# parse() accepts further options; here the first sheet is loaded again and
# the result replaces df1.
df1 = xls.parse(
    0,
    skiprows=[1],  # list of 0-based line numbers to skip (here: line index 1)
    names=['Country', 'AAM due to War (2002)'],  # column names to use, as a list
    usecols=[0, 1],  # columns to keep, given as a list of positions
)

## <a name="sas"></a>SAS files

In [None]:
from sas7bdat import SAS7BDAT

# SAS data sets use the .sas7bdat extension (no trailing 'a').
# The context manager closes the file once the DataFrame has been built.
with SAS7BDAT('sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

print(df_sas.head())

## <a name="stata"></a>Stata files

In [None]:
import pandas as pd

# Read the Stata file 'urbanpop.dta' with pandas and bind the result to `data`.
data = pd.read_stata('urbanpop.dta')

## <a name="hdf5"></a>HDF5 files
For very large hierarchical datasets

In [None]:
import numpy as np
import h5py

file = 'LIGO_data.hdf5'

# Open the HDF5 file read-only; the returned object is indexed by key below.
data = h5py.File(file, 'r')
print(type(data))

# Print the keys of the file
for key in data:
    print(key)
#>>> meta, quality, strain

group = data['strain']

# Print the keys of the group
for key in group:
    print(key)
#>>> Strain

# Set variable equal to time series data: strain.
# Dataset.value was deprecated and removed in h5py 3.0; indexing with an
# empty tuple reads the whole dataset into a NumPy array instead.
strain = group['Strain'][()]

## <a name="matlab"></a>MATLAB files

In [None]:
import scipy.io

import numpy as np

# Load the .mat file and show the type of the object loadmat returns.
mat = scipy.io.loadmat('albeck_gene_expression.mat')
print(type(mat))

# Show the keys available in the loaded object.
print(mat.keys())
#>>>dict_keys(['__header__', '__version__', '__globals__', 'rfpCyt', 'rfpNuc', 'cfpNuc', 'cfpCyt', 'yfpNuc', 'yfpCyt', 'CYratioCyt'])

# Inspect one entry: first its type, then its shape.
print(type(mat['CYratioCyt']))
#>>> <class 'numpy.ndarray'>

print(np.shape(mat['CYratioCyt']))