## Importing Data in Python

### CSV files

In [5]:
import numpy as np

# Read every field as text so the header row and the empty cells of the raw
# file stay visible in the preview.
csv_path = 'datasets/bike_sharing.csv'
data = np.genfromtxt(csv_path, dtype='str', delimiter=',')

# Preview: header row plus the first four records.
data[:5]

array([['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
        'atemp', 'humidity', 'windspeed', 'casual', 'registered',
        'count'],
       ['1/1/2011 0:00', '1', '0', '0', '1', '9.84', '14.395', '81', '0',
        '3', '13', '16'],
       ['1/1/2011 1:00', '1', '0', '0', '1', '', '', '', '0', '8', '32',
        '40'],
       ['1/1/2011 2:00', '1', '0', '0', '1', '9.02', '13.635', '80', '0',
        '5', '27', '32'],
       ['1/1/2011 3:00', '1', '0', '0', '1', '9.84', '', '75', '0', '3',
        '10', '13']], dtype='<U16')

In [6]:
# Numeric load of the bike-sharing CSV.
#   skip_header=1    drops the row of column names.
#   usecols=1..11    leaves out column 0 (the datetime text), which is not a float.
#   filling_values=0 is the value written wherever a field is missing.
# genfromtxt already treats an empty field as missing, so no `missing_values`
# argument is required. The former `missing_values={np.nan}` was a set, which
# genfromtxt merely stringifies to "{nan}" -- a marker that does not occur in
# the rows shown -- so it never contributed to the result.
# NOTE: 0 is also a legitimate reading (e.g. windspeed), so filled cells cannot
# be told apart from real zeros afterwards.
kwargs = dict(
    delimiter=",",
    dtype=float,
    skip_header=1,
    usecols=np.arange(1, 12),
    filling_values=0,
)

data = np.genfromtxt('datasets/bike_sharing.csv', **kwargs)
data[:5]

array([[ 1.   ,  0.   ,  0.   ,  1.   ,  9.84 , 14.395, 81.   ,  0.   ,
         3.   , 13.   , 16.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         8.   , 32.   , 40.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  9.02 , 13.635, 80.   ,  0.   ,
         5.   , 27.   , 32.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  9.84 ,  0.   , 75.   ,  0.   ,
         3.   , 10.   , 13.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  9.84 , 14.395, 75.   ,  0.   ,
         0.   ,  1.   ,  1.   ]])

In [3]:
# NOTE(review): this cell repeats the numeric CSV load of the cell before it
# and carries an out-of-order execution count; consider deleting one of them.
#
# Options for reading the CSV as floats:
#   - skip the header line,
#   - read columns 1-11 only (column 0 holds datetime strings),
#   - substitute 0 for missing fields.
# Empty fields count as missing by default, so `missing_values` is omitted:
# the set `{np.nan}` used before was only converted to the string "{nan}",
# which is not a marker seen in the rows shown.
kwargs = dict(
    delimiter=",",
    dtype=float,
    skip_header=1,
    usecols=np.arange(1, 12),
    filling_values=0,
)

data = np.genfromtxt('datasets/bike_sharing.csv', **kwargs)
data[:5]

array([[ 1.   ,  0.   ,  0.   ,  1.   ,  9.84 , 14.395, 81.   ,  0.   ,
         3.   , 13.   , 16.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         8.   , 32.   , 40.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  9.02 , 13.635, 80.   ,  0.   ,
         5.   , 27.   , 32.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  9.84 ,  0.   , 75.   ,  0.   ,
         3.   , 10.   , 13.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  9.84 , 14.395, 75.   ,  0.   ,
         0.   ,  1.   ,  1.   ]])

In [10]:
import pandas as pd

# pandas keeps the header as column labels and converts the "datetime"
# column to timestamps while reading.
df = pd.read_csv(
    'datasets/bike_sharing.csv',
    parse_dates=["datetime"],
)

df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81.0,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,,,,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80.0,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,,75.0,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75.0,0.0,0,1,1


### Pickle files

In [17]:
import pickle

# Sample records to be stored.
John = {'key': '1', 'name': 'John Simon', 'age': 21, 'weight(kg)': 60}
Samuel = {'key': '2', 'name': 'Samuel Peter', 'age': 31, 'weight(kg)': 65}

# The "database" is a plain dict keyed by first name.
db = {'John': John, 'Samuel': Samuel}

# Pickle is a binary format, so the file has to be opened in binary mode.
# 'wb' (rather than 'ab') truncates the file first: in append mode every re-run
# of this cell added one more pickle to the file, while a single pickle.load()
# only ever returns the first (oldest) one.
# The `with` block closes the file even if dumping fails.
# NOTE(review): the cell that reads the pickle opens
# 'datasets/pickle_example.pkl', whereas this cell writes to the working
# directory -- confirm which location is intended.
with open('pickle_example.pkl', 'wb') as dbfile:
    pickle.dump(db, dbfile)  # (object to store, destination file)

In [22]:
# Binary mode is required for reading a pickle as well. The `with` block
# closes the file even when unpickling raises.
# NOTE: pickle.load() can execute arbitrary code -- only load files you trust.
with open('datasets/pickle_example.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# One line per stored record: "<name> => <record dict>".
for keys, record in data.items():
    print(keys, '=>', record)

John => {'key': '1', 'name': 'John Simon', 'age': 21, 'weight(kg)': 60}
Samuel => {'key': '2', 'name': 'Samuel Peter', 'age': 31, 'weight(kg)': 65}


### Excel Sheets

In [32]:
# Open the workbook, then load its first sheet (position 0) into a DataFrame.
# data.sheet_names lists the available sheets if a name is preferred.
data = pd.ExcelFile('datasets/iris.xlsx')
first_sheet = 0
df1 = data.parse(first_sheet)
print(df1.head())

   Unnamed: 0         Unnamed: 1        Unnamed: 2         Unnamed: 3  \
0         NaN  Sepal Length (cm)  Sepal Width (cm)  Petal Length (cm)   
1         NaN                  7               3.2                4.7   
2         NaN                6.4               3.2                4.5   
3         NaN                6.9               3.1                4.9   
4         NaN                5.5               2.3                  4   

         Unnamed: 4       Unnamed: 5  Unnamed: 6  Unnamed: 7 Unnamed: 8  \
0  Petal Width (cm)            Class         NaN         NaN      alpha   
1               1.4  Iris-versicolor         NaN         0.0          0   
2               1.5  Iris-versicolor         NaN         0.0        NaN   
3               1.5  Iris-versicolor         NaN         0.0        NaN   
4               1.3  Iris-versicolor         NaN         0.0        NaN   

  Unnamed: 9  Unnamed: 10  Unnamed: 11  
0        obj          NaN          NaN  
1          0          0.0   

In [43]:
import pandas as pd
from sas7bdat import SAS7BDAT

# Read the SAS dataset with the sas7bdat reader; the conversion to a
# DataFrame happens inside the `with` block.
sas_path = 'datasets/airline.sas7bdat'
with SAS7BDAT(sas_path) as file:
    df_sas = file.to_data_frame()

df_sas.head()

Unnamed: 0,YEAR,Y,W,R,L,K
0,1948.0,1.214,0.243,0.1454,1.415,0.612
1,1949.0,1.354,0.26,0.2181,1.384,0.559
2,1950.0,1.569,0.278,0.3157,1.388,0.573
3,1951.0,1.948,0.297,0.394,1.55,0.564
4,1952.0,2.265,0.31,0.3559,1.802,0.574


### Stata files

In [44]:
# pandas reads Stata .dta files straight into a DataFrame.
stata_path = 'datasets/alcohol.dta'
data = pd.read_stata(stata_path)
data.head()

Unnamed: 0,adults,kids,income,consume
0,2,2,758,1
1,2,3,1785,1
2,3,0,1200,1
3,1,0,545,1
4,4,1,547,1


### HDF5 files

In [48]:
import h5py

# Open the HDF5 file read-only and show the type of the returned handle.
filename = 'datasets/NEONDSTowerTemperatureData.hdf5'
data = h5py.File(filename, mode='r')
print(type(data))

<class 'h5py._hl.files.File'>


### SQLite

In [56]:
from sqlalchemy import create_engine, text
import pandas as pd

engine = create_engine('sqlite:///datasets/simplefolks.sqlite')

# The `with` block closes the connection even if the query fails.
# The SQL is wrapped in text(): passing a bare str to Connection.execute() is
# deprecated in SQLAlchemy 1.4 and rejected in 2.0.
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM people"))
    # Fetch the rows while the connection is still open.
    # NOTE: the columns stay positional (0, 1, 2), as before; passing
    # columns=rs.keys() would label them with the table's column names.
    df = pd.DataFrame(rs.fetchall())

df.head()

Unnamed: 0,0,1,2
0,Austin,M,33
1,Blair,M,90
2,Carolina,F,28
3,Dani,F,41
4,Donald,M,70


### Web Scraping

In [64]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.crummy.com/software/BeautifulSoup/'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as if it
# were the requested document.
r = requests.get(url, timeout=30)
r.raise_for_status()
html_doc = r.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between
# machines. 'html.parser' ships with Python and needs no extra package.
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the target of every anchor; anchors without an href print None.
for link in soup.find_all('a'):
    print(link.get('href'))

#Download
bs4/doc/
#HallOfFame
enterprise.html
https://code.launchpad.net/beautifulsoup
https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
zine/
bs4/download/
http://lxml.de/
http://code.google.com/p/html5lib/
bs4/doc/
https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
https://bugs.launchpad.net/beautifulsoup/
https://tidelift.com/security
https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website
zine/
None
bs4/download/
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
download/3.x/BeautifulSoup-3.2.2.tar.gz
https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_source=pypi-beautifulsoup&utm_medium=referral&utm_campaign=website
None
http://www.nytimes.co