# Chapter 2

This notebook contains the commands that are shown in the lectures.

In [138]:
import pandas as pd
import numpy as np

In [143]:
# Load the ATP player list. Column names are supplied explicitly via `names`;
# the first displayed record (player 100001) is data, so the file appears to
# have no header row of its own.
atp_players = pd.read_csv(
    '../data/atp_players.csv',
    names=[
        'player_id',
        'first_name',
        'last_name',
        'hand',
        'birth_date',
        'country_code',
    ],
)
atp_players.head()

Unnamed: 0,player_id,first_name,last_name,hand,birth_date,country_code
0,100001,Gardnar,Mulloy,R,19131122.0,USA
1,100002,Pancho,Segura,R,19210620.0,ECU
2,100003,Frank,Sedgman,R,19271002.0,AUS
3,100004,Giuseppe,Merlo,R,19271011.0,ITA
4,100005,Richard Pancho,Gonzales,R,19280509.0,USA


In [144]:
# Print the dtype pandas assigned to every column of `atp_players`.
# Note in the output that `birth_date` is float64 (values such as 19131122.0),
# not a date type.
print(atp_players.dtypes)

player_id         int64
first_name       object
last_name        object
hand             object
birth_date      float64
country_code     object
dtype: object


In [145]:
# Preview the first rows of the frame (head() with no argument; five rows are shown).
atp_players.head()

Unnamed: 0,player_id,first_name,last_name,hand,birth_date,country_code
0,100001,Gardnar,Mulloy,R,19131122.0,USA
1,100002,Pancho,Segura,R,19210620.0,ECU
2,100003,Frank,Sedgman,R,19271002.0,AUS
3,100004,Giuseppe,Merlo,R,19271011.0,ITA
4,100005,Richard Pancho,Gonzales,R,19280509.0,USA


In [141]:
# Parse the YYYYMMDD-encoded birth dates into real timestamps.
# errors='coerce' makes values that cannot be parsed become NaT instead of
# raising, so the column ends up as datetime64[ns] (see the printed dtypes).
atp_players['birth_date'] = pd.to_datetime(
    atp_players['birth_date'],
    errors='coerce',
    format='%Y%m%d',
)
print(atp_players.dtypes)

player_id                int64
first_name              object
last_name               object
hand                    object
birth_date      datetime64[ns]
country_code            object
dtype: object


In [126]:
# Demonstration of a common mistake: double brackets select a one-column
# DataFrame, and pd.to_datetime treats a DataFrame as a set of date components
# (year/month/day columns) to assemble. `birth_date` is none of those, so the
# call raises ValueError.
# The error is caught and printed so the message is still shown while the
# notebook keeps running under "Restart & Run All".
try:
    pd.to_datetime(atp_players[['birth_date']], format='%Y%m%d', errors='ignore')
except ValueError as exc:
    print(f'ValueError: {exc}')

ValueError: to assemble mappings requires at least that [year, month, day] be specified: [day,month,year] is missing

In [123]:
# Column-wise variant: `apply` passes each column of the one-column DataFrame
# to pd.to_datetime as a Series, which avoids the "assemble mappings"
# ValueError raised when the DataFrame itself is passed.
# NOTE(review): the displayed result still contains the raw values
# (e.g. 19131122), i.e. nothing was converted -- with errors='ignore' a failed
# parse appears to hand the input back unchanged. The result is only displayed;
# it is not assigned back to `atp_players`.
atp_players[['birth_date']].apply(lambda x: pd.to_datetime(x, format='%Y%m%d', errors='ignore'))


Unnamed: 0,birth_date
0,19131122
1,19210620
2,19271002
3,19271011
4,19280509
...,...
54933,
54934,
54935,
54936,


In [111]:
# Build a "Last, First" display name. As with `+`, a row where either part is
# missing yields a missing `name`.
atp_players['name'] = (
    atp_players['last_name']
    .str.cat(atp_players['first_name'], sep=', ')
)

atp_players.head()

Unnamed: 0,player_id,first_name,last_name,hand,birth_date,country_code,name
0,100001,Gardnar,Mulloy,R,19131122.0,USA,"Mulloy, Gardnar"
1,100002,Pancho,Segura,R,19210620.0,ECU,"Segura, Pancho"
2,100003,Frank,Sedgman,R,19271002.0,AUS,"Sedgman, Frank"
3,100004,Giuseppe,Merlo,R,19271011.0,ITA,"Merlo, Giuseppe"
4,100005,Richard Pancho,Gonzales,R,19280509.0,USA,"Gonzales, Richard Pancho"


In [112]:
# Remove the two source columns that the combined `name` column replaces.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer fails with a KeyError.
atp_players = atp_players.drop(columns=['first_name', 'last_name'], errors='ignore')
atp_players.dtypes

player_id         int64
hand             object
birth_date      float64
country_code     object
name             object
dtype: object

In [113]:
# Bytes used by `hand` while it is still an object column (before conversion).
print(atp_players['hand'].nbytes)
# Convert both string columns to categoricals with a single vectorised
# astype call; each selected column gets its own set of categories.
atp_players[['country_code', 'hand']] = atp_players[['country_code', 'hand']].astype('category')
# Bytes used by `country_code` after the conversion (note: a different column
# from the one measured above).
print(atp_players['country_code'].nbytes)
# The distinct values that `hand` takes.
print(atp_players['hand'].cat.categories)
atp_players.dtypes

439504
111556
Index(['A', 'L', 'R', 'U'], dtype='object')


player_id          int64
hand            category
birth_date       float64
country_code    category
name              object
dtype: object

In [84]:
# Display the column dtypes again; in the output `hand` and `country_code`
# are reported as category.
atp_players.dtypes

player_id          int64
hand            category
birth_date       float64
country_code    category
name              object
dtype: object

In [2]:
# Load the iris measurements, supplying the column names explicitly via `names`.
iris = pd.read_csv(
    '../data/iris.data',
    names=[
        'Sepal.Length',
        'Sepal.Width',
        'Petal.Length',
        'Petal.Width',
        'Species',
    ],
)

In [37]:
# Print the inferred dtypes: the four measurements are float64 and `Species`
# is object in the output below.
print(iris.dtypes)

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object


In [41]:
# Read the workbook: skip the first 4 rows, take the next row as the header
# (header=0 is relative to the rows that remain), and keep only the first 73
# columns (positions 0-72).
# NOTE(review): 4 and 73 presumably match the layout of this particular
# workbook -- confirm if the file is updated.
efw = pd.read_excel(
    '../data/efw.xlsx',
    skiprows=4,
    header=0,
    usecols=range(73),
)

In [42]:
# List the column labels parsed from the workbook header.
# NOTE(review): labels such as 'Unnamed: 0' and 'data', 'data.1', ... in the
# output are presumably generated by pandas for blank and repeated header
# cells -- confirm against the spreadsheet.
efw.columns

Index(['Unnamed: 0', 'Year', 'ISO_Code', 'Countries',
       'Economic Freedom Summary Index', 'Rank', 'Quartile',
       'Government consumption', 'data', 'Transfers and subsidies', 'data.1',
       'Government investment', 'data.2', 'Top marginal income tax rate',
       'data.3', 'Top marginal income and payroll tax rate', 'data.4',
       'Top marginal tax rate', 'State ownership of assets',
       'Size of Government', 'Judicial independence', 'Impartial courts',
       'Protection of property rights',
       'Military interference in rule of law and politics',
       'Integrity of the legal system', 'Legal enforcement of contracts',
       'Regulatory restrictions on the sale of real property',
       'Reliability of police', 'Gender Legal Rights Adjustment',
       'Legal System & Property Rights', 'Money growth', 'data.5',
       'Standard deviation of inflation', 'data.6',
       'Inflation: Most recent year', 'data.7',
       ' Freedom to own foreign currency bank accounts', 

In [50]:
from sqlalchemy import create_engine

# SQLite database file kept next to the other data sets.
engine = create_engine('sqlite:///../data/iris.sqlite')

# Write the frame to table `iris`, replacing the table if it already exists.
# The DataFrame index is written as well, which is why an extra `index`
# column shows up when the table is read back below.
iris.to_sql(name='iris', con=engine, if_exists='replace')

# Read the whole table back by name.
pd.read_sql(sql='iris', con=engine)

Unnamed: 0,index,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,4.9,3.0,1.4,0.2,Iris-setosa
2,2,4.7,3.2,1.3,0.2,Iris-setosa
3,3,4.6,3.1,1.5,0.2,Iris-setosa
4,4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,Iris-virginica
146,146,6.3,2.5,5.0,1.9,Iris-virginica
147,147,6.5,3.0,5.2,2.0,Iris-virginica
148,148,6.2,3.4,5.4,2.3,Iris-virginica


In [51]:
# Load the headerless WDBC file and label the columns X0, X1, ...
# The `prefix` argument of read_csv was deprecated in pandas 1.4 and removed
# in 2.0; add_prefix on the default integer labels (0, 1, ...) produces the
# same column names and works on both old and new pandas versions.
wdbc_data = pd.read_csv(
    '../data/wdbc.data',
    header=None,
).add_prefix('X')

In [62]:
# Store both frames in one HDF5 file under separate keys.
# mode='w' on the first call starts a fresh file; the second call relies on
# the default append mode so `/wdbc` is added alongside `/iris`.
# `key` is passed by keyword: positional use of it is deprecated and becomes
# an error once to_hdf's arguments are keyword-only.
iris.to_hdf('../data/datacollection_python.h5', key='/iris', mode='w')
wdbc_data.to_hdf('../data/datacollection_python.h5', key='/wdbc')

In [63]:
# Read the iris frame back from the HDF5 collection by its key.
pd.read_hdf(
    '../data/datacollection_python.h5',
    key='/iris',
)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [64]:
# Read the WDBC frame back from the same HDF5 file by its key.
pd.read_hdf(
    '../data/datacollection_python.h5',
    key='/wdbc',
)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400
