In [1]:
import pandas as pd
import numpy as np


# Build a 6x4 integer frame (values 0..23) indexed by six consecutive days.
starting_date = '20160701'
dates_index = pd.date_range(starting_date, periods=6)
sample_numpy_data = np.arange(24).reshape(6, 4)
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=dates_index,
    columns=['A', 'B', 'C', 'D'],
)

# Work on a copy so sample_df keeps only its original four columns.
sample_df_2 = sample_df.copy()
sample_df_2['Fruits'] = [
    'apple', 'orange', 'banana',
    'strawberry', 'blueberry', 'pineapple',
]

# A Series carrying the same daily index; the derived values become a new column.
sample_series = pd.Series([1, 2, 3, 4, 5, 6], index=dates_index)
sample_df_2['Extra Data'] = 3 * sample_series + 1

# A plain NumPy array with one entry per row of the frame.
second_numpy_array = 100 * np.arange(len(sample_df_2)) + 7
sample_df_2['G'] = second_numpy_array

sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data,G
2016-07-01,0,1,2,3,apple,4,7
2016-07-02,4,5,6,7,orange,7,107
2016-07-03,8,9,10,11,banana,10,207
2016-07-04,12,13,14,15,strawberry,13,307
2016-07-05,16,17,18,19,blueberry,16,407
2016-07-06,20,21,22,23,pineapple,19,507


### Missing Data
pandas uses `np.nan` to represent missing data. By default, missing values are skipped by reductions such as `mean()` and `sum()`.
##### reindex()

In [2]:
# One row per browser: an HTTP status code and a response time, with the
# browser names used as the row index.
browser_index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']

browser_df = pd.DataFrame(
    data={
        'http_status': [200, 200, 404, 404, 301],
        'response_time': [0.04, 0.02, 0.07, 0.08, 1.0],
    },
    index=browser_index,
)
browser_df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


##### reindex() creates a copy (not a view)

In [3]:
# Labels that do not exist in browser_df ('Iceweasel', 'Comodo Dragon') come
# back as rows filled with NaN; rows are returned in the order of new_index.
new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', 'Chrome']
browser_df_2 = browser_df.reindex(index=new_index)
browser_df_2

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


##### drop rows that have missing data

In [4]:
# Keep only the rows with no missing value in any column. The result is a new
# frame; browser_df_2 still contains its NaN rows afterwards.
browser_df_3 = browser_df_2.dropna(axis='index', how='any')
browser_df_3

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
IE10,404.0,0.08
Chrome,200.0,0.02


##### fill-in missing data

In [5]:
# Replace every NaN with a placeholder value. The filled frame is only
# displayed here, not assigned, so browser_df_2 keeps its NaNs.
browser_df_2.fillna(-0.05555)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,-0.05555,-0.05555
Comodo Dragon,-0.05555,-0.05555
IE10,404.0,0.08
Chrome,200.0,0.02


##### get boolean mask where values are nan

In [6]:
# Element-wise boolean mask: True wherever browser_df_2 holds a missing value.
browser_df_2.isna()

Unnamed: 0,http_status,response_time
Safari,False,False
Iceweasel,True,True
Comodo Dragon,True,True
IE10,False,False
Chrome,False,False


##### NaN propagates during arithmetic operations

In [7]:
# Element-wise multiplication by a scalar; cells that are NaN stay NaN.
browser_df_2.mul(17)

Unnamed: 0,http_status,response_time
Safari,6868.0,1.19
Iceweasel,,
Comodo Dragon,,
IE10,6868.0,1.36
Chrome,3400.0,0.34


# Operations

In [8]:
import pandas as pd
import numpy as np

# Rebuild the sample frame used throughout this section: six daily rows,
# four integer columns A-D holding 0..23.
starting_date = '20160701'
dates_index = pd.date_range(starting_date, periods=6)
sample_numpy_data = np.arange(24).reshape(6, 4)
sample_df = pd.DataFrame(
    data=sample_numpy_data,
    index=dates_index,
    columns=['A', 'B', 'C', 'D'],
)

# Extend a copy with a string column, a Series-derived column and an
# array-derived column; sample_df itself keeps only A-D.
sample_df_2 = sample_df.copy()
sample_df_2['Fruits'] = [
    'apple', 'orange', 'banana',
    'strawberry', 'blueberry', 'pineapple',
]

sample_series = pd.Series([1, 2, 3, 4, 5, 6], index=dates_index)
sample_df_2['Extra Data'] = 3 * sample_series + 1

second_numpy_array = 100 * np.arange(len(sample_df_2)) + 7
sample_df_2['G'] = second_numpy_array

sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data,G
2016-07-01,0,1,2,3,apple,4,7
2016-07-02,4,5,6,7,orange,7,107
2016-07-03,8,9,10,11,banana,10,207
2016-07-04,12,13,14,15,strawberry,13,307
2016-07-05,16,17,18,19,blueberry,16,407
2016-07-06,20,21,22,23,pineapple,19,507


### descriptive statistics

In [9]:
# Display floats with two decimal places from this point on. This is a global
# display option and does not alter the stored values.
pd.options.display.precision = 2
sample_df_2.describe()

Unnamed: 0,A,B,C,D,Extra Data,G
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,10.0,11.0,12.0,13.0,11.5,257.0
std,7.48,7.48,7.48,7.48,5.61,187.08
min,0.0,1.0,2.0,3.0,4.0,7.0
25%,5.0,6.0,7.0,8.0,7.75,132.0
50%,10.0,11.0,12.0,13.0,11.5,257.0
75%,15.0,16.0,17.0,18.0,15.25,382.0
max,20.0,21.0,22.0,23.0,19.0,507.0


##### column mean

In [10]:
# Mean of each numeric column. numeric_only=True skips the string column
# 'Fruits'; without it, pandas 2.0+ (where numeric_only defaults to False)
# raises a TypeError on this frame instead of silently dropping the column.
sample_df_2.mean(numeric_only=True)

A              10.0
B              11.0
C              12.0
D              13.0
Extra Data     11.5
G             257.0
dtype: float64

##### row mean

In [11]:
# Mean across the numeric columns of each row (axis=1). numeric_only=True
# leaves out the string column 'Fruits', which pandas 2.0+ would otherwise
# try to average and fail on with a TypeError.
sample_df_2.mean(axis=1, numeric_only=True)

2016-07-01      2.83
2016-07-02     22.67
2016-07-03     42.50
2016-07-04     62.33
2016-07-05     82.17
2016-07-06    102.00
Freq: D, dtype: float64

### apply (a function to a data frame)

In [12]:
# Redisplay the frame for reference before applying a function to it.
sample_df_2

Unnamed: 0,A,B,C,D,Fruits,Extra Data,G
2016-07-01,0,1,2,3,apple,4,7
2016-07-02,4,5,6,7,orange,7,107
2016-07-03,8,9,10,11,banana,10,207
2016-07-04,12,13,14,15,strawberry,13,307
2016-07-05,16,17,18,19,blueberry,16,407
2016-07-06,20,21,22,23,pineapple,19,507


In [13]:
# Running total down each column (axis='index'). For the string column the
# values are concatenated cumulatively, as the displayed result shows.
sample_df_2.apply(func=np.cumsum, axis='index')

Unnamed: 0,A,B,C,D,Fruits,Extra Data,G
2016-07-01,0,1,2,3,apple,4,7
2016-07-02,4,6,8,10,appleorange,11,114
2016-07-03,12,15,18,21,appleorangebanana,21,321
2016-07-04,24,28,32,36,appleorangebananastrawberry,34,628
2016-07-05,40,45,50,55,appleorangebananastrawberryblueberry,50,1035
2016-07-06,60,66,72,78,appleorangebananastrawberryblueberrypineapple,69,1542


##### string methods

In [14]:
# Mixed-case strings plus one missing value, to show how .str methods
# handle NaN: the missing entry is passed through unchanged.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [15]:
# Length of each string in s; the missing entry stays NaN, which is why the
# displayed result is float64 rather than an integer dtype.
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    4.0
7    3.0
8    3.0
dtype: float64