In [None]:
import numpy as np
np.__version__

In [None]:
import pandas as pd
pd.__version__

## Pandas Series
### Constructing Series objects
> pd.Series(data, index=index)

In [None]:
# A Series built from a plain list of values.
data = pd.Series([0.25, 0.5, 0.75, 1.0])
# Show the Series itself, then its values array, then its index.
for view in (data, data.values, data.index):
    print(view)

In [None]:
labels = ['a', 'b', 'c', 'd']

# A scalar value is repeated for every index label.
data = pd.Series(1, index=labels)
print(data)

# One value per label.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=labels)
print(data)

# Passing an existing Series with an index picks out (and reorders) those labels.
reindexed = pd.Series(data, index=['c', 'a'])
print(reindexed)


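> When creating a pd.Series from a list or array, the index must be the <span style="color:Crimson">same length as the data</span> (a scalar value is instead repeated for every index label).
>
> *data = pd.Series([0.25, 0.5, 0.75, 1.0], index =['a', 'b'])* would raise an error.
>
> However, when the data is an already created Series, the index argument may be shorter: it selects (and reorders) entries of that Series by label.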
> 

#### data can be a dictionary, in which case the index defaults to the dictionary keys (in insertion order on current pandas; older versions sorted the keys)

In [None]:
# Dictionary keys supply the index labels of the resulting Series.
print(pd.Series({2: 'a', 1: 'b', 3: 'c'}))

# State -> land area.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
print(area)

# State -> population, assembled from parallel name/value sequences.
population_dict = dict(zip(
    ('California', 'Texas', 'New York', 'Florida', 'Illinois'),
    (38332521, 26448193, 19651127, 19552860, 12882135),
))
population = pd.Series(population_dict)
print(population)

## Pandas DataFrame
### The Pandas DataFrame Object
#### DataFrame as a generalized NumPy array

In [None]:
state_names = ['California', 'Texas', 'New York', 'Florida', 'Illinois']

# Land area per state.
area_dict = dict(zip(state_names, [423967, 695662, 141297, 170312, 149995]))
area = pd.Series(area_dict)
print(area)

# Population per state.
population_dict = dict(zip(state_names,
                           [38332521, 26448193, 19651127, 19552860, 12882135]))
population = pd.Series(population_dict)
print(population)

# Two Series sharing the same index become the columns of one DataFrame.
columns = {'population': population, 'area': area}
states = pd.DataFrame(columns)
states

#### DataFrame as specialized dictionary

In [None]:
# Dictionary-style access: each column name maps to a Series.
for column in ('area', 'population'):
    print(states[column])

### Constructing DataFrame objects
#### From a single Series object.

In [None]:
# Wrap the single `population` Series in a one-column DataFrame named 'population'.
pd.DataFrame(population, columns=['population'])

#### From a list of dicts

In [None]:
# One dict per row; the dict keys become the column names.
data = [dict(a=n, b=2 * n) for n in range(3)]
pd.DataFrame(data)

#### From a two-dimensional NumPy array

In [None]:
# 3x2 array of random values with explicit column names and row labels.
# NOTE(review): no seed is set, so the displayed values change on every run.
pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])

#### From a NumPy structured array

In [None]:
# Structured dtype with an 'i8' field named 'A' and an 'f8' field named 'B'.
record_dtype = [('A', 'i8'), ('B', 'f8')]
A = np.zeros(3, dtype=record_dtype)
print(A)

# Build a DataFrame straight from the structured array.
Data = pd.DataFrame(A)
print(Data)

# Same data, with explicit row labels.
pd.DataFrame(A, index=['a', 'b', 'c'])

### The Pandas Index Object

#### Data Selection in Series

In [None]:
values = [0.25, 0.5, 0.75, 1.0]
data = pd.Series(values, index=list('abcd'))
print(data)
# Look up a single value by its index label.
print(data['b'])

#### Data Selection in DataFrame

In [None]:
state_names = ['California', 'Texas', 'New York', 'Florida', 'Illinois']

# Per-state land area and population, both indexed by state name.
area = pd.Series([423967, 695662, 141297, 170312, 149995], index=state_names)
pop = pd.Series([38332521, 26448193, 19651127, 19552860, 12882135],
                index=state_names)

data = pd.DataFrame({'area': area, 'pop': pop})
data

In [None]:
# Add a derived column: population divided by area, row by row.
data['density'] = data['pop'].div(data['area'])
data

In [None]:
# The underlying array of the frame's values.
print(data.values)

In [None]:
# First row of the raw values array.
print(data.values[0])
# Label-based row slice between two index labels.
print(data.loc["California":"Florida"])
# Position-based row slice.
print(data.iloc[0:4])
# Position-based selection on both axes: first three rows, first two columns.
data.iloc[:3, :2]

In [None]:
# Label-based slice on both axes: rows up to 'Illinois', columns up to 'pop'.
data.loc[:'Illinois', :'pop']

In [None]:
# The Series.ix and DataFrame.ix indexers were deprecated and have been removed from recent pandas versions
#data.ix[:3, :'pop'] 

In [None]:
# Boolean row mask (density above 100) combined with an explicit list of columns.
data.loc[data.density > 100, ['area', 'pop', 'density']]

In [None]:
# NOTE(review): identical to the previous cell; kept as-is.
data.loc[data.density > 100, ['area', 'pop', 'density']]

In [None]:
# Boolean masks on both axes: rows whose density exceeds 100, and columns
# whose value in the 'New York' row exceeds 140.
data.loc[data.density > 100, data.loc['New York'] > 140]

In [None]:
# Display the full frame again for reference.
data

In [None]:
# A slice inside plain [] selects rows (here by index label), not columns.
data['Florida':'Illinois']

In [None]:
# An integer slice inside plain [] selects rows by position.
data[1:3]

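> Take care: plain `[]` row selection works <span style="color:Crimson">only with slices</span>.
> A single integer, like data[1], is not a row selection — it is looked up as a column label and fails; use data.iloc[1] (or the slice data[1:2]) instead.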

## Operating on Data in Pandas
### UFuncs: Index Alignment

In [None]:
# Two Series whose indexes only partly overlap (labels 1 and 2 are shared).
A = pd.Series({0: 2, 1: 4, 2: 6})
B = pd.Series({1: 1, 2: 3, 3: 5})

# Plain addition aligns on the index; labels missing from either side give NaN.
aligned_sum = A + B
print(aligned_sum)

# .add with fill_value substitutes 0 for a missing operand instead.
filled_sum = A.add(B, fill_value=0)
print(filled_sum)

In [None]:
# Seeded generator so the random frames are the same on every run.
rng = np.random.RandomState(42)
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=['B', 'A', 'C'])

# Show both operands, then their sum (aligned on index and on column labels).
for frame in (A, B, A + B):
    print(frame)

# Use the mean of all values in A as the stand-in for missing entries.
fill = A.stack().mean()
print(fill)
A.add(B, fill_value=fill)

### Ufuncs: Operations Between DataFrame and Series

In [None]:
# Seeded 3x4 array of integers below 10.
rng = np.random.RandomState(42)
A = rng.randint(10, size=(3, 4))
print(A)

# Subtract the first row from every row.
first_row = A[0]
A - first_row

In [None]:
# Label the rows 'A'..'C' with a flat index. Wrapping the labels in an extra
# list (index=[list('ABC')]) would instead build a one-level MultiIndex, and
# df.loc['A'] would then no longer return the row as a Series.
df = pd.DataFrame(A, index=list('ABC'), columns=list('QRST'))
print(df)
# Single row selected by position: a Series, subtracted from every row.
print(df - df.iloc[0])
# Two-row slice: a DataFrame, so the subtraction aligns on the row labels.
print(df - df.iloc[0:2])
print(df.iloc[0])
# Single row selected by label: also a Series.
print(df.loc['A'])
print(df - df.loc['A'])

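> * Selecting a single row as a Series (e.g. df.iloc[n]) supports broadcasting rules: the row is applied to every row of the DataFrame
>
> * Selections that are themselves DataFrames (e.g. the multiple-row slice df.iloc[0:2], or a df.loc[ ] lookup that returns a DataFrame rather than a Series) <span style="color:Crimson">DO NOT </span> broadcast; they are aligned on index and columns and operate according to the missing data rules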

In [None]:
# Subtract column 'R' from every column by matching on the row index (axis=0).
print(df.subtract(df['R'], axis=0))
# Subtract a two-column sub-frame from the full frame.
print(df - df.iloc[:,0:2])

In [None]:
# Time summing an array of 1E6 elements for each of the two dtypes.
# `%timeit` is an IPython magic, so this cell only runs inside IPython/Jupyter.
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()

In [None]:
# An array containing None.
vals1 = np.array([1, None, 3, 4])
print(vals1)

# An array containing NaN; its dtype is printed below.
vals2 = np.array([1, np.nan, 3, 4])
print(vals2)
print(vals2.dtype)

# Ordinary reductions, followed by their NaN-ignoring counterparts.
plain = [reduce(vals2) for reduce in (np.sum, np.min, np.max)]
print(*plain)
nan_aware = [reduce(vals2) for reduce in (np.nansum, np.nanmin, np.nanmax)]
print(*nan_aware)

In [None]:
# 3x3 frame with one missing value in the first row and one in the last row.
rows = [
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
]
df = pd.DataFrame(rows)
df

In [None]:
# Drop every row that contains a missing value.
print(df.dropna())
print()
# Drop every column that contains a missing value.
print(df.dropna(axis="columns"))
# Add a column (label 3) that is entirely NaN. This mutates `df` in place, so
# re-running the cell changes what the two dropna calls above print.
df[3] = np.nan
print()
print(df)
print()
# how='all': drop only columns in which every value is missing.
print(df.dropna(axis='columns', how='all'))
print()
# thresh=3: keep only columns that have at least 3 non-missing values.
print(df.dropna(axis='columns', thresh=3))
print()
# Replace every missing value with 0.
print(df.fillna(0))
print()
# Forward-fill down the rows. DataFrame.ffill replaces the deprecated
# fillna(method='ffill') spelling.
print(df.ffill(axis=0))

## Hierarchical Indexing
### Methods of MultiIndex Creation

In [None]:
# Passing two parallel label lists as the index builds a two-level MultiIndex.
outer_labels = ['a', 'a', 'b', 'b']
inner_labels = [1, 2, 1, 2]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=['data1', 'data2'],
)
print(df.columns)
df

In [None]:
# Populations per state for the years 2000 and 2010, in that order.
by_state = {
    'California': (33871648, 37253956),
    'Texas': (20851820, 25145561),
    'New York': (18976457, 19378102),
}
# Flatten to {(state, year): population}; tuple keys yield a MultiIndex.
data = {
    (state, year): count
    for state, counts in by_state.items()
    for year, count in zip((2000, 2010), counts)
}
Data = pd.Series(data)
Data

#### MultiIndex level names

In [None]:
# Name the two index levels; this modifies Data's index in place.
Data.index.names = ['state', 'year']
Data

#### Explicit MultiIndex constructors

In [None]:
level_names = ['Alpha', 'Num']

# One array of labels per level.
from_arrays = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                                        names=level_names)
# One tuple per entry.
from_tuples = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)],
                                        names=level_names)
# Cartesian product of the per-level values.
from_product = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
# Raw constructor: level values plus integer codes pointing into them.
from_codes = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

# Print the four indexes separated by blank lines.
print(from_arrays, from_tuples, from_product, from_codes, sep='\n\n')

#### Pandas MultiIndex

In [None]:
# (state, year) pairs in the same order as the population figures below.
state_years = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]
index = pd.MultiIndex.from_tuples(state_years)
pop = pd.Series(populations, index=index)
print(pop)

# Pivot the inner level (year) into columns.
pop_df = pop.unstack()
print()
print(pop_df)

# Add a second column on the same MultiIndex.
under18 = [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
print()
print(pop_df)

# Fraction of the population under 18, shown as a state x year table.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

### Indexing and Slicing a MultiIndex
#### Multiply indexed DataFrames slicing

In [None]:
# Rows: (year, visit); columns: (subject, type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], ['a', 'b']],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock readings: random noise rounded to one decimal, with every other column
# (the 'HR' ones) scaled by 10, then everything shifted by 37.
# NOTE(review): no seed is set, so the values change on every run.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] = data[:, ::2] * 10
data = data + 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

In [None]:
# One column addressed by its full (subject, type) key.
guido_hr = health_data['Guido', 'HR']
print(guido_hr)
print()

# Position-based selection: first two rows, first three columns.
top_left = health_data.iloc[:2, :3]
print(top_left)
print()

# IndexSlice lets slices be written per level: visit 'b' rows, 'HR' columns.
idx = pd.IndexSlice
health_data.loc[idx[:, 'b'], idx[:, 'HR']]


> <span style="color:Crimson">pd.IndexSlice</span> is a convenient way to slice a multiply indexed DataFrame

### Data Aggregations on Multi-Indices

In [None]:
def make_df(cols, ind):
    """Build a small DataFrame of label strings.

    Each cell holds the column name followed by the row label, both converted
    with ``str`` (e.g. column 'A', row 1 -> 'A1').

    Parameters
    ----------
    cols : iterable
        Column names; iterating over a string gives one column per character.
    ind : sequence
        Row labels, used both as the index and inside the cell strings.

    Returns
    -------
    pandas.DataFrame
    """
    cells = {}
    for col in cols:
        prefix = str(col)
        cells[col] = [prefix + str(row) for row in ind]
    return pd.DataFrame(cells, ind)

In [None]:
# Display the multiply indexed frame again before aggregating it.
health_data

In [None]:
# Average the visits within each year. The `level=` argument of DataFrame.mean
# has been removed from pandas; grouping on the index level is the replacement.
data_mean = health_data.groupby(level='year').mean()
print(data_mean)
print()
# Average over subjects for each measurement type. Transposing moves the
# column level 'type' onto the index so it can be grouped; transpose back after.
print(data_mean.T.groupby(level='type').mean().T)
print()
# Plain mean across all columns of each row.
print(data_mean.mean(axis=1))

## Combining Datasets: Concat and Append

In [None]:
# Two Series with disjoint index labels.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
# Stack them end to end.
pieces = [ser1, ser2]
pd.concat(pieces)

In [None]:
# Same columns, different rows: concatenation stacks them vertically.
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1)
print(df2)
print(pd.concat([df1, df2]))

# Same rows, different columns: concatenate side by side along the columns.
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3)
print(df4)
print(pd.concat([df3, df4], axis="columns"))
# Default (row-wise) concatenation of frames with different columns.
print(pd.concat([df3, df4]))
print()
# `keys` labels each input; the second key was misspelled 'df24' for df4.
print(pd.concat([df3, df4], axis="columns", keys=['df3', 'df4']))
print()
print(pd.concat([df1, df2], keys=['df1', 'df2']))

## Combining Datasets: Merge and Join

In [None]:
# Employee -> group.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
# Employee -> hire year, listed in a different row order.
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014],
})
print(df1)
print(df2)

# With no key given, merge joins on the column the two frames share.
df3 = pd.merge(df1, df2)
df3

In [None]:
# Salary table whose key column is called 'name' rather than 'employee'.
df3 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'salary': [70000, 80000, 120000, 90000],
})
print(df1)
print()
print(df3)
print()
# Differently named key columns are matched via left_on / right_on.
merged = pd.merge(df1, df3, left_on="employee", right_on="name")
print(merged)

In [None]:
print(df1.index)
print()
# Move 'employee' from a column into the index of each frame.
df1a, df2a = (frame.set_index('employee') for frame in (df1, df2))
print(df1a)
print()
print(df2a)

In [None]:
# Merge on the shared 'employee' column.
print(pd.merge(df1, df2))
print()
# The same key given by name, now that 'employee' is the index of both frames.
print(pd.merge(df1a, df2a, left_on="employee", right_on="employee"))
print()
# Merge explicitly on the two indexes.
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
print()
# DataFrame.join applied to the two indexed frames.
print(df1a.join(df2a))
print()

In [None]:
df3 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'salary': [70000, 80000, 120000, 90000],
})
# The same table with 'name' moved into the index.
df3a = df3.set_index('name')
print(df3a)
print()
# Left key is the index of df1a, right key is the 'name' column of df3.
print(pd.merge(df1a, df3, left_index=True, right_on='name'))
print()
# Left key is the 'employee' column, right key is 'name' (the index of df3a).
print(pd.merge(df1, df3a, left_on='employee', right_on='name'))

In [None]:
# Following are shell commands to download the data
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv

In [None]:
# Load the three state CSV files from the working directory (the commented
# curl commands in the cell above download them).
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')

# Preview the first rows of each table, separated by blank lines.
print(pop.head())
print()
print(areas.head())
print()
print(abbrevs.head())

## Aggregation and Grouping

In [61]:
# NOTE(review): seaborn is only referenced by the commented-out loader below;
# the data is read from a local 'planets.csv' instead.
import seaborn as sns
# planets = sns.load_dataset('planets')
planets = pd.read_csv('planets.csv')
# Print the (rows, columns) shape, then display the first ten rows.
print(planets.shape)
planets.head(10)

(1035, 6)


Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009
5,Radial Velocity,1,185.84,4.8,76.39,2008
6,Radial Velocity,1,1773.4,4.64,18.15,2002
7,Radial Velocity,1,798.5,,21.41,1996
8,Radial Velocity,1,993.3,10.3,73.1,2008
9,Radial Velocity,2,452.8,1.99,74.79,2010


In [59]:
# Boolean frame of the same shape: True wherever a value is missing.
planets.isna()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1030,False,False,False,True,False,False
1031,False,False,False,True,False,False
1032,False,False,False,True,False,False
1033,False,False,False,True,False,False


In [None]:
# Drop every row (axis=0) that contains a missing value.
planets.dropna(axis=0)

In [None]:
# Summary statistics computed only over the rows with no missing values.
planets.dropna().describe()