# Series

In [40]:
import numpy as np


In [41]:
import pandas as pd


In [42]:
# Sample inputs used below to build a Series in several different ways.
labels = list('abc')
my_data = [10, 20, 30]
d = dict(zip(labels, my_data))   # label -> value mapping
arr = np.array(my_data)          # the same values as a NumPy array

In [43]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [44]:
pd.Series(data = my_data, index = labels)

a    10
b    20
c    30
dtype: int64

In [45]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [46]:
# pandas provides data structures and operations for manipulating numerical
# data and time series. The library is built on top of NumPy.
#
# Advantages:
# - Fast and efficient for manipulating and analyzing data.
# - Data can be loaded from many different file formats.
# - Easy handling of missing data (represented as NaN) in floating point as
#   well as non-floating point data.
# - Size mutability: columns can be inserted into and deleted from DataFrames
#   and higher dimensional objects.
# - Data set merging and joining.
# - Flexible reshaping and pivoting of data sets.
# - Time-series functionality.
# - Powerful group-by functionality for performing split-apply-combine
#   operations on data sets.

In [47]:
arr

array([10, 20, 30])

In [48]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [49]:
labels

['a', 'b', 'c']

In [50]:
pd.Series(data = labels)

0    a
1    b
2    c
dtype: object

In [51]:
pd.Series(data = [sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [52]:
# Series indexed by country name (index labels can be arbitrary strings).
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['СССР', 'Deutschland', 'USA', '日本'],
)
ser1

СССР           1
Deutschland    2
USA            3
日本             4
dtype: int64

In [53]:
# Second Series: shares three labels with ser1 but has 'Italia' instead of 'СССР'.
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['Italia', 'Deutschland', 'USA', '日本'],
)
ser2

Italia         1
Deutschland    2
USA            5
日本             4
dtype: int64

In [54]:
ser1['СССР']

1

In [55]:
ser1 + ser2    # Values are added label by label. Labels present in only one
               # Series give NaN, and the result is float64 rather than int64.

Deutschland    4.0
Italia         NaN
USA            8.0
СССР           NaN
日本             8.0
dtype: float64

In [56]:
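# The result is float64 because NaN is a floating-point value and an int64
# Series cannot hold it. Pandas upcasts to float only when missing values
# appear; adding two integer Series with identical labels stays integer.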

# Data Frames (Part-1)
A DataFrame looks like an Excel sheet: a table with labelled rows and columns.

In [57]:
# A DataFrame is essentially a collection of Series that share the same index

In [58]:
from numpy.random import randn  

In [59]:
np.random.seed(101)    # Seed NumPy's global random number generator so that
# the randn() calls below produce the same "random" numbers on every run and
# on every machine. The seed (101) is an arbitrary integer chosen by us; it is
# not derived from previously generated values. If no seed is set, NumPy
# initialises the generator from unpredictable operating-system entropy, so
# the numbers differ between runs.

In [60]:
# 5 rows (A-E) x 4 columns (W-Z) filled with random numbers from randn
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [61]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [62]:
df['W']   # Selecting a single column with []
          # returns that column as a Series

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [63]:
type(df)

pandas.core.frame.DataFrame

In [64]:
type(df['W'])

pandas.core.series.Series

In [65]:
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [66]:
df['new'] = df['W'] + df['Y']

In [67]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [68]:
# drop() looks for the label along axis 0 (the row index) by default, so asking
# it to remove the column 'new' without axis = 1 raises KeyError. The error is
# the point of this cell; catch and print it so that "Restart & Run All" can
# continue past the demonstration instead of stopping here.
try:
    df.drop('new')
except KeyError as err:
    print("KeyError:", err)

KeyError: "['new'] not found in axis"

In [70]:
# axis = 1 tells drop() to look for 'new' among the columns. The result is a
# new DataFrame; df itself still has the column (see the next cell).
df.drop('new', axis = 1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [71]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [72]:
# inplace = True removes the column from df itself instead of returning a
# modified copy, so the change is still visible when df is displayed.
df.drop('new', axis = 1, inplace = True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [73]:
df.drop('E')    # You don't need to define axis = 0

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [74]:
# axis = 0 refers to the ROWS (the index labels A..E)
# axis = 1 refers to the COLUMNS (the labels W..Z)
# By default, axis = 0
# The convention comes from NumPy: df.shape is (5, 4), as shown below.
# Position 0 of the shape is the 5 rows A,B,C,D,E, so rows are axis 0;
# position 1 is the 4 columns W,X,Y,Z, so columns are axis 1.

In [75]:
df.shape

(5, 4)

In [76]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [77]:
# How to select a row by its index label
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [78]:
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [79]:
df.loc['B','Y']

-0.8480769834036315

In [80]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [81]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


# Data Frames (Part-2)
Conditional selection

In [82]:
booldf = df > 0

In [83]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [84]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [85]:
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [86]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [87]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [88]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [89]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [90]:
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [91]:
df[df['W']>0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [92]:
resultdf = df[df['W']>0]

In [93]:
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [94]:
df[df['W']>0][['Y','X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [95]:
boolser = df['W']>0
boolser

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [96]:
result = df[boolser]
result                   # Row C is left out because boolser is False for C

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [97]:
mycols = ['Y','X']
result[mycols]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757



Multiple conditions


In [98]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [99]:
# Python's `and` needs a single True/False from each operand, but a comparison
# such as df['W']>0 is a whole Series of booleans, so pandas raises ValueError.
# The error is the point of this cell; catch and print it so that
# "Restart & Run All" can continue past the demonstration.
try:
    df[(df['W']>0) and (df['Y']>1)]
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [100]:
True and False

False

In [101]:
df[(df['W']>0) & (df['Y']>1)]     # use '&' here instead of 'and' 

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [102]:
# Python's `or` has the same limitation as `and`: it needs one True/False per
# operand and cannot combine two boolean Series. The resulting ValueError is
# caught and printed so that "Restart & Run All" can continue.
try:
    df[(df['W']>0) or (df['Y']>1)]
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [103]:
df[(df['W']>0) | (df['Y']>1)]     # use '|' here instead of 'or'

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [104]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [105]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [106]:
# inplace=True modifies df itself: the old index labels A-E become an ordinary
# column named 'index' and the rows are renumbered 0-4 (see the next cell).
# NOTE(review): this cell is not idempotent -- running it a second time resets
# the index again on the already-modified df.
df.reset_index(inplace=True)

In [107]:
df

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [118]:
# One state code per row of df; assigned to df['States'] below
newind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [109]:
'CA NY WY OR CO'.split()

['CA', 'NY', 'WY', 'OR', 'CO']

In [110]:
df['States'] = newind

In [111]:
df

Unnamed: 0,index,W,X,Y,Z,States
0,A,2.70685,0.628133,0.907969,0.503826,CA
1,B,0.651118,-0.319318,-0.848077,0.605965,NY
2,C,-2.018168,0.740122,0.528813,-0.589001,WY
3,D,0.188695,-0.758872,-0.933237,0.955057,OR
4,E,0.190794,1.978757,2.605967,0.683509,CO


In [112]:
df.set_index('States')    # Returns a new DataFrame; df itself is unchanged
                          # (see the next cell). To keep the result, reassign
                          # it (df = df.set_index('States')) or pass
                          # 'inplace = True'.

Unnamed: 0_level_0,index,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,A,2.70685,0.628133,0.907969,0.503826
NY,B,0.651118,-0.319318,-0.848077,0.605965
WY,C,-2.018168,0.740122,0.528813,-0.589001
OR,D,0.188695,-0.758872,-0.933237,0.955057
CO,E,0.190794,1.978757,2.605967,0.683509


In [113]:
df

Unnamed: 0,index,W,X,Y,Z,States
0,A,2.70685,0.628133,0.907969,0.503826,CA
1,B,0.651118,-0.319318,-0.848077,0.605965,NY
2,C,-2.018168,0.740122,0.528813,-0.589001,WY
3,D,0.188695,-0.758872,-0.933237,0.955057,OR
4,E,0.190794,1.978757,2.605967,0.683509,CO


# Data Frames (Part-3)
Multi-index & index hierarchy

In [114]:
import numpy as np
import pandas as pd

In [115]:
# Two index levels: `outside` is the group label, `inside` the number within
# the group. zip pairs them up as (group, number) tuples for the MultiIndex.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [119]:
hier_index     # converts list to multi-index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [117]:
list(zip(outside,inside))    # Makes them tuples pairs

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [122]:
# 6 rows (one per entry of hier_index) x 2 columns (A, B) of random numbers
df2 = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [123]:
# Data from a multi level index
df2      

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [125]:
df2.loc['G1']

Unnamed: 0,A,B
1,1.025984,-0.156598
2,-0.031579,0.649826
3,2.154846,-0.610259


In [126]:
df2.loc['G1'].loc[1]

A    1.025984
B   -0.156598
Name: 1, dtype: float64

In [127]:
df2.index.names

FrozenList([None, None])

In [128]:
df2.index.names = ['Groups', 'Names']

In [129]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Names,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [133]:
df2.loc['G2'].loc[2].loc['B']      

-0.47944803904109595

In [134]:
# Same value as the chained lookup in the previous cell, selected in one step:
# the (Groups, Names) tuple picks the row and 'B' picks the column, so no
# intermediate DataFrame/Series objects are created along the way.
df2.loc[('G2', 2), 'B']

-0.47944803904109595

In [136]:
df2.xs('G1')

Unnamed: 0_level_0,A,B
Names,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.025984,-0.156598
2,-0.031579,0.649826
3,2.154846,-0.610259


In [138]:
# .xs (cross-section) can select on an inner index level directly: here every
# row whose 'Names' label is 1, taken from all groups. This is more awkward
# with .loc alone, which works from the outermost level inwards.
df2.xs(1,level='Names')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.025984,-0.156598
G2,-0.755325,-0.346419
