[Reference](https://pub.towardsai.net/6-pandas-operations-you-should-not-miss-d531736c6574)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
# Load seaborn's 'titanic' example dataset; `df` is the frame the later cells operate on.
df = sns.load_dataset('titanic')
# Preview the first rows to check the columns and their values.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Summarizing data

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


# Concatenation

In [5]:
# A four-row, five-column slice of the dataset, selected by row label and
# column name; it is the left-hand operand for the concatenation examples.
smallData = df.loc[
    [1, 7, 21, 10],
    ['sex', 'age', 'fare', 'who', 'class'],
]
smallData

Unnamed: 0,sex,age,fare,who,class
1,female,38.0,71.2833,woman,First
7,male,2.0,21.075,child,Third
21,male,34.0,13.0,man,Second
10,female,4.0,16.7,child,Third


In [6]:
# Three hand-written rows using the same five columns as smallData.
newData = pd.DataFrame(
    data={
        'sex': ['female', 'male', 'male'],
        'age': [25, 49, 35],
        'fare': [89.22, 70.653, 30.666],
        'who': ['child', 'women', 'man'],
        'class': ['First', 'First', 'First'],
    }
)

In [7]:
# Stack the two frames vertically; each keeps its own row labels,
# so labels can repeat in the result.
pd.concat(objs=[smallData, newData], axis=0)

Unnamed: 0,sex,age,fare,who,class
1,female,38.0,71.2833,woman,First
7,male,2.0,21.075,child,Third
21,male,34.0,13.0,man,Second
10,female,4.0,16.7,child,Third
0,female,25.0,89.22,child,First
1,male,49.0,70.653,women,First
2,male,35.0,30.666,man,First


In [8]:
# Stack vertically again, but discard the original row labels and
# renumber the result 0..n-1.
pd.concat(objs=[newData, smallData], axis=0, ignore_index=True)

Unnamed: 0,sex,age,fare,who,class
0,female,25.0,89.22,child,First
1,male,49.0,70.653,women,First
2,male,35.0,30.666,man,First
3,female,38.0,71.2833,woman,First
4,male,2.0,21.075,child,Third
5,male,34.0,13.0,man,Second
6,female,4.0,16.7,child,Third


In [9]:
# Place the frames side by side: rows are aligned on their labels, and
# labels present in only one frame are padded with NaN on the other side.
pd.concat(objs=[newData, smallData], axis='columns')

Unnamed: 0,sex,age,fare,who,class,sex.1,age.1,fare.1,who.1,class.1
0,female,25.0,89.22,child,First,,,,,
1,male,49.0,70.653,women,First,female,38.0,71.2833,woman,First
2,male,35.0,30.666,man,First,,,,,
7,,,,,,male,2.0,21.075,child,Third
10,,,,,,female,4.0,16.7,child,Third
21,,,,,,male,34.0,13.0,man,Second


In [10]:
# Rebuild newData with a different column set: no 'sex'/'age',
# and an extra 'adult_male' column.
newData = pd.DataFrame(
    data={
        'fare': [89.22, 70.653, 30.666, 100],
        'who': ['child', 'women', 'man', 'women'],
        'class': ['First', 'First', 'First', 'Second'],
        'adult_male': [True, False, True, False],
    }
)
newData

Unnamed: 0,fare,who,class,adult_male
0,89.22,child,First,True
1,70.653,women,First,False
2,30.666,man,First,True
3,100.0,women,Second,False


In [11]:
# Row-wise concatenation of frames whose columns only partly overlap:
# the result carries the union of the columns, with NaN where a frame
# had no such column.
pd.concat(objs=[smallData, newData], axis=0, join='outer')

Unnamed: 0,sex,age,fare,who,class,adult_male
1,female,38.0,71.2833,woman,First,
7,male,2.0,21.075,child,Third,
21,male,34.0,13.0,man,Second,
10,female,4.0,16.7,child,Third,
0,,,89.22,child,First,True
1,,,70.653,women,First,False
2,,,30.666,man,First,True
3,,,100.0,women,Second,False


# Merge and Join

In [13]:
# Two small tables that share the 'employee_name' column:
# df1 maps employees to departments, df2 maps employees to salaries.
df1 = pd.DataFrame(
    data={
        'employee_name': ['Tasha', 'Linda', 'Olliver', 'Jack'],
        'department': ['Engineering', 'Accounting', 'HR', 'HR'],
    }
)
df2 = pd.DataFrame(
    data={
        'employee_name': ['Linda', 'Tasha', 'Jack', 'Olliver'],
        'salary': [35000, 20500, 90000, 68000],
    }
)

In [14]:
# One-to-one merge: with no key given, the join uses the column the two
# frames have in common ('employee_name').
df3 = df1.merge(df2)
df3

Unnamed: 0,employee_name,department,salary
0,Tasha,Engineering,20500
1,Linda,Accounting,35000
2,Olliver,HR,68000
3,Jack,HR,90000


In [15]:
# Many-to-one merge: several employees share a department, so the single
# supervisor row for that department is repeated for each of them.
df4 = pd.DataFrame(
    data={
        'department': ['Engineering', 'Accounting', 'HR'],
        'supervisor': ['Jonas', 'Martha', 'Martin'],
    }
)
print('--------df4---------\n', df4)
print('-------merged--------')
df3.merge(df4)

--------df4---------
     department supervisor
0  Engineering      Jonas
1   Accounting     Martha
2           HR     Martin
-------merged--------


Unnamed: 0,employee_name,department,salary,supervisor
0,Tasha,Engineering,20500,Jonas
1,Linda,Accounting,35000,Martha
2,Olliver,HR,68000,Martin
3,Jack,HR,90000,Martin


In [17]:
# Many-to-many merge: 'department' repeats on both sides, so every
# employee row is paired with every skill row of the same department.
df5 = pd.DataFrame(
    data={
        'department': ['Engineering', 'Engineering', 'Accounting', 'Accounting', 'HR', 'HR'],
        'skills': ['Coding', 'Soft skills', 'Math', 'Excel', 'Organizing', 'Decision making'],
    }
)
print('--------df5---------\n', df5)
print('\n-------merged--------')
df3.merge(df5)

--------df5---------
     department           skills
0  Engineering           Coding
1  Engineering      Soft skills
2   Accounting             Math
3   Accounting            Excel
4           HR       Organizing
5           HR  Decision making

-------merged--------


Unnamed: 0,employee_name,department,salary,skills
0,Tasha,Engineering,20500,Coding
1,Tasha,Engineering,20500,Soft skills
2,Linda,Accounting,35000,Math
3,Linda,Accounting,35000,Excel
4,Olliver,HR,68000,Organizing
5,Olliver,HR,68000,Decision making
6,Jack,HR,90000,Organizing
7,Jack,HR,90000,Decision making


In [18]:
# Redefine df2 so that its key column is called 'name' instead of
# 'employee_name'; the merge must then be told which column to use on
# each side. Both key columns are kept in the result.
df2 = pd.DataFrame(
    data={
        'name': ['Linda', 'Tasha', 'Jack', 'Olliver'],
        'salary': [35000, 20500, 90000, 68000],
    }
)
print('--------df1---------\n', df1)
print('--------df2---------\n', df2)
print('\n-------merged--------')
df1.merge(df2, left_on='employee_name', right_on='name')

--------df1---------
   employee_name   department
0         Tasha  Engineering
1         Linda   Accounting
2       Olliver           HR
3          Jack           HR
--------df2---------
       name  salary
0    Linda   35000
1    Tasha   20500
2     Jack   90000
3  Olliver   68000

-------merged--------


Unnamed: 0,employee_name,department,name,salary
0,Tasha,Engineering,Tasha,20500
1,Linda,Accounting,Linda,35000
2,Olliver,HR,Olliver,68000
3,Jack,HR,Jack,90000


In [20]:
# Rebuild both tables so that the key sets only partly overlap
# ('Linda' is the only name present in both). The default merge is an
# inner join, so only the overlapping key survives.
df1 = pd.DataFrame(
    data={
        'employee_name': ['Tasha', 'Linda', 'Olliver', 'Jack'],
        'department': ['Engineering', 'Accounting', 'HR', 'HR'],
    }
)
df2 = pd.DataFrame(
    data={
        'employee_name': ['Linda', 'Mary'],
        'salary': [35000, 20500],
    }
)
print('--------df1---------\n', df1)
print('--------df2---------\n', df2)
print('\n-------merged--------\n')
df1.merge(df2)

--------df1---------
   employee_name   department
0         Tasha  Engineering
1         Linda   Accounting
2       Olliver           HR
3          Jack           HR
--------df2---------
   employee_name  salary
0         Linda   35000
1          Mary   20500

-------merged--------



Unnamed: 0,employee_name,department,salary
0,Linda,Accounting,35000


In [21]:
# Outer-style joins on the same two tables: 'left' keeps every df1 row,
# 'right' keeps every df2 row; keys missing on the other side get NaN.
print('-------left join--------\n', df1.merge(df2, how='left'))
print('\n-------right join--------\n', df1.merge(df2, how='right'))

-------left join--------
   employee_name   department   salary
0         Tasha  Engineering      NaN
1         Linda   Accounting  35000.0
2       Olliver           HR      NaN
3          Jack           HR      NaN

-------right join--------
   employee_name  department  salary
0         Linda  Accounting   35000
1          Mary         NaN   20500


# GroupBy

In [22]:
# groupby() on its own computes nothing: printing it only shows the
# DataFrameGroupBy handle. Work happens when an aggregation is applied.
print(df.groupby('sex'))
# numeric_only=True restricts the sum to numeric and boolean columns.
# Since pandas 2.0 the default is numeric_only=False, which would try to
# sum the categorical 'class'/'deck' columns and raise a TypeError.
df.groupby('sex').sum(numeric_only=True)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f77ed013350>


Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,233,678,7286.0,218,204,13966.6628,0,126
male,109,1379,13919.17,248,136,14727.2865,537,411


In [23]:
# Number of survivors per sex ('survived' is 0/1, so the sum is a count).
data = df.groupby('sex')['survived'].sum()
# Share of all survivors contributed by each sex, as a percentage.
print('% of male survivors', (data['male'] / (data['male'] + data['female'])) * 100)
print('% of female survivors', (data['female'] / (data['male'] + data['female'])) * 100)

% of male survivers 31.871345029239766
% of male female 68.12865497076024


In [24]:
# Several aggregations at once, one result column per aggregation.
# The aggregations are named by string: passing the NumPy callable np.mean
# is deprecated for groupby aggregation in recent pandas releases, while
# 'mean' gives the same column name and values.
df.groupby('sex')['survived'].aggregate(['sum', 'mean', 'median'])

Unnamed: 0_level_0,sum,mean,median
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,233,0.742038,1
male,109,0.188908,0


In [25]:
# filter() keeps or drops whole groups: a group's rows are retained only
# when the predicate is True for that group, here when the standard
# deviation of its fares exceeds 50.
df.groupby('survived').filter(
    lambda group: group['fare'].std() > 50
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [26]:
# Centre every numeric/boolean column on the mean of its survival group.
# The numeric columns are selected explicitly because the lambda cannot be
# applied to text or categorical columns ('sex', 'class', ...): pandas >= 2.0
# raises a TypeError for them instead of silently dropping them.
(
    df.select_dtypes(include=['number', 'bool'])
      .groupby('survived')
      .transform(lambda col: col - col.mean())
)

Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone
0,0.468124,-8.626179,0.446266,-0.329690,-14.867887,0.182149,-0.681239
1,-0.950292,9.656310,0.526316,-0.464912,22.887892,-0.257310,-0.476608
2,1.049708,-2.343690,-0.473684,-0.464912,-40.470408,-0.257310,0.523392
3,-0.950292,6.656310,0.526316,-0.464912,4.704592,-0.257310,-0.476608
4,0.468124,4.373821,-0.553734,-0.329690,-14.067887,0.182149,0.318761
...,...,...,...,...,...,...,...
886,-0.531876,-3.626179,-0.553734,-0.329690,-9.117887,0.182149,0.318761
887,-0.950292,-9.343690,-0.473684,-0.464912,-18.395408,-0.257310,0.523392
888,0.468124,,0.446266,1.670310,1.332113,-0.817851,-0.681239
889,-0.950292,-2.343690,-0.473684,-0.464912,-18.395408,0.742690,0.523392


In [28]:
def func(x):
    """Return a copy of group ``x`` with 'fare' rescaled to fractions of the group total.

    Parameters
    ----------
    x : pandas.DataFrame
        One group of rows; must contain a 'fare' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``x`` in which each
        'fare' is divided by the sum of 'fare' over the group. ``x`` itself
        is left untouched: pandas documents mutating the object passed to a
        groupby UDF as unsupported, so the result is built with ``assign``.
    """
    return x.assign(fare=x['fare'] / x['fare'].sum())


# group_keys=False keeps the original row labels on the result. Since
# pandas 2.0 the default would add 'survived' as an extra index level.
df.groupby('survived', group_keys=False).apply(func)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,0.000597,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,0.004307,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,0.000479,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,0.003208,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,0.000663,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,0.001071,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,0.001813,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,0.001931,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,0.001813,C,First,man,True,C,Cherbourg,yes,True


In [33]:
# Mean survival per (sex, pclass) pair, then pivot the 'pclass' index level
# into columns so the result reads as a sex x pclass table.
(
    df.groupby(['sex', 'pclass'])['survived']
      .mean()
      .unstack(level='pclass')
)

pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [32]:
# The same sex x pclass table of mean survival, built in a single call;
# 'mean' is pivot_table's default aggregation and is spelled out here.
df.pivot_table(values='survived', index='sex', columns='pclass', aggfunc='mean')

pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [34]:
# Bin ages into (0, 18], (18, 40] and (40, 80], then use the binned series
# as a second row key next to 'sex'; columns are the passenger class.
age = pd.cut(df['age'], bins=[0, 18, 40, 80])
pivotTable = df.pivot_table(
    values='survived',
    index=['sex', age],
    columns='class',
)
pivotTable

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 40]",0.979167,0.914894,0.48
female,"(40, 80]",0.961538,0.846154,0.111111
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 40]",0.478261,0.063492,0.146199
male,"(40, 80]",0.28,0.095238,0.064516


In [35]:
# Move the innermost row-index level into the columns (level=-1 is the
# default of unstack, written out here). The name is re-bound, so running
# this cell a second time operates on the already-unstacked frame.
pivotTable = pivotTable.unstack(level=-1)
pivotTable

class,First,First,First,Second,Second,Second,Third,Third,Third
age,"(0, 18]","(18, 40]","(40, 80]","(0, 18]","(18, 40]","(40, 80]","(0, 18]","(18, 40]","(40, 80]"
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
female,0.909091,0.979167,0.961538,1.0,0.914894,0.846154,0.511628,0.48,0.111111
male,0.8,0.478261,0.28,0.6,0.063492,0.095238,0.215686,0.146199,0.064516


In [37]:
# Bring 'age' back from the column levels into the row index and order the
# row levels as (age, sex), leaving 'class' as the only column level.
# Levels are addressed by name rather than position: when the cells run top
# to bottom, the frame arriving here has a single-level 'sex' row index, and
# unstack(level=0) on such a frame collapses it into a Series instead of
# producing the (age, sex) x class table.
pivotTable = (
    pivotTable.stack(level='age')
              .swaplevel('sex', 'age')
              .sort_index()
)
pivotTable

Unnamed: 0_level_0,class,First,Second,Third
age,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0, 18]",female,0.909091,1.0,0.511628
"(0, 18]",male,0.8,0.6,0.215686
"(18, 40]",female,0.979167,0.914894,0.48
"(18, 40]",male,0.478261,0.063492,0.146199
"(40, 80]",female,0.961538,0.846154,0.111111
"(40, 80]",male,0.28,0.095238,0.064516


In [38]:
# Move row-index level 0 (the outermost level) into the columns.
# NOTE(review): the result depends on the shape pivotTable has when this cell
# runs, and the cell re-binds the name, so it is not idempotent. Confirm the
# output after a Restart & Run All.
pivotTable = pivotTable.unstack(level=0)
pivotTable

class,First,First,First,Second,Second,Second,Third,Third,Third
age,"(0, 18]","(18, 40]","(40, 80]","(0, 18]","(18, 40]","(40, 80]","(0, 18]","(18, 40]","(40, 80]"
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
female,0.909091,0.979167,0.961538,1.0,0.914894,0.846154,0.511628,0.48,0.111111
male,0.8,0.478261,0.28,0.6,0.063492,0.095238,0.215686,0.146199,0.064516


In [39]:
# stack() is the inverse of unstack(): it moves the innermost column level
# (level=-1, the default) back into the row index. The result is only
# displayed, not assigned.
pivotTable.stack(level=-1)

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 40]",0.979167,0.914894,0.48
female,"(40, 80]",0.961538,0.846154,0.111111
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 40]",0.478261,0.063492,0.146199
male,"(40, 80]",0.28,0.095238,0.064516
