In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a small demo frame: numeric column A and string column B,
# each with NaN gaps so the cleaning calls below have something to act on.
data = dict(
    A=[1, 2, 3, np.nan, 5, 6, 7],
    B=['a', 'b', np.nan, 'd', np.nan, 'f', 'g'],
)
df = pd.DataFrame(data)

### Some basic data cleaning 

In [3]:
# Show the whole frame; it has only seven rows
df

Unnamed: 0,A,B
0,1.0,a
1,2.0,b
2,3.0,
3,,d
4,5.0,
5,6.0,f
6,7.0,g


In [4]:
# Boolean mask with the same shape as df: True wherever a cell is null
df.isnull()

Unnamed: 0,A,B
0,False,False
1,False,False
2,False,True
3,True,False
4,False,True
5,False,False
6,False,False


In [5]:
# Drop every row that has at least one null cell.
# The result is returned as a new frame; df itself keeps all seven rows.
df.dropna()

Unnamed: 0,A,B
0,1.0,a
1,2.0,b
5,6.0,f
6,7.0,g


In [6]:
# Drop only the rows in which every cell is null.
# No such row exists here, so the result is identical to df.
df.dropna(how='all')

Unnamed: 0,A,B
0,1.0,a
1,2.0,b
2,3.0,
3,,d
4,5.0,
5,6.0,f
6,7.0,g


In [7]:
# Fill every null cell with a single value (0 here).
# Note that the integer 0 also lands in the string column B.
df.fillna(0)

Unnamed: 0,A,B
0,1.0,a
1,2.0,b
2,3.0,0
3,0.0,d
4,5.0,0
5,6.0,f
6,7.0,g


In [8]:
# Replace every cell equal to 1 with 100; all other cells, including nulls, are left as they are
df.replace(1, 100)

Unnamed: 0,A,B
0,100.0,a
1,2.0,b
2,3.0,
3,,d
4,5.0,
5,6.0,f
6,7.0,g


### Aggregating Function

In [9]:
# Read the 2015 happiness data from a CSV file (the path is relative to the
# working directory) and preview the first five rows
happiness_2015 = pd.read_csv('2015.csv')
happiness_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [10]:
# Column names, non-null counts and dtypes; every column is fully populated
happiness_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

##### Renaming columns in a `DataFrame`

In [11]:
# Swap the three long, parenthesised column labels for short ones
short_labels = {
    'Economy (GDP per Capita)': 'Economy',
    'Health (Life Expectancy)': 'Health',
    'Trust (Government Corruption)': 'Trust',
}
happiness_2015 = happiness_2015.rename(columns=short_labels)

In [12]:
# Preview the frame again to confirm the shortened column names
happiness_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [13]:
# Group the rows by their 'Region' value; the result is a GroupBy object to aggregate on
grouped = happiness_2015.groupby('Region')

In [14]:
# Printing a GroupBy object shows only its type and memory address, not the grouped rows
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B13D532F10>


In [15]:
# Number of rows in each group
print(grouped.size())

Region
Australia and New Zealand           2
Central and Eastern Europe         29
Eastern Asia                        6
Latin America and Caribbean        22
Middle East and Northern Africa    20
North America                       2
Southeastern Asia                   9
Southern Asia                       7
Sub-Saharan Africa                 40
Western Europe                     21
dtype: int64


In [16]:
# Mean of every numeric column per region (first five regions shown).
# numeric_only=True keeps the string column 'Country' out of the aggregation;
# recent pandas releases raise a TypeError for it instead of dropping it silently.
print(grouped.mean(numeric_only=True).head())

                                 Happiness Rank  Happiness Score  \
Region                                                             
Australia and New Zealand              9.500000         7.285000   
Central and Eastern Europe            79.000000         5.332931   
Eastern Asia                          64.500000         5.626167   
Latin America and Caribbean           46.909091         6.144682   
Middle East and Northern Africa       77.600000         5.406900   

                                 Standard Error   Economy    Family    Health  \
Region                                                                          
Australia and New Zealand              0.037270  1.291880  1.314450  0.919965   
Central and Eastern Europe             0.045208  0.942438  1.053042  0.718774   
Eastern Asia                           0.037225  1.151780  1.099427  0.877388   
Latin America and Caribbean            0.061127  0.876815  1.104720  0.703870   
Middle East and Northern Africa      

In [17]:
# Aggregate a single column: mean 'Happiness Score' per region
print(grouped['Happiness Score'].mean())

Region
Australia and New Zealand          7.285000
Central and Eastern Europe         5.332931
Eastern Asia                       5.626167
Latin America and Caribbean        6.144682
Middle East and Northern Africa    5.406900
North America                      7.273000
Southeastern Asia                  5.317444
Southern Asia                      4.580857
Sub-Saharan Africa                 4.202800
Western Europe                     6.689619
Name: Happiness Score, dtype: float64


In [18]:
# Aggregate several columns at once by selecting them with a list of names
print(grouped[['Happiness Score','Family','Freedom']].mean())

                                 Happiness Score    Family   Freedom
Region                                                              
Australia and New Zealand               7.285000  1.314450  0.645310
Central and Eastern Europe              5.332931  1.053042  0.358269
Eastern Asia                            5.626167  1.099427  0.462490
Latin America and Caribbean             6.144682  1.104720  0.501740
Middle East and Northern Africa         5.406900  0.920490  0.361751
North America                           7.273000  1.284860  0.589505
Southeastern Asia                       5.317444  0.940468  0.557104
Southern Asia                           4.580857  0.645321  0.373337
Sub-Saharan Africa                      4.202800  0.809085  0.365944
Western Europe                          6.689619  1.247302  0.549926


In [19]:
# reset_index() turns the 'Region' group labels back into an ordinary column
# and gives the aggregated frame a default integer index
print(grouped[['Happiness Score','Family','Freedom']].mean().reset_index())

                            Region  Happiness Score    Family   Freedom
0        Australia and New Zealand         7.285000  1.314450  0.645310
1       Central and Eastern Europe         5.332931  1.053042  0.358269
2                     Eastern Asia         5.626167  1.099427  0.462490
3      Latin America and Caribbean         6.144682  1.104720  0.501740
4  Middle East and Northern Africa         5.406900  0.920490  0.361751
5                    North America         7.273000  1.284860  0.589505
6                Southeastern Asia         5.317444  0.940468  0.557104
7                    Southern Asia         4.580857  0.645321  0.373337
8               Sub-Saharan Africa         4.202800  0.809085  0.365944
9                   Western Europe         6.689619  1.247302  0.549926


In [20]:
# Apply several aggregations to one column; each function in the list becomes its own output column.
# NOTE(review): newer pandas releases appear to prefer the string names 'min', 'max', 'mean' here,
# and the column labels (amin/amax vs min/max) may differ by version — confirm against the installed version.
print(grouped['Happiness Score'].agg([np.min, np.max, np.mean]))

                                  amin   amax      mean
Region                                                 
Australia and New Zealand        7.284  7.286  7.285000
Central and Eastern Europe       4.218  6.505  5.332931
Eastern Asia                     4.874  6.298  5.626167
Latin America and Caribbean      4.518  7.226  6.144682
Middle East and Northern Africa  3.006  7.278  5.406900
North America                    7.119  7.427  7.273000
Southeastern Asia                3.819  6.798  5.317444
Southern Asia                    3.575  5.253  4.580857
Sub-Saharan Africa               2.839  5.477  4.202800
Western Europe                   4.857  7.587  6.689619


### Combining Data 

In [21]:
#defining a custom function
def impact(element):
    """Label a value as 'High' or 'Low' against a fixed threshold of 1.

    Returns 'High' only when ``element`` is strictly greater than 1.
    Everything else, including exactly 1 and NaN (for which the
    comparison is false), is labelled 'Low'.
    """
    return 'High' if element > 1 else 'Low'

In [22]:
# Apply the custom function to each element of the 'Family' column.
# Series.map and Series.apply give the same labels for a one-argument function like this.
happiness_2015['Family_map'] = happiness_2015['Family'].map(impact)
happiness_2015['Family_apply'] = happiness_2015['Family'].apply(impact)

In [23]:
# Preview the frame with the two new label columns appended on the right
happiness_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia Residual,Family_map,Family_apply
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,High,High
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,High,High
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,High,High
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,High,High
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,High,High


In [24]:
# Apply the custom function to every cell of several columns at once
# NOTE(review): DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map — confirm before upgrading
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity']
happiness_rank = happiness_2015[factors].applymap(impact)
happiness_rank.head()

Unnamed: 0,Economy,Family,Health,Freedom,Trust,Generosity
0,High,High,Low,Low,Low,Low
1,High,High,Low,Low,Low,Low
2,High,High,Low,Low,Low,Low
3,High,High,Low,Low,Low,Low
4,High,High,Low,Low,Low,Low


In [25]:
def impact_new(element, x):
    """Label ``element`` as 'High' or 'Low' relative to the threshold ``x``.

    Parameters
    ----------
    element : value compared against the threshold
    x : threshold; only values strictly above it count as 'High'

    Returns
    -------
    str
        'High' if ``element > x``, otherwise 'Low'.
    """
    if not element > x:
        return 'Low'
    return 'High'

The difference between `map()` and `apply()` is that `apply()` can pass extra arguments on to the function, whereas `map()` cannot (at least in the pandas version used here).

In [26]:
# Series.map does not forward the extra keyword argument in the pandas version
# used here, so this call raises TypeError. The error is the point of the
# demonstration: catch and print it so the notebook still runs top to bottom.
try:
    happiness_new = happiness_2015['Family'].map(impact_new, x=0.8)
except TypeError as err:
    print('TypeError:', err)

TypeError: map() got an unexpected keyword argument 'x'

In [27]:
# Series.apply forwards the extra keyword argument (x=0.8) to impact_new,
# so every 'Family' value above 0.8 is labelled 'High'
happiness_new = happiness_2015['Family'].apply(impact_new, x=0.8)
happiness_new.head()

0    High
1    High
2    High
3    High
4    High
Name: Family, dtype: object