### Data Manipulation and Analysis with Pandas

Data manipulation and analysis are key tasks in any data science or data analysis project. Pandas provides a
wide range of functions for data manipulation and analysis, making it easier to clean, transform, and extract
insights from data. In this lesson, we will cover various data manipulation and analysis techniques using Pandas.

In [3]:
import pandas as pd

In [15]:
df=pd.read_csv('final_dataset.csv')
## Fetching first 5 rows
df.head()

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263
3,4,1,2021,0,1,,132.08,153.98,10.42,1.01,49.19,207
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149


In [6]:
df.tail()

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI
1456,27,12,2024,0,5,58.43,249.17,41.69,65.89,0.99,36.25,263
1457,28,12,2024,0,6,33.83,150.77,33.31,66.14,0.79,35.19,113
1458,29,12,2024,1,7,31.21,139.75,27.01,65.94,0.57,35.88,142
1459,30,12,2024,0,1,38.01,152.83,29.12,65.16,0.55,38.38,116
1460,31,12,2024,0,2,80.42,318.96,40.37,64.98,0.84,39.93,209


In [7]:
df.describe()

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI
count,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0,1461.0
mean,15.729637,6.52293,2022.501027,0.189596,4.000684,90.774538,218.219261,37.184921,20.104921,1.025832,36.338871,202.210815
std,8.803105,3.449884,1.118723,0.392116,2.001883,71.650579,129.297734,35.225327,16.543659,0.608305,18.951204,107.801076
min,1.0,1.0,2021.0,0.0,1.0,0.05,9.69,2.16,1.21,0.27,2.7,19.0
25%,8.0,4.0,2022.0,0.0,2.0,41.28,115.11,17.28,7.71,0.61,24.1,108.0
50%,16.0,7.0,2023.0,0.0,4.0,72.06,199.8,30.49,15.43,0.85,32.47,189.0
75%,23.0,10.0,2024.0,0.0,6.0,118.5,297.75,45.01,26.62,1.24,45.73,284.0
max,31.0,12.0,2024.0,1.0,7.0,1000.0,1000.0,433.98,113.4,4.7,115.87,500.0


In [8]:
df.dtypes

Date                int64
Month               int64
Year                int64
Holidays_Count      int64
Days                int64
PM2.5             float64
PM10              float64
NO2               float64
SO2               float64
CO                float64
Ozone             float64
AQI                 int64
dtype: object

In [19]:
## handling missing values
df[df.isnull()].any(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
1456    False
1457    False
1458    False
1459    False
1460    False
Length: 1461, dtype: bool

In [18]:
df.isnull().sum()

Date              0
Month             0
Year              0
Holidays_Count    0
Days              0
PM2.5             1
PM10              0
NO2               0
SO2               0
CO                0
Ozone             0
AQI               0
dtype: int64

In [21]:
filled_datafram=df.fillna(0)

In [23]:
## Filling missing values with the mean of the column
df['filled_PM2.5']=df['PM2.5'].fillna(df['PM2.5'].mean())
df

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,filled_PM2.5
0,1,1,2021,0,5,408.80,442.42,160.61,12.95,2.77,43.19,462,408.800000
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.60,16.43,482,404.040000
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.40,44.29,263,225.070000
3,4,1,2021,0,1,,132.08,153.98,10.42,1.01,49.19,207,90.775377
4,5,1,2021,0,2,54.06,55.54,122.66,9.70,0.64,48.88,149,54.060000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,27,12,2024,0,5,58.43,249.17,41.69,65.89,0.99,36.25,263,58.430000
1457,28,12,2024,0,6,33.83,150.77,33.31,66.14,0.79,35.19,113,33.830000
1458,29,12,2024,1,7,31.21,139.75,27.01,65.94,0.57,35.88,142,31.210000
1459,30,12,2024,0,1,38.01,152.83,29.12,65.16,0.55,38.38,116,38.010000


In [24]:
df.dtypes

Date                int64
Month               int64
Year                int64
Holidays_Count      int64
Days                int64
PM2.5             float64
PM10              float64
NO2               float64
SO2               float64
CO                float64
Ozone             float64
AQI                 int64
filled_PM2.5      float64
dtype: object

In [26]:
## Renaming column
df=df.rename(columns={'Date':'Report_date'})
df.head()

Unnamed: 0,Report_date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,filled_PM2.5
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462,408.8
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482,404.04
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263,225.07
3,4,1,2021,0,1,,132.08,153.98,10.42,1.01,49.19,207,90.775377
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149,54.06


In [30]:
## Changing the datatype
df['AQI_new']=df['AQI'].astype(float)

In [32]:
df.head()

Unnamed: 0,Report_date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,filled_PM2.5,AQI_new
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462,408.8,462.0
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482,404.04,482.0
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263,225.07,263.0
3,4,1,2021,0,1,,132.08,153.98,10.42,1.01,49.19,207,90.775377,207.0
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149,54.06,149.0


In [33]:
df.dtypes

Report_date         int64
Month               int64
Year                int64
Holidays_Count      int64
Days                int64
PM2.5             float64
PM10              float64
NO2               float64
SO2               float64
CO                float64
Ozone             float64
AQI                 int64
filled_PM2.5      float64
AQI_new           float64
dtype: object

In [36]:
df['AQI_new']=df['AQI'].apply(lambda x:x*2)
df.head()

Unnamed: 0,Report_date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,filled_PM2.5,AQI_new
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462,408.8,924
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482,404.04,964
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263,225.07,526
3,4,1,2021,0,1,,132.08,153.98,10.42,1.01,49.19,207,90.775377,414
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149,54.06,298


In [37]:
## Data Aggregating And Frouping
df.head()

Unnamed: 0,Report_date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,filled_PM2.5,AQI_new
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462,408.8,924
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482,404.04,964
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263,225.07,526
3,4,1,2021,0,1,,132.08,153.98,10.42,1.01,49.19,207,90.775377,414
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149,54.06,298


In [38]:
grouped_mean=df.groupby('Report_date')['NO2'].mean()
print(grouped_mean)

Report_date
1     36.416458
2     33.323333
3     37.488958
4     36.022708
5     34.765417
6     32.333958
7     32.884167
8     33.867083
9     32.118125
10    35.219583
11    38.341250
12    38.301250
13    39.858333
14    43.189167
15    45.146667
16    43.660833
17    39.485208
18    39.085833
19    38.905208
20    39.716250
21    40.461458
22    38.502292
23    42.476458
24    44.107917
25    36.078125
26    34.836875
27    31.632708
28    31.598750
29    34.385111
30    30.845909
31    36.823214
Name: NO2, dtype: float64


In [None]:
f.groupby(['Report_date','PM10'])['NO2'].sum()

Report_date  PM10  
1            28.48     23.23
             43.38      8.89
             44.10     33.52
             56.38     31.54
             59.77     19.15
                       ...  
31           328.24    36.78
             333.32    29.29
             355.73    39.38
             420.52    21.21
             520.95    51.09
Name: NO2, Length: 1459, dtype: float64

In [4]:
## Merging and joining dataframe
df1=pd.DataFrame({'Key':['A','B','C'], 'Value1': [1,2,3]})
df2=pd.DataFrame({'Key':['A','B','D'], 'Value2': [4,5,6]})

In [5]:
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [6]:
df2

Unnamed: 0,Key,Value2
0,A,4
1,B,5
2,D,6


In [8]:
## Merging two dataframe on the key columns
pd.merge(df1,df2,on="Key",how="inner")

Unnamed: 0,Key,Value1,Value2
0,A,1,4
1,B,2,5
