In [82]:
# import pandas and matplotlib
import pandas as pd
from matplotlib import pyplot as plt
from bokeh.io import output_notebook, show
from bokeh.layouts import column
from bokeh.plotting import figure, output_file, show, reset_output
from bokeh.models import SingleIntervalTicker, LinearAxis
output_notebook()

In [83]:
# reading the dataset (.csv file)
maindf = pd.read_csv("./CSV_files/AirQualityIndia.csv",  encoding = "latin1")

  interactivity=interactivity, compiler=compiler, result=result)


In [84]:
# a sample of five random rows from the dataset
maindf.sample(n=5)

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
350444,,7/9/2008,Tamil Nadu,Salem,,Residential and others,8.1,23.8,60.0,86.0,Sowdeswari College,,2008-09-07
14610,465.0,20/4/2011,Andhra Pradesh,Ramagundam,Andhra Pradesh State Pollution Control Board,"Residential, Rural and other Areas",6.0,9.0,45.0,135.0,"RTC Bus Depot, Karim Nagar, Ramagundam",,2011-04-20
187803,,24/12/2009,Madhya Pradesh,Ujjain,,Residential and others,7.6,9.7,54.0,90.0,Regional Office,,2009-12-24
2397,,23-12-05,Andhra Pradesh,Hyderabad,,Residential and others,5.4,27.0,104.0,278.0,Charminar,,2005-12-23
377115,,21/6/2006,Uttar Pradesh,Lucknow,,Residential and others,10.7,32.2,201.0,429.0,Aminabad,,2006-06-21


In [85]:
# dimensions of the dataset
maindf.shape

(435742, 13)

In [86]:
# datatype of values in each column
maindf.dtypes

stn_code                        object
sampling_date                   object
state                           object
location                        object
agency                          object
type                            object
so2                            float64
no2                            float64
rspm                           float64
spm                            float64
location_monitoring_station     object
pm2_5                          float64
date                            object
dtype: object

In [87]:
# number of unique values in each column
for col in maindf.columns.tolist():
    print("{}: {}".format(col, maindf[col].unique().size))

stn_code: 804
sampling_date: 5486
state: 37
location: 305
agency: 65
type: 11
so2: 4198
no2: 6865
rspm: 6066
spm: 6669
location_monitoring_station: 992
pm2_5: 434
date: 5068


In [88]:
# unique values in each column and their corresponding number of occurances
for col in maindf.columns:
    print(col)
    print(maindf[col].value_counts(dropna=False))
    print("\n")

stn_code
NaN        144077
193.0        1428
519.0        1280
708.0        1273
541.0        1270
710.0        1269
SAMP         1187
132          1180
61.0         1148
106.0        1125
268          1112
263.0        1106
34           1098
711.0        1092
117          1074
76.0         1066
35           1043
596.0        1041
271          1004
339           994
118           990
301.0         954
302.0         943
264.0         929
335.0         922
322.0         902
17.0          883
464.0         856
29.0          835
131.0         830
            ...  
173.0           9
179.0           9
230.0           9
10003.0         9
10002.0         9
20005.0         8
20001.0         8
244.0           8
798.0           7
175.0           7
261.0           6
211.0           6
141             6
30001.0         5
30003.0         5
30004.0         5
5.0             4
195.0           4
213.0           3
241.0           3
191.0           3
208.0           2
205.0           2
53.0            2
5

NaN             237387
 0.000000         1213
 83.000000         726
 175.000000        685
 169.000000        679
 106.000000        678
 144.000000        668
 179.000000        666
 185.000000        662
 172.000000        659
 195.000000        658
 153.000000        657
 159.000000        656
 118.000000        656
 95.000000         656
 107.000000        653
 155.000000        651
 192.000000        651
 164.000000        651
 101.000000        650
 108.000000        649
 88.000000         649
 115.000000        648
 187.000000        648
 183.000000        647
 160.000000        647
 174.000000        647
 162.000000        646
 147.000000        645
 103.000000        645
                 ...  
 316.796667          1
 411.666667          1
 371.876667          1
 50.750000           1
 1058.333333         1
 64.420000           1
 53.273333           1
 114.725000          1
 22.640000           1
 28.540000           1
 51.093333           1
 14.690000           1
 471.920000

In [89]:
maindf.drop(["stn_code", "agency", "location_monitoring_station", "pm2_5"], axis=1, inplace=True)

In [90]:
# We find that the only rows where 'sampling_date' is not equal to 'date' are the ones which have no data recorded. So we drop 
# those rows and one of the columns out of 'sampling_date' and 'date'
maindf.loc[maindf["sampling_date"]!=maindf["sampling_date"]]

Unnamed: 0,sampling_date,state,location,type,so2,no2,rspm,spm,date
435739,,andaman-and-nicobar-islands,,,,,,,
435740,,Lakshadweep,,,,,,,
435741,,Tripura,,,,,,,


In [91]:
# dropping 'sampling_date' column because the date in that column is represented in different formats whereas it is not so in 
# the 'date' column.
maindf.drop("sampling_date", axis=1, inplace=True)

# rename the 'date' column as 'sampling_date'
maindf.rename(columns={"date":"sampling_date"}, inplace=True)

In [92]:
# checking the dimensions and column names after dropping and renaming the columns
print(maindf.shape)
maindf.columns.tolist()

(435742, 8)


['state', 'location', 'type', 'so2', 'no2', 'rspm', 'spm', 'sampling_date']

In [93]:
# the states for which no information is available have only one row dedicated to them. So we delete those row where the
# value_counts() function for that particular state returns 1.
for state in maindf["state"].unique().tolist():
    if(maindf["state"].value_counts()[state]==1):
        maindf.drop(maindf[maindf["state"]==state].index, inplace=True)

In [94]:
# print the number of states remaining in the dataset
len(maindf["state"].unique().tolist())

33

In [95]:
# delete all the rows where no information is available
maindf = maindf.drop(maindf[(maindf["so2"].isnull()) & (maindf["no2"].isnull()) &
                   (maindf["rspm"].isnull()) & (maindf["spm"].isnull())].index)

In [96]:
maindf.loc[maindf["type"]=="RIRUO","type"] = "Residential, Rural and other Areas"
maindf.loc[maindf["type"]=="Industrial Area", "type"] = "Industrial"
maindf.loc[maindf["type"]=="Industrial Areas", "type"] = "Industrial"
maindf.loc[maindf["type"]=="Sensitive Area", "type"] = "Sensitive"
maindf.loc[maindf["type"]=="Sensitive Areas", "type"] = "Sensitive"
maindf.loc[maindf["type"]=="Residential and others", "type"] = "Residential"

In [97]:
maindf["type"].unique().tolist()

['Residential, Rural and other Areas',
 'Industrial',
 nan,
 'Sensitive',
 'Residential']

In [98]:
# coverting the datatype of 'sampling_date' from 'object' to 'datetime'
maindf["sampling_date"] = pd.to_datetime(maindf["sampling_date"], format="%Y/%m/%d")
print(maindf["sampling_date"].dtype)
print(maindf["sampling_date"].head())
print(maindf["sampling_date"].head().tolist())

datetime64[ns]
0   1990-02-01
1   1990-02-01
2   1990-02-01
3   1990-03-01
4   1990-03-01
Name: sampling_date, dtype: datetime64[ns]
[Timestamp('1990-02-01 00:00:00'), Timestamp('1990-02-01 00:00:00'), Timestamp('1990-02-01 00:00:00'), Timestamp('1990-03-01 00:00:00'), Timestamp('1990-03-01 00:00:00')]


In [99]:
# the last date of reporting
maindf["sampling_date"].max()

Timestamp('2015-12-31 00:00:00')

In [100]:
# the first date of reporting
maindf["sampling_date"].min()

Timestamp('1987-01-01 00:00:00')

In [101]:
# states and number of state which have data before 1990
print(maindf.loc[maindf["sampling_date"]<"1990-01-01"]["state"].unique())
print(maindf.loc[maindf["sampling_date"]<"1990-01-01"]["state"].unique().size)

['Bihar' 'Chandigarh' 'Daman & Diu' 'Delhi' 'Goa' 'Gujarat' 'Haryana'
 'Himachal Pradesh' 'Karnataka' 'Kerala' 'Madhya Pradesh' 'Maharashtra'
 'Odisha' 'Puducherry' 'Punjab' 'Rajasthan' 'Tamil Nadu' 'Uttar Pradesh'
 'West Bengal']
19


In [102]:
# states and number of state which have data between 1990 and 2000
print(maindf.loc[maindf["sampling_date"].between(left="1990-01-01", right="2000-01-01")]["state"].unique())
print(maindf.loc[maindf["sampling_date"].between(left="1990-01-01", right="2000-01-01")]["state"].unique().size)

['Andhra Pradesh' 'Assam' 'Bihar' 'Chandigarh' 'Chhattisgarh'
 'Dadra & Nagar Haveli' 'Daman & Diu' 'Delhi' 'Goa' 'Gujarat' 'Haryana'
 'Himachal Pradesh' 'Karnataka' 'Kerala' 'Madhya Pradesh' 'Maharashtra'
 'Meghalaya' 'Odisha' 'Puducherry' 'Punjab' 'Rajasthan' 'Tamil Nadu'
 'Uttar Pradesh' 'West Bengal']
24


In [103]:
# states and number of state which have data between 2010 and 2015
print(maindf.loc[maindf["sampling_date"].between(left="2010-01-01", right="2015-01-01")]["state"].unique())
print(maindf.loc[maindf["sampling_date"].between(left="2010-01-01", right="2015-01-01")]["state"].unique().size)

['Andhra Pradesh' 'Arunachal Pradesh' 'Assam' 'Bihar' 'Chandigarh'
 'Chhattisgarh' 'Dadra & Nagar Haveli' 'Daman & Diu' 'Delhi' 'Goa'
 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu & Kashmir' 'Jharkhand'
 'Karnataka' 'Kerala' 'Madhya Pradesh' 'Maharashtra' 'Meghalaya' 'Mizoram'
 'Nagaland' 'Odisha' 'Puducherry' 'Punjab' 'Rajasthan' 'Tamil Nadu'
 'Telangana' 'Uttar Pradesh' 'Uttarakhand' 'West Bengal']
31


In [104]:
print(maindf.loc[maindf["sampling_date"]<"2016-01-01"]["state"].unique())
print(maindf.loc[maindf["sampling_date"]<"2016-01-01"]["state"].unique().size)

['Andhra Pradesh' 'Arunachal Pradesh' 'Assam' 'Bihar' 'Chandigarh'
 'Chhattisgarh' 'Dadra & Nagar Haveli' 'Daman & Diu' 'Delhi' 'Goa'
 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu & Kashmir' 'Jharkhand'
 'Karnataka' 'Kerala' 'Madhya Pradesh' 'Maharashtra' 'Manipur' 'Meghalaya'
 'Mizoram' 'Nagaland' 'Odisha' 'Puducherry' 'Punjab' 'Rajasthan'
 'Tamil Nadu' 'Telangana' 'Uttar Pradesh' 'Uttarakhand' 'Uttaranchal'
 'West Bengal']
33


In [105]:
# first and last date of reporting fo each state
for state in maindf["state"].unique().tolist():
    print(state + ": ")
    print(maindf[maindf["state"]==state]["sampling_date"].min())
    print(maindf[maindf["state"]==state]["sampling_date"].max())
    print("\n")

Andhra Pradesh: 
1990-02-01 00:00:00
2015-12-28 00:00:00


Arunachal Pradesh: 
2014-05-08 00:00:00
2015-12-22 00:00:00


Assam: 
1991-01-01 00:00:00
2015-12-31 00:00:00


Bihar: 
1987-01-12 00:00:00
2012-12-31 00:00:00


Chandigarh: 
1989-02-01 00:00:00
2015-12-31 00:00:00


Chhattisgarh: 
1993-10-01 00:00:00
2015-12-31 00:00:00


Dadra & Nagar Haveli: 
1992-06-01 00:00:00
2015-12-30 00:00:00


Daman & Diu: 
1989-01-01 00:00:00
2015-12-30 00:00:00


Delhi: 
1987-01-09 00:00:00
2015-12-31 00:00:00


Goa: 
1987-01-04 00:00:00
2015-12-31 00:00:00


Gujarat: 
1987-01-01 00:00:00
2015-12-31 00:00:00


Haryana: 
1987-01-06 00:00:00
2015-04-30 00:00:00


Himachal Pradesh: 
1987-01-09 00:00:00
2015-12-31 00:00:00


Jammu & Kashmir: 
2009-01-06 00:00:00
2015-12-31 00:00:00


Jharkhand: 
2004-01-09 00:00:00
2015-12-30 00:00:00


Karnataka: 
1987-01-04 00:00:00
2015-12-31 00:00:00


Kerala: 
1987-03-10 00:00:00
2015-12-31 00:00:00


Madhya Pradesh: 
1988-01-07 00:00:00
2015-12-31 00:00:00


Mahar

In [106]:
# create dataframe containing 
df1 = maindf.groupby(by=["state", "location", "sampling_date"])["state", "location", "sampling_date", "so2", "no2", "rspm", "spm"].mean()

In [107]:
df1.shape

(304832, 4)

In [108]:
df1.dtypes

so2     float64
no2     float64
rspm    float64
spm     float64
dtype: object

In [109]:
df1.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,so2,no2,rspm,spm
state,location,sampling_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Himachal Pradesh,Paonta Sahib,2004-09-21,2.0,10.0,,307.0
Madhya Pradesh,Amlai,2015-03-25,17.0,28.0,,
Maharashtra,Nashik,2013-05-22,27.5,30.5,85.0,
Maharashtra,Latur,2010-09-27,5.0,14.0,126.666667,234.0
Maharashtra,Latur,2013-09-17,5.0,12.0,51.0,


## Analyze data from 1990-2000

In [110]:
df2 = df1.groupby(by=["state", "sampling_date"])["so2", "no2", "rspm", "spm"].mean()

In [111]:
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,so2,no2,rspm,spm
state,sampling_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Andhra Pradesh,1990-02-01,4.7,17.633333,,
Andhra Pradesh,1990-03-01,5.8,15.966667,,
Andhra Pradesh,1990-04-01,4.766667,16.266667,,
Andhra Pradesh,1990-05-01,3.8,13.75,,
Andhra Pradesh,1990-06-01,4.266667,15.066667,,108.666667


In [112]:
states = df2.index.get_level_values("state").unique().tolist()
print(int(len(states)/2))

d = []

for state in states:
    
    so2conc = df2.loc[(df2.index.get_level_values("sampling_date")>"1990") &
                  (df2.index.get_level_values("sampling_date")<"2000") &
                  (df2.index.get_level_values("state")==state)]["so2"].tolist()
    
    no2conc = df2.loc[(df2.index.get_level_values("sampling_date")>"1990") &
                  (df2.index.get_level_values("sampling_date")<"2000") &
                  (df2.index.get_level_values("state")==state)]["no2"].tolist()

    dates = df2.loc[(df2.index.get_level_values("sampling_date")>"1990") &
                    (df2.index.get_level_values("sampling_date")<"2000") &
                    (df2.index.get_level_values("state")==state)].index.get_level_values("sampling_date").date.tolist()
    
    p = figure(title = "concentration (2006-2015)", x_axis_type="datetime",
               x_axis_label=("Year (" + state + ")"), y_axis_label="Concentration (microgram per cubic meter)",
               plot_width=800, plot_height=250, tools = "pan,wheel_zoom,box_zoom,reset")
    
    p.line(dates, so2conc, line_width = 1, line_color="orange")
    p.line(dates, no2conc, line_width = 1)
    d.append(p)

show(column(d))

16


In [113]:
states = df2.index.get_level_values("state").unique().tolist()
print(int(len(states)/2))

d = []

for state in states:
    
    spm = df2.loc[(df2.index.get_level_values("sampling_date")>"1990") &
                  (df2.index.get_level_values("sampling_date")<"2000") &
                  (df2.index.get_level_values("state")==state)]["spm"].tolist()
    
    rspm = df2.loc[(df2.index.get_level_values("sampling_date")>"1990") &
                  (df2.index.get_level_values("sampling_date")<"2000") &
                  (df2.index.get_level_values("state")==state)]["rspm"].tolist()

    dates = df2.loc[(df2.index.get_level_values("sampling_date")>"1990") &
                    (df2.index.get_level_values("sampling_date")<"2000") &
                    (df2.index.get_level_values("state")==state)].index.get_level_values("sampling_date").date.tolist()
    
    #ax = fig.add_subplot(6,2,i)
    p = figure(title = "concentration (2006-2015)", x_axis_type="datetime",
               x_axis_label=("Year (" + state + ")"), y_axis_label="Concentration (microgram per cubic meter)",
               plot_width=800, plot_height=250, tools = "pan,wheel_zoom,box_zoom,reset")
    
    p.line(dates, spm, line_width = 1, line_color="orange")
    p.line(dates, rspm, line_width = 1)
    d.append(p)

show(column(d))

16


In [114]:
df2.count()

so2     78806
no2     82119
rspm    78370
spm     48073
dtype: int64

## Analyze data based on the type of place

In [115]:
df3 = maindf.groupby(by=["state", "type", "sampling_date"])["so2", "no2", "rspm", "spm"].mean()

In [116]:
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,so2,no2,rspm,spm
state,type,sampling_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Andhra Pradesh,Industrial,1990-02-01,3.1,7.0,,
Andhra Pradesh,Industrial,1990-03-01,4.7,7.5,,
Andhra Pradesh,Industrial,1990-04-01,4.7,8.7,,
Andhra Pradesh,Industrial,1990-05-01,4.0,8.9,,
Andhra Pradesh,Industrial,1990-06-01,5.6,11.8,,82.0


In [136]:
dates = df3.index.get_level_values("sampling_date").to_pydatetime().date()

TypeError: 'numpy.ndarray' object is not callable

In [137]:
dates

[datetime.date(1990, 2, 1),
 datetime.date(1990, 3, 1),
 datetime.date(1990, 4, 1),
 datetime.date(1990, 5, 1),
 datetime.date(1990, 6, 1),
 datetime.date(1990, 7, 1),
 datetime.date(1990, 8, 1),
 datetime.date(1990, 9, 1),
 datetime.date(1990, 10, 1),
 datetime.date(1990, 11, 1),
 datetime.date(1990, 12, 1),
 datetime.date(1991, 1, 1),
 datetime.date(1991, 2, 1),
 datetime.date(1991, 3, 1),
 datetime.date(1991, 4, 1),
 datetime.date(1991, 5, 1),
 datetime.date(1991, 6, 1),
 datetime.date(1991, 7, 1),
 datetime.date(1991, 8, 1),
 datetime.date(1991, 9, 1),
 datetime.date(1991, 10, 1),
 datetime.date(1991, 11, 1),
 datetime.date(1991, 12, 1),
 datetime.date(1992, 1, 1),
 datetime.date(1992, 2, 1),
 datetime.date(1992, 3, 1),
 datetime.date(1992, 4, 1),
 datetime.date(1992, 5, 1),
 datetime.date(1992, 6, 1),
 datetime.date(1992, 7, 1),
 datetime.date(1992, 8, 1),
 datetime.date(1992, 9, 1),
 datetime.date(1992, 10, 1),
 datetime.date(1992, 11, 1),
 datetime.date(1992, 12, 1),
 datetime.d

In [122]:
df3.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,so2,no2,rspm,spm
state,type,sampling_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Uttarakhand,Industrial,2015-01-28,26.0,29.0,194.0,
Assam,Residential,2009-01-13,9.88,20.02,187.8,296.8
Karnataka,"Residential, Rural and other Areas",2014-01-24,8.6,18.2,125.0,
Puducherry,"Residential, Rural and other Areas",2004-09-16,24.0,17.0,33.0,48.0
Himachal Pradesh,"Residential, Rural and other Areas",2010-03-05,2.0,13.5,110.0,


In [139]:
df3.index.names

FrozenList(['state', 'type', 'sampling_date'])