In [1]:
import pandas as pd

### Load the data into a `pandas` DataFrame

In [2]:
# Read the CSV and keep only these six columns, ordered with the date
# fields (Month, Day) first.
airquality = pd.read_csv("airquality.csv")[
    ["Month", "Day", "Ozone", "Solar.R", "Temp", "Wind"]
]
airquality

Unnamed: 0,Month,Day,Ozone,Solar.R,Temp,Wind
0,5,1,41.0,190.0,67,7.4
1,5,2,36.0,118.0,72,8.0
2,5,3,12.0,149.0,74,12.6
3,5,4,18.0,313.0,62,11.5
4,5,5,,,56,14.3
...,...,...,...,...,...,...
148,9,26,30.0,193.0,70,6.9
149,9,27,,145.0,77,13.2
150,9,28,14.0,191.0,75,14.3
151,9,29,18.0,131.0,76,8.0


### Subsetting a column

In [3]:
# Single brackets with one column label return that column as a pandas Series
# (the output below carries a Name and a dtype rather than a column header).
airquality['Ozone']

0      41.0
1      36.0
2      12.0
3      18.0
4       NaN
       ... 
148    30.0
149     NaN
150    14.0
151    18.0
152    20.0
Name: Ozone, Length: 153, dtype: float64

### Note the difference with the code below: a list of column names returns a DataFrame, not a Series

In [4]:
# Double brackets pass a *list* of column labels, so the result is a
# one-column DataFrame rather than a Series.
airquality[['Ozone']]

Unnamed: 0,Ozone
0,41.0
1,36.0
2,12.0
3,18.0
4,
...,...
148,30.0
149,
150,14.0
151,18.0


### Subsetting more than one column

In [5]:
# A list of several labels selects those columns, in the order listed,
# and returns them as a DataFrame.
airquality[['Ozone', 'Temp']]

Unnamed: 0,Ozone,Temp
0,41.0,67
1,36.0,72
2,12.0,74
3,18.0,62
4,,56
...,...,...
148,30.0,70
149,,77
150,14.0,75
151,18.0,76


### Subsetting rows

In [6]:
# Select the first five rows by position.
airquality.iloc[:5]

Unnamed: 0,Month,Day,Ozone,Solar.R,Temp,Wind
0,5,1,41.0,190.0,67,7.4
1,5,2,36.0,118.0,72,8.0
2,5,3,12.0,149.0,74,12.6
3,5,4,18.0,313.0,62,11.5
4,5,5,,,56,14.3


### Handling missing values

In [7]:
# Preview the first ten rows; the empty Ozone / Solar.R entries in the
# output are missing values (NaN).
airquality.head(n=10)

Unnamed: 0,Month,Day,Ozone,Solar.R,Temp,Wind
0,5,1,41.0,190.0,67,7.4
1,5,2,36.0,118.0,72,8.0
2,5,3,12.0,149.0,74,12.6
3,5,4,18.0,313.0,62,11.5
4,5,5,,,56,14.3
5,5,6,28.0,,66,14.9
6,5,7,23.0,299.0,65,8.6
7,5,8,19.0,99.0,59,13.8
8,5,9,8.0,19.0,61,20.1
9,5,10,,194.0,69,8.6


In [8]:
# Option 1: discard every row that has at least one missing value.
# The surviving rows keep their original index labels, so the index has gaps.
airquality_no_na = airquality.dropna(axis=0, how="any")
airquality_no_na.head(n=10)

Unnamed: 0,Month,Day,Ozone,Solar.R,Temp,Wind
0,5,1,41.0,190.0,67,7.4
1,5,2,36.0,118.0,72,8.0
2,5,3,12.0,149.0,74,12.6
3,5,4,18.0,313.0,62,11.5
6,5,7,23.0,299.0,65,8.6
7,5,8,19.0,99.0,59,13.8
8,5,9,8.0,19.0,61,20.1
11,5,12,16.0,256.0,69,9.7
12,5,13,11.0,290.0,66,9.2
13,5,14,14.0,274.0,68,10.9


In [9]:
# Option 2: fill each missing value by linear interpolation down its column.
# method="linear" treats consecutive rows as equally spaced and ignores the
# index values.
airquality_linear = airquality.interpolate(method="linear", axis=0)
airquality_linear.head(n=10)

Unnamed: 0,Month,Day,Ozone,Solar.R,Temp,Wind
0,5,1,41.0,190.0,67,7.4
1,5,2,36.0,118.0,72,8.0
2,5,3,12.0,149.0,74,12.6
3,5,4,18.0,313.0,62,11.5
4,5,5,23.0,308.333333,56,14.3
5,5,6,28.0,303.666667,66,14.9
6,5,7,23.0,299.0,65,8.6
7,5,8,19.0,99.0,59,13.8
8,5,9,8.0,19.0,61,20.1
9,5,10,7.5,194.0,69,8.6


### Joining data sets

In [10]:
# Load both tables with their first column as the row index, so the joins
# below can align the two tables on that index.
cases_df = pd.read_csv("covid_new_cases.csv", index_col=0)
deaths_df = pd.read_csv("covid_new_deaths.csv", index_col=0)
cases_df

Unnamed: 0_level_0,new_cases
date,Unnamed: 1_level_1
2020-01-30,2
2020-01-31,0
2020-02-01,0
2020-02-02,0
2020-02-03,0
...,...
2020-10-25,15654
2020-10-26,26467
2020-10-27,23757
2020-10-28,22887


In [11]:
# Display the deaths table. Its first date (2020-02-29) is later than the
# first date in cases_df (2020-01-30), so the two indexes do not fully overlap.
deaths_df

Unnamed: 0_level_0,new_deaths
date,Unnamed: 1_level_1
2020-02-29,0
2020-03-01,0
2020-03-02,1
2020-03-03,2
2020-03-04,0
...,...
2020-10-25,234
2020-10-26,253
2020-10-27,227
2020-10-28,216


In [12]:
# Left join (the default for DataFrame.join): keep every date in cases_df.
# Dates with no match in deaths_df get NaN in new_deaths, which is why that
# column is displayed as float.
df_1 = cases_df.join(deaths_df, how="left")
df_1

Unnamed: 0_level_0,new_cases,new_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-30,2,
2020-01-31,0,
2020-02-01,0,
2020-02-02,0,
2020-02-03,0,
...,...,...
2020-10-25,15654,234.0
2020-10-26,26467,253.0
2020-10-27,23757,227.0
2020-10-28,22887,216.0


In [13]:
# Right join: keep every date in deaths_df and attach the matching
# new_cases value from cases_df.
df_2 = cases_df.join(deaths_df, how="right")
df_2

Unnamed: 0_level_0,new_cases,new_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-29,5,0
2020-03-01,22,0
2020-03-02,40,1
2020-03-03,56,2
2020-03-04,56,0
...,...,...
2020-10-25,15654,234
2020-10-26,26467,253
2020-10-27,23757,227
2020-10-28,22887,216


In [14]:
# Inner join: keep only the dates that appear in both tables.
df_3 = cases_df.join(deaths_df, how="inner")
df_3

Unnamed: 0_level_0,new_cases,new_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-29,5,0
2020-03-01,22,0
2020-03-02,40,1
2020-03-03,56,2
2020-03-04,56,0
...,...,...
2020-10-25,15654,234
2020-10-26,26467,253
2020-10-27,23757,227
2020-10-28,22887,216


In [15]:
# Outer join: keep the union of dates from both tables; a date missing
# from one table gets NaN in that table's column.
df_4 = cases_df.join(deaths_df, how="outer")
df_4

Unnamed: 0_level_0,new_cases,new_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-30,2,
2020-01-31,0,
2020-02-01,0,
2020-02-02,0,
2020-02-03,0,
...,...,...
2020-10-25,15654,234.0
2020-10-26,26467,253.0
2020-10-27,23757,227.0
2020-10-28,22887,216.0
