#### Importing dependencies

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Raise pandas' display limits so wide/long frames are not truncated as early
# when they are rendered in the notebook.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

#### Assigning paths to all the CSV files

In [2]:
# Directory holding the per-city CSV files; the trailing slash is required
# because the file names below are appended to it directly.
PATH_ALL = './assignment5/'

# One CSV path per city.
PATH_BEIJING = f'{PATH_ALL}Beijing.csv'
PATH_CHENGDU = f'{PATH_ALL}Chengdu.csv'
PATH_GUANGZHOU = f'{PATH_ALL}Guangzhou.csv'
PATH_SHANGHAI = f'{PATH_ALL}Shanghai.csv'
PATH_SHENYANG = f'{PATH_ALL}Shenyang.csv'

#### Read csv files into dataframes

In [4]:
# Load every city's CSV into its own DataFrame. The paths are read in the
# same order as the names on the left-hand side.
df_beijing, df_chengdu, df_guanghzou, df_shanghai, df_shenyang = (
    pd.read_csv(csv_path)
    for csv_path in (
        PATH_BEIJING,
        PATH_CHENGDU,
        PATH_GUANGZHOU,
        PATH_SHANGHAI,
        PATH_SHENYANG,
    )
)

#### Drop all NaN/missing values in each city's dataframe

In [22]:
# Replace each city's frame with a copy that has every row containing at
# least one NaN removed (DataFrame.dropna() with its default arguments).
df_beijing, df_chengdu, df_guanghzou, df_shanghai, df_shenyang = (
    city_df.dropna()
    for city_df in (df_beijing, df_chengdu, df_guanghzou, df_shanghai, df_shenyang)
)

#### Treat the column PM as our predictive objective

In [23]:
# Pull the 'PM' column out of every city's frame as the regression target.
# NOTE: the name `y_chendgu` (sic) is kept as-is because the training cell
# refers to it by that spelling.
y_beijing, y_chendgu, y_guanghzou, y_shanghai, y_shenyang = (
    city_df['PM']
    for city_df in (df_beijing, df_chengdu, df_guanghzou, df_shanghai, df_shenyang)
)

#### All other columns will be used as features when training our model

In [24]:
# Feature matrices: every column of the city's frame except the 'PM' target.
X_beijing = df_beijing.drop(columns=['PM'])
X_chengdu = df_chengdu.drop(columns=['PM'])
X_guanghzou = df_guanghzou.drop(columns=['PM'])
X_shanghai = df_shanghai.drop(columns=['PM'])
X_shenyang = df_shenyang.drop(columns=['PM'])

lr: linear regression model

Each city's data is split using scikit-learn's train_test_split() method (70% train, 30% test). The model is then trained on the training split using the fit() method. Finally, the r2 value (coefficient of determination) is calculated on the test split and printed out.

In [34]:
# Split settings shared by every city so the experiments are comparable.
TEST_SIZE = 0.3      # 70% of the rows go to training, 30% to testing
RANDOM_STATE = 42    # fixed seed so train_test_split is reproducible


def _fit_and_report(model, X, y, label):
    """Split one city's data, fit `model` on the training part and print R².

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict`` (here ``LinearRegression``).
        It is (re)fitted in place on the training split.
    X : features passed to ``train_test_split``.
    y : target passed to ``train_test_split``.
    label : str
        City name inserted into the printed message.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, y_pred)`` where ``y_pred`` are the
        model's predictions for ``X_test``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # R² is computed on the held-out test split only.
    print('Coefficient of determination for %s: %.2f' % (label, r2_score(y_test, y_pred)))
    return X_train, X_test, y_train, y_test, y_pred


# A single estimator is reused; each call to fit() re-estimates it on the
# city that was passed in, so after this cell `lr` holds the Shenyang fit.
lr = LinearRegression()

# The variable names and printed labels below keep their original spellings
# (e.g. `ganghzou_X_train`, 'Shenyan') so that any code referring to them and
# the recorded cell output continue to match.
(beijing_X_train, beijing_X_test, beijing_y_train, beijing_y_test,
 beijing_y_pred) = _fit_and_report(lr, X_beijing, y_beijing, 'Beijing')

(chengdu_X_train, chengdu_X_test, chengdu_y_train, chengdu_y_test,
 chengdu_y_pred) = _fit_and_report(lr, X_chengdu, y_chendgu, 'Chengdu')

(ganghzou_X_train, gangzhou_X_test, gangzhou_y_train, gangzhou_y_test,
 gangzhou_y_pred) = _fit_and_report(lr, X_guanghzou, y_guanghzou, 'Ganghzou')

(shanghai_X_train, shanghai_X_test, shanghai_y_train, shanghai_y_test,
 shanghai_y_pred) = _fit_and_report(lr, X_shanghai, y_shanghai, 'Shanghai')

(shenyan_X_train, shenyan_X_test, shenyan_y_train, shenyan_y_test,
 shenyan_y_pred) = _fit_and_report(lr, X_shenyang, y_shenyang, 'Shenyan')

Coefficient of determination for Beijing: 0.27
Coefficient of determination for Chengdu: 0.27
Coefficient of determination for Ganghzou: 0.14
Coefficient of determination for Shanghai: 0.17
Coefficient of determination for Shenyan: 0.22


#### **1.3 - 2**

I trained using Shenyang dataset and tested using Shanghai dataset.

In [73]:
# Re-create the 70/30 splits for both cities with the same seed as before.
split_kwargs = dict(test_size=0.3, random_state=42)
shanghai_X_train, shanghai_X_test, shanghai_y_train, shanghai_y_test = train_test_split(
    X_shanghai, y_shanghai, **split_kwargs
)
shenyan_X_train, shenyan_X_test, shenyan_y_train, shenyan_y_test = train_test_split(
    X_shenyang, y_shenyang, **split_kwargs
)

# Fit on Shenyang's training split only, then predict Shanghai's test split.
lr.fit(shenyan_X_train, shenyan_y_train)
shenyan_y_pred = lr.predict(shanghai_X_test)

# NOTE(review): the label says 'Shenyan', but this score compares the
# Shenyang-trained model's predictions against Shanghai's test targets.
cross_city_r2 = r2_score(shanghai_y_test, shenyan_y_pred)
print('Coefficient of determination for Shenyan: %.2f' % cross_city_r2)

Coefficient of determination for Shenyan: -29681.43


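A negative r2 value means the model predicts the Shanghai test data worse than simply predicting the mean of the test targets would. A value this far below zero denotes that something is terribly wrong/skewed when the model trained on Shenyang is applied to Shanghai.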

#### **1.3 - 3**

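I predicted Iws (cumulative wind speed), which showed the best performance. According to my understanding, for a time series dataset, an r2 value close to one is not a good sign at all. Based on this understanding, predicting Iws gave me the best performance (lowest r2), thus making it the easiest to predict. (Note: under the standard definition of r2, a higher value indicates a better fit, so an r2 of 0.06 means the linear model explains very little of the variance in Iws.)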

In [74]:
# Switch the Shanghai target from 'PM' to 'Iws'; every other column
# (including 'PM') now serves as a feature.
# NOTE(review): this rebinds `y_shanghai` / `X_shanghai`, which earlier cells
# defined with 'PM' as the target, so re-running those cells after this one
# will silently use the Iws versions.
y_shanghai = df_shanghai['Iws']
X_shanghai = df_shanghai.drop(columns=['Iws'])

# Same 70/30 split and seed as the other experiments.
shanghai_X_train, shanghai_X_test, shanghai_y_train, shanghai_y_test = train_test_split(
    X_shanghai, y_shanghai, test_size=0.3, random_state=42
)

# Refit the shared estimator on the new target and score it on the test split.
lr.fit(shanghai_X_train, shanghai_y_train)
shanghai_y_pred = lr.predict(shanghai_X_test)
iws_r2 = r2_score(shanghai_y_test, shanghai_y_pred)
print('Coefficient of determination for Shanghai: %.2f' % iws_r2)

Coefficient of determination for Shanghai: 0.06
