In [93]:
## small investigation into transfer learning and the PM2.5 dataset
import pandas as pd
import numpy as np
import sys
import os
import glob
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.linear_model as sklinear
import sklearn.ensemble as sklens
import matplotlib.pyplot as plt

def wind_direction_to_int(cbwd):
        if cbwd=="cv":
            return 0
        elif cbwd=="N":
            return 1
        elif cbwd=="NW":
            return 2
        elif cbwd=="W":
            return 3
        elif cbwd=="SW":
            return 4
        elif cbwd=="S":
            return 5
        elif cbwd=="SE":
            return 6
        elif cbwd=="E":
            return 7
        elif cbwd=="NE":
            return 8

def make_data_into_vector(row):
    #"cbwd","year","month","day","season","DEWP","TEMP","HUMI","PRES","Iws","precipitation","Iprec"
    x=np.zeros(12)
    x[0] = row['cbwd']
    x[1] = row['year']
    x[2] = row['month']
    x[3] = row['day']
    x[4] = row['season']
    x[5] = row['DEWP']
    x[6] = row['TEMP']
    x[7] = row['HUMI']
    x[8] = row['PRES']
    x[9] = row['Iws']
    x[10] = row['precipitation']
    x[11] = row['Iprec']
    return x
    
'''
No: row number
year: year of data in this row
month: month of data in this row
day: day of data in this row
hour: hour of data in this row
season: season of data in this row
PM_{placename}: PM2.5 concentration (ug/m^3)
DEWP: Dew Point (Celsius Degree)
TEMP: Temperature (Celsius Degree)
HUMI: Humidity (%)
PRES: Pressure (hPa)
cbwd: Combined wind direction
Iws: Cumulated wind speed (m/s)
precipitation: hourly precipitation (mm)
Iprec: Cumulated precipitation (mm)
'''

files = glob.glob("data/*.csv")

# load every csv into a dataframe
dfs = [pd.read_csv(f) for f in files]
"""
read in this order
------------------
data/ShanghaiPM20100101_20151231.csv
data/GuangzhouPM20100101_20151231.csv
data/BeijingPM20100101_20151231.csv
data/ShenyangPM20100101_20151231.csv
data/ChengduPM20100101_20151231.csv
"""
cities=["Shanghai","Guangzhou","Beijing","Shenyang","Chengdu"]

years=[2010,2011,2012,2013,2014,2015]
nonfeatures=["No","cbwd","year","month","day","hour","season"]
features2=["DEWP","TEMP","HUMI","PRES","Iws","precipitation","Iprec"]
features3=['PM_Dongsi','PM_Dongsihuan', 'PM_Nongzhanguan', 'PM_US Post']
ylabels=["PM2.5 concentration (ug/m^3)","PM2.5 concentration (ug/m^3)","PM2.5 concentration (ug/m^3)","Dew Point ($^{\circ{}}C$)","Temperature ($^{\circ{}}C$)","Humidity (%)"
         ,"Pressure (hPa)","Cumulated wind speed (m/s)","hourly precipitation (mm)",
         "Cumulated precipitation (mm)"]
ylabels2=["Dew Point ($^{\circ{}}C$)","Temperature ($^{\circ{}}C$)","Humidity (%)"
         ,"Pressure (hPa)","Cumulated wind speed (m/s)","hourly precipitation (mm)",
         "Cumulated precipitation (mm)"]

## remove outliers (mainly concerns Beijing and Chengdu)
for i in range(5):
        df=dfs[i]
        features=df.columns
        # fills NaN's with 0
        df=df.fillna(0)
        
        # remove datapoints outside 3 std
        for f in features:
            if f not in nonfeatures:    
                df=df[np.abs(df.loc[:,f]-df.loc[:,f].mean())<=(3*df.loc[:,f].std())]
                dfs[i]=df
        


## only take 15:00 every day
for i in range(5):
    df=dfs[i]
    dfs[i]=df[df["hour"]==15]


# change the winddirection to be integers    
for i in range(5):
    df=dfs[i]
    df["cbwd"]=df["cbwd"].apply(lambda x: wind_direction_to_int(x))
    dfs[i]=df

    
    ## normalise values here
for i in range(5):
    df=dfs[i]
    norm_df=(df-df.min())/(df.max()-df.min())
    dfs[i]=norm_df


    
## train classifier to predict city.


x_arrays = []
outcomes = []
for j in range(5):
    df=dfs[j]
    for index, row in df.iterrows():
            x = make_data_into_vector(row)
            x_arrays.append(x)
            outcomes.append(j)

X = np.stack(x_arrays, axis=0)
y = np.array(outcomes)

## split all dataframes into train and test set
X_train,X_test,y_train,y_test =train_test_split(X,y)

     
print(y_train)
random_forest = sklens.RandomForestRegressor().fit(X_train, y_train)
score_rf = random_forest.score(X_test, y_test)
print('random forest score', score_rf)
log_regressor = sklinear.LogisticRegression(random_state=1, max_iter=200).fit(X_train, y_train)
print('log cv score', np.mean(cross_val_score(sklinear.LogisticRegression(random_state=1, max_iter=200), X_test, y_test, cv=10)))   


   


[4 1 1 ... 0 0 2]
random forest score 0.7077492149461007
log cv score 0.5373124497991968
