In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.style.use('ggplot')

In [2]:
houston_hourly = pd.read_csv('./clean_data/clean_houston_hourly.csv')
cincy_hourly = pd.read_csv('./clean_data/clean_cincy_hourly.csv')
newyork_hourly = pd.read_csv('./clean_data/clean_newyork_hourly.csv')
atlanta_hourly = pd.read_csv('./clean_data/clean_atlanta_hourly.csv')

In [3]:
houston_hourly.head(2)

Unnamed: 0.1,Unnamed: 0,DATE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySeaLevelPressure,...,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,YEAR,MONTH,DAY,HOUR,CITY,DECADE
0,0,1948-07-01 00:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,315.0,,3.0,1948,7,1,0,Houston_TX,1940
1,1,1948-07-01 01:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,338.0,,6.0,1948,7,1,1,Houston_TX,1940


#### Drop_columns
A lot of features only start getting measured in the 1990s (say), which gives the model an unfair advantage. So let's drop them:

In [4]:
drop_columns = ['Unnamed: 0', 'HourlyPrecipitation','HourlyPressureChange','HourlyPressureTendency','HourlyWindGustSpeed', 
                'DATE','YEAR','CITY','DECADE']

# for the aggregated DF, which doesn't include "CITY" (since we use pd.get_dummies())
drop_columns2 = ['Unnamed: 0', 'HourlyPrecipitation','HourlyPressureChange','HourlyPressureTendency','HourlyWindGustSpeed', 
                'DATE','YEAR','DECADE']

#### Houston RandomForest

In [5]:
# set up X and Y
Xh = houston_hourly.drop(columns=drop_columns).fillna(0)
yh = houston_hourly['DECADE']

Xh_train, Xh_test, yh_train, yh_test = train_test_split(Xh,yh)

sc = StandardScaler()
Xh_train_sc = sc.fit_transform(Xh_train)
Xh_test_sc = sc.transform(Xh_test)

In [6]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xh_train_sc, yh_train)

RandomForestClassifier()

In [None]:
print(f"Houston RandomForest train score: {rf.score(Xh_train_sc, yh_train)}")
print(f"Houston RandomForest test score: {rf.score(Xh_test_sc, yh_test)}")

#### Cincinnati RandomForest

In [9]:
# set up X and Y
Xc = cincy_hourly.drop(columns=drop_columns).fillna(0)
yc = cincy_hourly['DECADE']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc,yc)

sc = StandardScaler()
Xc_train_sc = sc.fit_transform(Xc_train)
Xc_test_sc = sc.transform(Xc_test)

In [10]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xc_train_sc, yc_train)

RandomForestClassifier()

In [None]:
print(f"Cincinnati RandomForest train score: {rf.score(Xc_train_sc, yc_train)}")
print(f"Cincinnati RandomForest test score: {rf.score(Xc_test_sc, yc_test)}")

#### New York RandomForest

In [13]:
# set up X and Y
Xn = newyork_hourly.drop(columns=drop_columns).fillna(0)
yn = newyork_hourly['DECADE']

Xn_train, Xn_test, yn_train, yn_test = train_test_split(Xn,yn)

sc = StandardScaler()
Xn_train_sc = sc.fit_transform(Xn_train)
Xn_test_sc = sc.transform(Xn_test)

In [14]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xn_train_sc, yn_train)

RandomForestClassifier()

In [None]:
print(f"New York RandomForest train score: {rf.score(Xn_train_sc, yn_train)}")
print(f"New York RandomForest test score: {rf.score(Xn_test_sc, yn_test)}")

#### Atlanta RandomForest

In [17]:
# set up X and Y
Xa = atlanta_hourly.drop(columns=drop_columns).fillna(0)
ya = atlanta_hourly['DECADE']

Xa_train, Xa_test, ya_train, ya_test = train_test_split(Xa,ya)

sc = StandardScaler()
Xa_train_sc = sc.fit_transform(Xa_train)
Xa_test_sc = sc.transform(Xa_test)

In [18]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xa_train_sc, ya_train)

RandomForestClassifier()

In [None]:
print(f"Atlanta RandomForest train score: {rf.score(Xa_train_sc, ya_train)}")
print(f"Atlanta RandomForest test score: {rf.score(Xa_test_sc, ya_test)}")

In [21]:
all_hourly = pd.concat([houston_hourly,cincy_hourly,newyork_hourly,atlanta_hourly])
all_hourly.head()

Unnamed: 0.1,Unnamed: 0,DATE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySeaLevelPressure,...,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,YEAR,MONTH,DAY,HOUR,CITY,DECADE
0,0,1948-07-01 00:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,315.0,,3.0,1948,7,1,0,Houston_TX,1940
1,1,1948-07-01 01:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,338.0,,6.0,1948,7,1,1,Houston_TX,1940
2,2,1948-07-01 02:00:00,,293.705556,294.261111,0.0,,,97.0,30.04,...,293.705556,360.0,,5.0,1948,7,1,2,Houston_TX,1940
3,3,1948-07-01 03:00:00,,293.705556,294.261111,0.0,,,97.0,30.04,...,293.705556,45.0,,3.0,1948,7,1,3,Houston_TX,1940
4,4,1948-07-01 04:00:00,,293.15,293.705556,0.0,,,96.0,30.04,...,293.15,338.0,,8.0,1948,7,1,4,Houston_TX,1940


In [32]:
all_hourly.drop(columns=drop_columns2).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2490626 entries, 0 to 615064
Data columns (total 16 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   HourlyAltimeterSetting     float64
 1   HourlyDewPointTemperature  float64
 2   HourlyDryBulbTemperature   float64
 3   HourlyRelativeHumidity     float64
 4   HourlySeaLevelPressure     float64
 5   HourlyStationPressure      float64
 6   HourlyVisibility           float64
 7   HourlyWetBulbTemperature   float64
 8   HourlyWindDirection        float64
 9   HourlyWindSpeed            float64
 10  MONTH                      int64  
 11  DAY                        int64  
 12  HOUR                       int64  
 13  CITY_Cincinnati_KY         uint8  
 14  CITY_Houston_TX            uint8  
 15  CITY_NewYork_NY            uint8  
dtypes: float64(10), int64(3), uint8(3)
memory usage: 273.2 MB


In [22]:
all_hourly = pd.get_dummies(all_hourly, columns = ['CITY'], drop_first=True)

#### All cities RandomForest

In [23]:
# set up X and Y
Xall = all_hourly.drop(columns=drop_columns2).fillna(0)
yall = all_hourly['DECADE']

Xall_train, Xall_test, yall_train, yall_test = train_test_split(Xall,yall)

sc = StandardScaler()
Xall_train_sc = sc.fit_transform(Xall_train)
Xall_test_sc = sc.transform(Xall_test)

In [24]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xall_train_sc, yall_train)

RandomForestClassifier()

In [25]:
rf.score(Xall_train_sc, yall_train)

0.9999941112513109

In [26]:
rf.score(Xall_test_sc,yall_test)

0.8092336551263376

In [27]:
all_preds = rf.predict(Xall_test_sc)

In [28]:
from sklearn.metrics import multilabel_confusion_matrix

In [33]:
mcm = multilabel_confusion_matrix(all_preds,yall_test)
mcm

array([[[599590,  11161],
        [  1302,  10604]],

       [[507852,   5445],
        [ 27171,  82189]],

       [[554453,  15545],
        [ 10507,  42152]],

       [[548145,  18444],
        [  9296,  46772]],

       [[516153,  15901],
        [ 19500,  71103]],

       [[514205,  17555],
        [ 17620,  73277]],

       [[522170,  27475],
        [  9286,  63726]],

       [[477270,   7245],
        [ 24100, 114042]],

       [[622636,     11],
        [     0,     10]]], dtype=int64)

In [55]:
#mcm[0][1][1]
sum1=0
sum2=0
sum3=0
sum4=0

for i in range(len(mcm)):
    sum1 += mcm[i][0][0]
    sum2 += mcm[i][0][1]
    sum3 += mcm[i][1][0]
    sum4 += mcm[i][1][1]

total = sum1 + sum2 + sum3 + sum4
print(f"True negatives: {sum1/total}")
print(f"False negatives: {sum2/total}")
print(f"False positives: {sum3/total}")
print(f"True positives: {sum4/total}")

True negatives: 0.8676926283473708
False negatives: 0.021196260541518044
False positives: 0.021196260541518044
True positives: 0.08991485056959307


In [65]:
decades_list = [1940,1950,1960,1970,1980,1990,2000,2010,2020]
for i in range(len(mcm)):
    print(f"*******{decades_list[i]}*******")
    print(mcm[i])
    print('******************')


*******1940*******
[[599590  11161]
 [  1302  10604]]
******************
*******1950*******
[[507852   5445]
 [ 27171  82189]]
******************
*******1960*******
[[554453  15545]
 [ 10507  42152]]
******************
*******1970*******
[[548145  18444]
 [  9296  46772]]
******************
*******1980*******
[[516153  15901]
 [ 19500  71103]]
******************
*******1990*******
[[514205  17555]
 [ 17620  73277]]
******************
*******2000*******
[[522170  27475]
 [  9286  63726]]
******************
*******2010*******
[[477270   7245]
 [ 24100 114042]]
******************
*******2020*******
[[622636     11]
 [     0     10]]
******************
