In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.style.use('ggplot')

In [2]:
houston_hourly = pd.read_csv('./clean_data/clean_houston_hourly.csv')
cincy_hourly = pd.read_csv('./clean_data/clean_cincy_hourly.csv')
newyork_hourly = pd.read_csv('./clean_data/clean_newyork_hourly.csv')
atlanta_hourly = pd.read_csv('./clean_data/clean_atlanta_hourly.csv')

In [3]:
houston_hourly.head(2)

Unnamed: 0.1,Unnamed: 0,DATE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySeaLevelPressure,...,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,YEAR,MONTH,DAY,HOUR,CITY,DECADE
0,0,1948-07-01 00:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,315.0,,3.0,1948,7,1,0,Houston_TX,1940
1,1,1948-07-01 01:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,338.0,,6.0,1948,7,1,1,Houston_TX,1940


#### Drop_columns
A lot of features only start getting measured in the 1990s (say), which gives the model an unfair advantage. So let's drop them:

In [4]:
drop_columns = ['Unnamed: 0', 'HourlyPrecipitation','HourlyPressureChange','HourlyPressureTendency','HourlyWindGustSpeed', 
                'DATE','YEAR','CITY','DECADE']

# for the aggregated DF, which doesn't include "CITY" (since we use pd.get_dummies())
drop_columns2 = ['Unnamed: 0', 'HourlyPrecipitation','HourlyPressureChange','HourlyPressureTendency','HourlyWindGustSpeed', 
                'DATE','YEAR','DECADE']

#### Houston RandomForest

In [5]:
# set up X and Y
Xh = houston_hourly.drop(columns=drop_columns).fillna(0)
yh = houston_hourly['DECADE']

Xh_train, Xh_test, yh_train, yh_test = train_test_split(Xh,yh)

sc = StandardScaler()
Xh_train_sc = sc.fit_transform(Xh_train)
Xh_test_sc = sc.transform(Xh_test)

In [6]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xh_train_sc, yh_train)

RandomForestClassifier()

In [7]:
print(f"Houston RandomForest train score: {rf.score(Xh_train_sc, yh_train)}")
print(f"Houston RandomForest test score: {rf.score(Xh_test_sc, yh_test)}")

Houston RandomForest train score: 0.9999842852717077
Houston RandomForest test score: 0.8368680167832921


#### Cincinnati RandomForest

In [8]:
# set up X and Y
Xc = cincy_hourly.drop(columns=drop_columns).fillna(0)
yc = cincy_hourly['DECADE']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc,yc)

sc = StandardScaler()
Xc_train_sc = sc.fit_transform(Xc_train)
Xc_test_sc = sc.transform(Xc_test)

In [9]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xc_train_sc, yc_train)

RandomForestClassifier()

In [10]:
print(f"Cincinnati RandomForest train score: {rf.score(Xc_train_sc, yc_train)}")
print(f"Cincinnati RandomForest test score: {rf.score(Xc_test_sc, yc_test)}")

Cincinnati RandomForest train score: 1.0
Cincinnati RandomForest test score: 0.7938100506973226


#### New York RandomForest

In [11]:
# set up X and Y
Xn = newyork_hourly.drop(columns=drop_columns).fillna(0)
yn = newyork_hourly['DECADE']

Xn_train, Xn_test, yn_train, yn_test = train_test_split(Xn,yn)

sc = StandardScaler()
Xn_train_sc = sc.fit_transform(Xn_train)
Xn_test_sc = sc.transform(Xn_test)

In [12]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xn_train_sc, yn_train)

RandomForestClassifier()

In [13]:
print(f"New York RandomForest train score: {rf.score(Xn_train_sc, yn_train)}")
print(f"New York RandomForest test score: {rf.score(Xn_test_sc, yn_test)}")

New York RandomForest train score: 1.0
New York RandomForest test score: 0.796161828958932


#### Atlanta RandomForest

In [14]:
# set up X and Y
Xa = atlanta_hourly.drop(columns=drop_columns).fillna(0)
ya = atlanta_hourly['DECADE']

Xa_train, Xa_test, ya_train, ya_test = train_test_split(Xa,ya)

sc = StandardScaler()
Xa_train_sc = sc.fit_transform(Xa_train)
Xa_test_sc = sc.transform(Xa_test)

In [15]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xa_train_sc, ya_train)

RandomForestClassifier()

In [16]:
print(f"Atlanta RandomForest train score: {rf.score(Xa_train_sc, ya_train)}")
print(f"Atlanta RandomForest test score: {rf.score(Xa_test_sc, ya_test)}")

Atlanta RandomForest train score: 1.0
Atlanta RandomForest test score: 0.8131198501629088


In [17]:
all_hourly = pd.concat([houston_hourly,cincy_hourly,newyork_hourly,atlanta_hourly])
all_hourly.head()

Unnamed: 0.1,Unnamed: 0,DATE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPressureChange,HourlyPressureTendency,HourlyRelativeHumidity,HourlySeaLevelPressure,...,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,YEAR,MONTH,DAY,HOUR,CITY,DECADE
0,0,1948-07-01 00:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,315.0,,3.0,1948,7,1,0,Houston_TX,1940
1,1,1948-07-01 01:00:00,,293.705556,294.261111,0.0,,,97.0,30.05,...,293.705556,338.0,,6.0,1948,7,1,1,Houston_TX,1940
2,2,1948-07-01 02:00:00,,293.705556,294.261111,0.0,,,97.0,30.04,...,293.705556,360.0,,5.0,1948,7,1,2,Houston_TX,1940
3,3,1948-07-01 03:00:00,,293.705556,294.261111,0.0,,,97.0,30.04,...,293.705556,45.0,,3.0,1948,7,1,3,Houston_TX,1940
4,4,1948-07-01 04:00:00,,293.15,293.705556,0.0,,,96.0,30.04,...,293.15,338.0,,8.0,1948,7,1,4,Houston_TX,1940


In [18]:
all_hourly.drop(columns=drop_columns2).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2490626 entries, 0 to 615064
Data columns (total 14 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   HourlyAltimeterSetting     float64
 1   HourlyDewPointTemperature  float64
 2   HourlyDryBulbTemperature   float64
 3   HourlyRelativeHumidity     float64
 4   HourlySeaLevelPressure     float64
 5   HourlyStationPressure      float64
 6   HourlyVisibility           float64
 7   HourlyWetBulbTemperature   float64
 8   HourlyWindDirection        float64
 9   HourlyWindSpeed            float64
 10  MONTH                      int64  
 11  DAY                        int64  
 12  HOUR                       int64  
 13  CITY                       object 
dtypes: float64(10), int64(3), object(1)
memory usage: 285.0+ MB


In [19]:
all_hourly = pd.get_dummies(all_hourly, columns = ['CITY'], drop_first=True)

#### All cities RandomForest

In [20]:
# set up X and Y
Xall = all_hourly.drop(columns=drop_columns2).fillna(0)
yall = all_hourly['DECADE']

Xall_train, Xall_test, yall_train, yall_test = train_test_split(Xall,yall)

sc = StandardScaler()
Xall_train_sc = sc.fit_transform(Xall_train)
Xall_test_sc = sc.transform(Xall_test)

In [21]:
# instantiate regressor
rf = RandomForestClassifier()

rf.fit(Xall_train_sc, yall_train)

RandomForestClassifier()

In [23]:
print(f"Combined cities RandomForest train score: {rf.score(Xall_train_sc, yall_train)}")
print(f"Combined cities RandomForest test score: {rf.score(Xall_test_sc, yall_test)}")

Combined cities RandomForest train score: 0.999993575910521
Combined cities RandomForest test score: 0.8085109458337415


In [24]:
all_preds = rf.predict(Xall_test_sc)

In [31]:
from sklearn.metrics import multilabel_confusion_matrix

mcm = multilabel_confusion_matrix(all_preds,yall_test)

decades_list = [1940,1950,1960,1970,1980,1990,2000,2010,2020]
for i in range(len(mcm)):
    print(f"*******{decades_list[i]}*******")
    print(mcm[i])
    print('******************')


array([[[599525,  11026],
        [  1348,  10758]],

       [[507641,   5357],
        [ 27345,  82314]],

       [[554482,  15697],
        [ 10288,  42190]],

       [[548079,  18495],
        [  8967,  47116]],

       [[515951,  15868],
        [ 19978,  70860]],

       [[514634,  17843],
        [ 17436,  72744]],

       [[521910,  27559],
        [  9549,  63639]],

       [[477175,   7373],
        [ 24321, 113788]],

       [[622627,     14],
        [     0,     16]]], dtype=int64)

In [27]:
sum1=0
sum2=0
sum3=0
sum4=0

for i in range(len(mcm)):
    sum1 += mcm[i][0][0]
    sum2 += mcm[i][0][1]
    sum3 += mcm[i][1][0]
    sum4 += mcm[i][1][1]

total = sum1 + sum2 + sum3 + sum4
print(f"True positives: {sum1/total}")
print(f"False positives: {sum2/total}")
print(f"False negatives: {sum3/total}")
print(f"True negatives: {sum4/total}")

True positives: 0.8676123273148602
False positives: 0.02127656157402872
False negatives: 0.02127656157402872
True negatives: 0.08983454953708239


In [29]:
decades_list = [1940,1950,1960,1970,1980,1990,2000,2010,2020]
for i in range(len(mcm)):
    print(f"*******{decades_list[i]}*******")
    print(mcm[i])
    print('******************')


*******1940*******
[[599525   1348]
 [ 11026  10758]]
******************
*******1950*******
[[507641  27345]
 [  5357  82314]]
******************
*******1960*******
[[554482  10288]
 [ 15697  42190]]
******************
*******1970*******
[[548079   8967]
 [ 18495  47116]]
******************
*******1980*******
[[515951  19978]
 [ 15868  70860]]
******************
*******1990*******
[[514634  17436]
 [ 17843  72744]]
******************
*******2000*******
[[521910   9549]
 [ 27559  63639]]
******************
*******2010*******
[[477175  24321]
 [  7373 113788]]
******************
*******2020*******
[[622627      0]
 [    14     16]]
******************
