In [85]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler
import copy
import time
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_graphviz

## Load train test from csv

In [86]:
train = pd.read_csv(r'weather_data/df_full_train.csv')
test = pd.read_csv(r'weather_data/df_full_test.csv')

In [87]:
REL_FEATURES = [' _conds', ' _dewptm', ' _fog', ' _hail', ' _hum', ' _pressurem', ' _rain', ' _snow',
                ' _thunder', ' _tornado', ' _vism', ' _wspdm', 'year', 'hour_cos', 'hour_sin', 'month_cos',
                'month_sin', 'week_cos', 'week_sin', 'night', 'morning', 'noon', 'evening']
LABEL = ['Temp']

In [88]:
X_train = train[REL_FEATURES]
y_train = train[LABEL]

In [89]:
X_test = test[REL_FEATURES]
y_test = test[LABEL]

In [90]:
df = pd.get_dummies(pd.concat([X_train, X_test], ignore_index=True)).fillna(0)

In [91]:
X_train = df.loc[X_train.index]
X_test = df.loc[X_test.index]

# Random Forest full data

In [92]:
clf = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=2)
clf.fit(X_train, y_train)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

## Random forest full train pred

In [53]:
# pred on train
y_pred_train = clf.predict(X_train)

In [54]:
# accuracy on train
clf.score(X_train, y_train)

0.2950866433716038

In [55]:
mean_absolute_error(y_train, y_pred_train)

1.1049704057619625

In [56]:
mean_squared_error(y_train, y_pred_train)

2.1645154389217716

## Random forest full test pred

In [57]:
# pred on test
y_pred_test = clf.predict(X_test)

In [58]:
# accuracy on test
clf.score(X_test, y_test)

0.11666666666666667

In [59]:
mean_squared_error(y_test, y_pred_test)

11.74375

# Random Forest day parts data

In [70]:
for part in ['night', 'morning', 'noon', 'evening']:
    print(fr'{part} data')
    
    ## Load train test from csv

    train_part = pd.read_csv(fr'weather_data/df_{part}_train.csv')
    test_part = pd.read_csv(fr'weather_data/df_{part}_test.csv')

    X_train_part = train_part[REL_FEATURES]
    y_train_part = train_part[LABEL]

    X_test_part = test_part[REL_FEATURES]
    y_test_part = test_part[LABEL]

    df = pd.get_dummies(pd.concat([X_train_part, X_test_part], ignore_index=True)).fillna(0)

    X_train_part = df.loc[X_train_part.index]
    X_test_part = df.loc[X_test_part.index]

    clf = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=2)
    clf.fit(X_train_part, y_train_part)

    ## Random forest train pred

    # pred on train
    y_pred_train_part = clf.predict(X_train_part)

    # accuracy on train
    print(clf.score(X_train_part, y_train_part))

    print(mean_squared_error(y_train_part, y_pred_train_part))

    ## Random forest test pred

    # pred on test
    y_pred_test_part = clf.predict(X_test_part)

    # accuracy on test
    print(clf.score(X_test_part, y_test_part))

    print(mean_squared_error(y_test_part, y_pred_test_part))

night data




0.43664772727272727
1.2167613636363637
0.15342465753424658
16.56986301369863
morning data




0.4560752994865944
1.4386765544780376
0.0881542699724518
8.96418732782369
noon data




0.33513821601595895
1.5531490453120547
0.08913649025069638
9.682451253481894
evening data




0.37155963302752293
1.3176605504587156
0.16147308781869688
12.070821529745043


# Random Forest months data

In [74]:
for m in range(1, 13):
    print(fr'month {m} data')
    
    ## Load train test from csv

    train_part = pd.read_csv(fr'weather_data/df_month_{m}_train.csv')
    test_part = pd.read_csv(fr'weather_data/df_month_{m}_test.csv')

    X_train_part = train_part[REL_FEATURES]
    y_train_part = train_part[LABEL]

    X_test_part = test_part[REL_FEATURES]
    y_test_part = test_part[LABEL]

    df = pd.get_dummies(pd.concat([X_train_part, X_test_part], ignore_index=True)).fillna(0)

    X_train_part = df.loc[X_train_part.index]
    X_test_part = df.loc[X_test_part.index]

    clf = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=2)
    clf.fit(X_train_part, y_train_part)

    ## Random forest train pred

    # pred on train
    y_pred_train_part = clf.predict(X_train_part)

    # accuracy on train
    print(clf.score(X_train_part, y_train_part))

    print(mean_squared_error(y_train_part, y_pred_train_part))

    ## Random forest test pred

    # pred on test
    y_pred_test_part = clf.predict(X_test_part)

    # accuracy on test
    print(clf.score(X_test_part, y_test_part))

    print(mean_squared_error(y_test_part, y_pred_test_part))

month 1 data




0.47483766233766234
0.9943181818181818
0.3629032258064516
1.596774193548387
month 2 data




0.38447488584474887
1.1433789954337898
0.2857142857142857
3.9642857142857144
month 3 data




0.35978358881875566
1.3985572587917043
0.25
5.362903225806452
month 4 data




0.44609665427509293
1.095724907063197
0.15254237288135594
6.203389830508475
month 5 data




0.4685251798561151
1.29136690647482
0.31092436974789917
3.7815126050420167
month 6 data




0.590633130962706
0.6513443191673894
0.3
2.941666666666667
month 7 data




0.5380875202593193
0.6150729335494327
0.2727272727272727
1.5950413223140496
month 8 data




0.6504770164787511
0.37294015611448394
0.3220338983050847
1.347457627118644
month 9 data




0.5507976490344249
0.5583543240973972
0.3445378151260504
1.2521008403361344
month 10 data




0.4560975609756098
0.8
0.32786885245901637
3.918032786885246
month 11 data




0.38083333333333336
1.0783333333333334
0.39166666666666666
1.2083333333333333
month 12 data




0.4168012924071082
0.9297253634894992
0.23577235772357724
4.154471544715447


# Random Forest seasons data

In [98]:
for s in ['winter', 'spring', 'summer', 'monsoon', 'autumn']:
    print(fr'season {s} data')
    
    ## Load train test from csv

    train_part = pd.read_csv(fr'weather_data/df_season_{s}_train.csv')
    test_part = pd.read_csv(fr'weather_data/df_season_{s}_test.csv')

    X_train_part = train_part[REL_FEATURES]
    y_train_part = train_part[LABEL]

    X_test_part = test_part[REL_FEATURES]
    y_test_part = test_part[LABEL]

    df = pd.get_dummies(pd.concat([X_train_part, X_test_part], ignore_index=True)).fillna(0)

    X_train_part = df.loc[X_train_part.index]
    X_test_part = df.loc[X_test_part.index]

    clf = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=2)
    clf.fit(X_train_part, y_train_part)

    ## Random forest train pred

    # pred on train
    y_pred_train_part = clf.predict(X_train_part)

    # accuracy on train
    print(clf.score(X_train_part, y_train_part))

    print(mean_squared_error(y_train_part, y_pred_train_part))

    ## Random forest test pred

    # pred on test
    y_pred_test_part = clf.predict(X_test_part)

    # accuracy on test
    print(clf.score(X_test_part, y_test_part))

    print(mean_squared_error(y_test_part, y_pred_test_part))

season winter data




0.49190283400809715
0.917004048582996
0.3076923076923077
3.3927125506072873
season spring data




0.3561705989110708
1.5040834845735027
0.2457627118644068
6.970338983050848
season summer data




0.40915893445076323
1.5483388207123616
0.3557422969187675
2.719887955182073
season monsoon data




0.5781879194630872
0.5359060402684563
0.3210702341137124
1.5250836120401339
season autumn data




0.40785997357992076
0.9815059445178336
0.27906976744186046
3.159468438538206


# Random Forest months + day part data

In [106]:
for m in range(1, 13):
    for p in ['night', 'morning', 'noon', 'evening']:
        print(fr'month {m} day part {p} data')

        ## Load train test from csv

        train_part = pd.read_csv(fr'weather_data/df_month_{m}_train.csv')
        test_part = pd.read_csv(fr'weather_data/df_month_{m}_test.csv')
        
        train_part = train_part[train_part[p] == 1].reset_index()
        test_part = test_part[test_part[p] == 1].reset_index()

        X_train_part = train_part[REL_FEATURES]
        y_train_part = train_part[LABEL]

        X_test_part = test_part[REL_FEATURES]
        y_test_part = test_part[LABEL]

        df = pd.get_dummies(pd.concat([X_train_part, X_test_part], ignore_index=True)).fillna(0)

        X_train_part = df.loc[X_train_part.index]
        X_test_part = df.loc[X_test_part.index]

        clf = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=2)
        clf.fit(X_train_part, y_train_part)

        ## Random forest train pred

        # pred on train
        y_pred_train_part = clf.predict(X_train_part)

        # accuracy on train
        print(clf.score(X_train_part, y_train_part))

        print(mean_squared_error(y_train_part, y_pred_train_part))

        ## Random forest test pred

        # pred on test
        y_pred_test_part = clf.predict(X_test_part)

        # accuracy on test
        print(clf.score(X_test_part, y_test_part))

        print(mean_squared_error(y_test_part, y_pred_test_part))

month 1 day part night data




0.6148867313915858
0.8090614886731392
0.5161290322580645
0.6774193548387096
month 1 day part morning data




0.4642857142857143
0.8766233766233766
0.12903225806451613
2.774193548387097
month 1 day part noon data




0.5439739413680782
0.6840390879478827
0.25806451612903225
2.161290322580645
month 1 day part evening data




0.6623376623376623
0.4448051948051948
0.5806451612903226
0.6129032258064516
month 2 day part night data




0.5543478260869565
0.7536231884057971
0.42857142857142855
1.7857142857142858
month 2 day part morning data




0.48717948717948717
0.9853479853479854
0.2857142857142857
3.0
month 2 day part noon data




0.47619047619047616
0.7875457875457875
0.2857142857142857
1.8571428571428572
month 2 day part evening data




0.5421245421245421
0.6556776556776557
0.42857142857142855
1.4285714285714286
month 3 day part night data




0.5519713261648745
0.7741935483870968
0.3225806451612903
1.1612903225806452
month 3 day part morning data




0.4659498207885305
1.093189964157706
0.3225806451612903
0.967741935483871
month 3 day part noon data




0.45126353790613716
1.0902527075812274
0.3870967741935484
1.5161290322580645
month 3 day part evening data




0.5255474452554745
0.6788321167883211
0.45161290322580644
0.8387096774193549
month 4 day part night data




0.5481481481481482
0.7185185185185186
0.3
2.1666666666666665
month 4 day part morning data




0.5185185185185185
1.3148148148148149
0.5517241379310345
1.3793103448275863
month 4 day part noon data




0.5259259259259259
1.162962962962963
0.3103448275862069
1.3103448275862069
month 4 day part evening data




0.4849624060150376
0.8909774436090225
0.26666666666666666
1.6
month 5 day part night data




0.5071942446043165
0.6546762589928058
0.5483870967741935
0.9354838709677419
month 5 day part morning data




0.6402877697841727
1.370503597122302
0.8709677419354839
0.12903225806451613
month 5 day part noon data




0.5627240143369175
1.4516129032258065
0.5666666666666667
1.0
month 5 day part evening data




0.5054151624548736
0.8050541516245487
0.3333333333333333
1.3333333333333333
month 6 day part night data




0.563573883161512
0.5910652920962199
0.36666666666666664
1.4666666666666666
month 6 day part morning data




0.6236933797909407
0.9512195121951219
0.6333333333333333
0.9666666666666667
month 6 day part noon data




0.5958188153310104
0.759581881533101
0.36666666666666664
2.033333333333333
month 6 day part evening data




0.5520833333333334
0.6840277777777778
0.3
1.3666666666666667
month 7 day part night data




0.6741935483870968
0.33548387096774196
0.4838709677419355
0.5161290322580645
month 7 day part morning data




0.6310679611650486
0.5275080906148867
0.5
1.2333333333333334
month 7 day part noon data




0.5612903225806452
0.5258064516129032
0.36666666666666664
1.4
month 7 day part evening data




0.6131147540983607
0.46557377049180326
0.4
1.0
month 8 day part night data




0.740484429065744
0.2698961937716263
0.4838709677419355
0.5161290322580645
month 8 day part morning data




0.6816608996539792
0.4429065743944637
0.45161290322580644
1.1290322580645162
month 8 day part noon data




0.6241379310344828
0.4379310344827586
0.42857142857142855
0.8928571428571429
month 8 day part evening data




0.631578947368421
0.37894736842105264
0.5714285714285714
0.42857142857142855
month 9 day part night data




0.6588628762541806
0.41471571906354515
0.6
0.4
month 9 day part morning data




0.6148648648648649
0.5878378378378378
0.43333333333333335
1.3666666666666667
month 9 day part noon data




0.587248322147651
0.5033557046979866
0.23333333333333334
1.7333333333333334
month 9 day part evening data




0.7248322147651006
0.31543624161073824
0.2413793103448276
0.8620689655172413
month 10 day part night data




0.5857605177993528
0.6019417475728155
0.2903225806451613
0.7096774193548387
month 10 day part morning data




0.5895765472312704
0.46905537459283386
0.5161290322580645
0.4838709677419355
month 10 day part noon data




0.5811688311688312
0.525974025974026
0.41935483870967744
1.7419354838709677
month 10 day part evening data




0.6568627450980392
0.43137254901960786
0.4482758620689655
0.8620689655172413
month 11 day part night data




0.5833333333333334
0.5666666666666667
0.7
0.8
month 11 day part morning data




0.5733333333333334
0.5166666666666667
0.5333333333333333
0.9333333333333333
month 11 day part noon data




0.5133333333333333
0.8
0.3
1.4
month 11 day part evening data




0.63
0.45
0.7333333333333333
0.36666666666666664
month 12 day part night data




0.5516129032258065
0.7419354838709677
0.4838709677419355
0.6129032258064516
month 12 day part morning data




0.5580645161290323
0.7774193548387097
0.45161290322580644
1.032258064516129
month 12 day part noon data




0.4967741935483871
0.7903225806451613
0.45161290322580644
2.193548387096774
month 12 day part evening data




0.5616883116883117
0.5064935064935064
0.36666666666666664
0.7333333333333333


# Random Forest seasons + day part data

In [107]:
for s in ['winter', 'spring', 'summer', 'monsoon', 'autumn']:
    for p in ['night', 'morning', 'noon', 'evening']:
        print(fr'season {s} day part {p} data')

        ## Load train test from csv

        train_part = pd.read_csv(fr'weather_data/df_season_{s}_train.csv')
        test_part = pd.read_csv(fr'weather_data/df_season_{s}_test.csv')
        
        train_part = train_part[train_part[p] == 1].reset_index()
        test_part = test_part[test_part[p] == 1].reset_index()

        X_train_part = train_part[REL_FEATURES]
        y_train_part = train_part[LABEL]

        X_test_part = test_part[REL_FEATURES]
        y_test_part = test_part[LABEL]

        df = pd.get_dummies(pd.concat([X_train_part, X_test_part], ignore_index=True)).fillna(0)

        X_train_part = df.loc[X_train_part.index]
        X_test_part = df.loc[X_test_part.index]

        clf = RandomForestClassifier(n_estimators=100, random_state=1, max_depth=2)
        clf.fit(X_train_part, y_train_part)

        ## Random forest train pred

        # pred on train
        y_pred_train_part = clf.predict(X_train_part)

        # accuracy on train
        print(clf.score(X_train_part, y_train_part))

        print(mean_squared_error(y_train_part, y_pred_train_part))

        ## Random forest test pred

        # pred on test
        y_pred_test_part = clf.predict(X_test_part)

        # accuracy on test
        print(clf.score(X_test_part, y_test_part))

        print(mean_squared_error(y_test_part, y_pred_test_part))

season winter day part night data




0.617124394184168
0.7334410339256866
0.5161290322580645
0.6774193548387096
season winter day part morning data




0.4449838187702265
1.0339805825242718
0.3064516129032258
2.274193548387097
season winter day part noon data




0.5121555915721232
0.713128038897893
0.3870967741935484
1.7580645161290323
season winter day part evening data




0.5292207792207793
0.5811688311688312
0.47540983606557374
0.6229508196721312
season spring day part night data




0.4666666666666667
0.9837837837837838
0.2542372881355932
3.0847457627118646
season spring day part morning data




0.4003623188405797
1.3315217391304348
0.1694915254237288
2.9152542372881354
season spring day part noon data




0.4309090909090909
1.2381818181818183
0.2542372881355932
3.23728813559322
season spring day part evening data




0.4625228519195612
0.9177330895795247
0.2033898305084746
3.0
season summer day part night data




0.45530393325387364
0.9892729439809297
0.27472527472527475
1.7252747252747254
season summer day part morning data




0.5473053892215569
1.4934131736526945
0.7888888888888889
0.4111111111111111
season summer day part noon data




0.4820574162679426
1.388755980861244
0.4044943820224719
1.4606741573033708
season summer day part evening data




0.4620938628158845
0.8796630565583634
0.28735632183908044
1.4482758620689655
season monsoon day part night data




0.6537433155080213
0.3622994652406417
0.44155844155844154
0.5584415584415584
season monsoon day part morning data




0.6327077747989276
0.46648793565683644
0.39473684210526316
1.25
season monsoon day part noon data




0.4839572192513369
0.7219251336898396
0.3287671232876712
1.273972602739726
season monsoon day part evening data




0.5338753387533876
0.6531165311653117
0.4520547945205479
0.6301369863013698
season autumn day part night data




0.5019762845849802
0.7957839262187089
0.5
0.6842105263157895
season autumn day part morning data




0.5231788079470199
0.6291390728476821
0.618421052631579
0.7894736842105263
season autumn day part noon data




0.39841688654353563
1.0303430079155673
0.34210526315789475
1.144736842105263
season autumn day part evening data




0.5396825396825397
0.5317460317460317
0.5753424657534246
0.589041095890411
