In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


train_weather = pd.read_csv('train_weather_week_lean.csv')

In [2]:
train_weather.drop(['Unnamed: 0', 'start date', 'Week_Number','mean_Tavg','Sunrise','Sunset'], axis=1, inplace=True)

In [3]:
train_weather.columns

Index([u'Mosquitos', u'WNV_Present', u'mean_Tmax', u'mean_Tmin',
       u'mean_Depart', u'mean_DewPoint', u'mean_WetBulb', u'mean_Heat',
       u'mean_Cool', u'mean_PrecipTotal', u'mean_StnPressure',
       u'mean_SeaLevel', u'mean_ResultSpeed', u'mean_ResultDir',
       u'mean_AvgSpeed', u'mean_Tdiff', u'Thunder_Storms', u'Hail', u'Rain',
       u'Drizzle', u'Snow', u'Fog', u'Mist', u'Haze', u'Fume', u'Squall',
       u'Shallow', u'Patches', u'Vicinity'],
      dtype='object')

In [4]:
train_weather.head()

Unnamed: 0,Mosquitos,WNV_Present,mean_Tmax,mean_Tmin,mean_Depart,mean_DewPoint,mean_WetBulb,mean_Heat,mean_Cool,mean_PrecipTotal,...,Drizzle,Snow,Fog,Mist,Haze,Fume,Squall,Shallow,Patches,Vicinity
0,11,0,67.5,54.166667,6.666667,47.5,54.0,4.333333,0.166667,0.226667,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
1,710,6,66.071429,47.261905,1.833333,42.238095,49.535714,9.047619,0.630952,0.094702,...,0.035714,0.0,0.011905,0.261905,0.095238,0.0,0.0,0.0,0.0,0.011905
2,800,5,69.723214,49.053571,2.571429,44.428571,51.794643,6.598214,1.223214,0.162009,...,0.053571,0.0,0.035714,0.339286,0.133929,0.0,0.0,0.0,0.0,0.0
3,656,9,70.928571,49.607143,1.053571,44.553571,52.366071,6.517857,2.053571,0.089643,...,0.071429,0.0,0.026786,0.3125,0.089286,0.0,0.0,0.0,0.0,0.017857
4,945,9,74.741071,54.321429,2.517857,48.294643,55.830357,4.1875,3.991071,0.156161,...,0.008929,0.0,0.017857,0.205357,0.053571,0.0,0.0,0.0,0.0,0.0


Below we are running a random forest on X as all the columns except WNV Present and Mosquitos and y being WNV Present.
The number of estimators is 50, the max features being considered is 25, random state is true, as is warm start

In [90]:
X = train_weather.drop(['WNV_Present', 'Mosquitos'],axis=1)
y = train_weather['WNV_Present']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33)

rf_train_weather = RandomForestClassifier(n_estimators=20)

rf_train_weather.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [91]:
y_pred = rf_train_weather.predict(X_test)
print confusion_matrix(y_test,y_pred) 


[[1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 2 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]]


In [92]:
print accuracy_score(y_pred, y_test) 

0.3


In [93]:
print classification_report(y_test,y_pred)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          3       0.00      0.00      0.00         0
          5       1.00      0.50      0.67         2
          6       0.00      0.00      0.00         1
          7       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         0
          9       1.00      1.00      1.00         1
         10       0.00      0.00      0.00         2
         12       0.00      0.00      0.00         1
         21       0.00      0.00      0.00         1

avg / total       0.40      0.30      0.33        10



In [94]:
rf_features = pd.DataFrame(rf_train_weather.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
rf_features.head(10)

Unnamed: 0,importance
mean_StnPressure,0.095398
mean_SeaLevel,0.089327
Thunder_Storms,0.075632
mean_Cool,0.074072
Mist,0.071153
mean_Tmax,0.065085
Rain,0.053913
Drizzle,0.053206
Haze,0.047207
mean_PrecipTotal,0.043867


Below we are runnning a random forest classifier with X as all the features in the dataframe excluding WNV Present and
Mosquitos and y as Mosquitos. 

In [101]:
X = train_weather.drop(['WNV_Present', 'Mosquitos'],axis=1)
y = train_weather['Mosquitos']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33)

rf_train_weather = RandomForestClassifier()

rf_train_weather.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [102]:
print accuracy_score(y_pred, y_test) 

0.0


In [103]:
y_pred = rf_train_weather.predict(X_test)
print confusion_matrix(y_test,y_pred) 

[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [104]:
print classification_report(y_test,y_pred)

             precision    recall  f1-score   support

        355       0.00      0.00      0.00         1
        656       0.00      0.00      0.00         1
        699       0.00      0.00      0.00         1
        710       0.00      0.00      0.00         0
        923       0.00      0.00      0.00         0
       1303       0.00      0.00      0.00         1
       1545       0.00      0.00      0.00         1
       1758       0.00      0.00      0.00         1
       1813       0.00      0.00      0.00         1
       1870       0.00      0.00      0.00         0
       1944       0.00      0.00      0.00         0
       2154       0.00      0.00      0.00         1
       2420       0.00      0.00      0.00         0
       2468       0.00      0.00      0.00         0
       2478       0.00      0.00      0.00         1
       2606       0.00      0.00      0.00         1
       2960       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00  

In [105]:
rf_features = pd.DataFrame(rf_train_weather.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
rf_features.head(10)

Unnamed: 0,importance
mean_ResultSpeed,0.087684
Thunder_Storms,0.067614
mean_WetBulb,0.066857
mean_SeaLevel,0.065621
Fog,0.053009
mean_Tmax,0.04685
mean_Cool,0.04631
mean_Tdiff,0.045701
mean_Tmin,0.045104
Drizzle,0.044563


In [108]:
X = train_weather.drop(['Mosquitos','WNV_Present'],axis=1)
y = train_weather['Mosquitos']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33)

rfr_train_weather= RandomForestRegressor(n_estimators=50)

rfr_train_weather.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [109]:
rfr_train_weather.score(X_test, y_test)


0.55248831274060883

In [110]:
rfr_features = pd.DataFrame(rfr_train_weather.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
rfr_features.head(10)

Unnamed: 0,importance
mean_AvgSpeed,0.343654
mean_ResultSpeed,0.141354
mean_StnPressure,0.084518
Rain,0.070337
mean_ResultDir,0.061133
mean_Tdiff,0.057621
mean_Depart,0.04024
Haze,0.038824
Mist,0.024759
mean_Heat,0.020872


In [66]:
X = train_weather.drop(['Mosquitos'],axis=1)
y = train_weather['Mosquitos']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33)

rfr_train_weather = RandomForestRegressor(max_features=28,n_estimators=10)

rfr_train_weather.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=28, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [65]:
rfr_train_weather.score(X_test, y_test)


0.21146905462471766

In [69]:
X = train_weather.drop(['WNV_Present'],axis=1)
y = train_weather['WNV_Present']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33)

rfr_train_weather = RandomForestRegressor()

rfr_train_weather.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [70]:
rfr_train_weather.score(X_test, y_test)

0.03529243937232529