In [81]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
train_df = pd.read_csv('./kaggle_data/train.csv')
weather_df = pd.read_csv('./kaggle_data/weather.csv')
spray_df = pd.read_csv('./kaggle_data/spray.csv')

In [3]:
train_df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [4]:
weather_df.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


Need to better understand `spray_df` and identify where weather stations 1 and 2 are.  All dataframes include date data, so the next step is to figure out how to appropriately combine them.

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
#sns.heatmap(train_df.corr(), annot=True)

Unsurprisingly, the likelihood of WNV increases as the number of mosquitos increase.

In [7]:
#print(train_df.shape)
#train_df.isnull().sum()

In [8]:
#print(spray_df.shape)
#spray_df.isnull().sum()
                 ## We will need to deal with these time nulls, but it may make sense to drop the time column
                  # since the other dfs dont have time

In [9]:
#print(spray_df.shape)
#weather_df.isnull().sum()

In [10]:
spray_df.drop('Time', axis = 1, inplace = True) # dropping time from spray data because it is not in any other dfs

In [11]:
#spray_df.head()

In [12]:
daily_weather = weather_df[weather_df['Station'] == 1] # creating weather data df using only one station

In [13]:
daily_weather.drop('Station', axis= 1, inplace=True) # dropping station label since all are station 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [14]:
daily_weather.reset_index(inplace=True, drop=True) 

daily_weather['Date'] = pd.to_datetime(daily_weather['Date']) 

daily_weather.set_index('Date',inplace=True, drop=True) # datetime index on weather data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
train_df['Date'] = pd.to_datetime(train_df['Date']) #datetime index on training data

train_df.set_index('Date', inplace=True, drop=True)

In [16]:
new_df = pd.merge(train_df, 
                  daily_weather, 
                  left_on = train_df.index, right_on = daily_weather.index) #new_df = combined weather and train data

In [17]:
#new_df.shape, train_df.shape, weather_df.shape

In [18]:
new_df['key_0'] = pd.to_datetime(new_df['key_0'])

new_df.set_index('key_0', inplace=True, drop=True)

new_df.index.rename('Date', inplace=True) #assiging date as index of new_df 

In [19]:
new_df.drop(['Water1','SnowFall'], axis=1, inplace=True) # dropping these columns since they provided no info

In [20]:
mode_precip = float(new_df[new_df['PrecipTotal'] != '  T'].StnPressure.mode()[0]) # storing mode precip value for replacement in next step

In [21]:
precip_totals = []
for total in new_df.PrecipTotal:
    if total == '  T':
        precip_totals.append(mode_precip)
    else:
        precip_totals.append(total)

new_df.PrecipTotal = pd.to_numeric(precip_totals) # replacing '  T' with mode precip value

In [22]:
mode_pressure = new_df[new_df['StnPressure'] != 'M'].StnPressure.mode() # storing mode pressure for replacement in next step

In [23]:
pressures = []
for pressure in new_df.StnPressure:
    if pressure == 'M':
        pressures.append(mode_pressure)
    else:
        pressures.append(pressure)
pressures = [float(pressure) for pressure in pressures]

new_df.StnPressure = pd.to_numeric(pressures) # replacing 'M' with mode pressure value

In [24]:
pressures = [float(pressure) for pressure in pressures] # converting strings to floats

In [25]:
#new_df.dtypes
cols_to_change = ['Tavg',
                 'Depart',
                 'Cool',
                 'Sunrise',
                 'Sunset',
                 'Depth',
                 'PrecipTotal',
                 'StnPressure',
                 'SeaLevel',
                 'AvgSpeed'
                 ] # columns of type object that can be coerced to numeric values

In [26]:
for col in cols_to_change:
    new_df[col] = pd.to_numeric(new_df[col])#changing columns above to numeric

In [27]:
wnv1_df = new_df[new_df['WnvPresent'] == 1]

In [28]:
wnv0_df = new_df[new_df['WnvPresent'] == 0].sample(n = wnv1_df.shape[0], random_state = 42)

In [29]:
df = pd.concat([wnv1_df, wnv0_df], axis = 0)

In [30]:
num_df = df._get_numeric_data()

In [31]:
X = num_df.drop('WnvPresent', axis = 1)
y = num_df.WnvPresent

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .25)

In [33]:
ss = StandardScaler()

In [34]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [35]:
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [36]:
pca = PCA()

In [37]:
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [38]:
Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)

In [39]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

## Random Forrest on Numerical Data

In [40]:
rf = RandomForestClassifier(n_jobs=4)
rf_params = {'n_estimators':[8,10,12,14,20],
            'min_samples_split':[2,3,4],
            'random_state':[1,21,31,100]}
rf_gs = GridSearchCV(rf, rf_params, n_jobs=4)
rf_gs.fit(Z_train, y_train)
rf_gs.best_params_

{'min_samples_split': 4, 'n_estimators': 20, 'random_state': 31}

In [41]:
rf_preds = rf_gs.predict(Z_test)

In [42]:
precision_score(y_test, rf_preds), accuracy_score(y_test, rf_preds)

(0.7702702702702703, 0.7427536231884058)

## AdaBoost on Numerical Data

In [43]:
ada = AdaBoostClassifier()
ada_params = {'n_estimators':[30,40,50,60,80],
             'learning_rate':[.8,.4,.1],}
ada_gs = GridSearchCV(ada, ada_params, n_jobs=4)
ada_gs.fit(Z_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [30, 40, 50, 60, 80], 'learning_rate': [0.8, 0.4, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [44]:
ada_preds = ada_gs.predict(Z_test)

In [45]:
ada_gs.best_params_

{'learning_rate': 0.4, 'n_estimators': 60}

In [46]:
precision_score(y_test, ada_preds), accuracy_score(y_test, ada_preds)

(0.7283950617283951, 0.7210144927536232)

## Gradient Boost on Numerical Data

In [76]:
xgb = GradientBoostingClassifier()
xgb_params = {'learning_rate':[.01, .05, .1, .3, .5],
             'n_estimators':[50,80, 100, 120],
             'min_samples_split':[2,3,4]}
xgb_gs = GridSearchCV(xgb, xgb_params, n_jobs=4)
xgb_gs.fit(Z_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5], 'n_estimators': [50, 80, 100, 120], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [77]:
xgb_gs.best_params_

{'learning_rate': 0.05, 'min_samples_split': 4, 'n_estimators': 100}

In [78]:
xgb_preds = xgb_gs.predict(Z_test)

In [79]:
precision_score(y_test, xgb_preds), accuracy_score(y_test, xgb_preds)

(0.75, 0.7427536231884058)

## Trying same three models on dummied data

In [51]:
df.columns

Index(['Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Tmax', 'Tmin', 'Tavg', 'Depart',
       'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum',
       'Depth', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed'],
      dtype='object')

In [52]:
dummies_df = pd.get_dummies(df.drop(['Address','Street'], axis = 1))

In [53]:
X_dum = dummies_df.drop('WnvPresent', axis = 1)
y = dummies_df.WnvPresent

In [54]:
Xdum_train, Xdum_test, ydum_train, ydum_test = train_test_split(X_dum, y, test_size = .25)

In [55]:
ss_dum = StandardScaler()
ss_dum.fit(Xdum_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [56]:
Xdum_train = ss_dum.transform(Xdum_train)
Xdum_test = ss_dum.transform(Xdum_test)

In [57]:
pca_dum = PCA(.95)

In [58]:
pca_dum.fit(Xdum_train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [59]:
Zdum_train = pca_dum.transform(Xdum_train)
Zdum_test = pca_dum.transform(Xdum_test)

## Random Forrest on Dummy Data
### Random forest had highest accuracy of 78.6% but lower precision than AdaBoost using just numerical data

In [60]:
rf_dum = RandomForestClassifier(n_jobs=4)
rf_params = {'n_estimators':[8,10,12,14,20],
            'min_samples_split':[2,3,4],
            'random_state':[1,21,31,100]}
rf_dum_gs = GridSearchCV(rf_dum, rf_params, n_jobs=4)
rf_dum_gs.fit(Zdum_train, ydum_train)
rf_dum_gs.best_params_

{'min_samples_split': 2, 'n_estimators': 20, 'random_state': 1}

In [61]:
rf_dum_preds = rf_dum_gs.predict(Zdum_test)

In [62]:
precision_score(ydum_test, rf_dum_preds), accuracy_score(ydum_test, rf_dum_preds)

(0.6910569105691057, 0.6666666666666666)

## AdaBoost on Dummy Data

In [63]:
ada_dum = AdaBoostClassifier()
ada_dum_params = {'n_estimators':[30,40,50,60,80],
             'learning_rate':[.8,.4,.1],}
ada_dum_gs = GridSearchCV(ada_dum, ada_params, n_jobs=4)
ada_dum_gs.fit(Zdum_train, ydum_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [30, 40, 50, 60, 80], 'learning_rate': [0.8, 0.4, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [64]:
ada_dum_preds = ada_dum_gs.predict(Zdum_test)

In [65]:
ada_dum_gs.best_params_

{'learning_rate': 0.8, 'n_estimators': 40}

In [66]:
precision_score(ydum_test, ada_dum_preds), accuracy_score(ydum_test, ada_dum_preds)

(0.6808510638297872, 0.6811594202898551)

## Gradient Boost on Dummy Data

In [67]:
xgb_dum = GradientBoostingClassifier()
xgb_dum_params = {'learning_rate':[.01, .05, .1, .3, .5],
             'n_estimators':[50,80, 100, 120]}
xgb_dum_gs = GridSearchCV(xgb_dum, xgb_dum_params, n_jobs=4)
xgb_dum_gs.fit(Zdum_train, ydum_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5], 'n_estimators': [50, 80, 100, 120]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [68]:
xgb_dum_gs.best_params_

{'learning_rate': 0.05, 'n_estimators': 100}

In [69]:
xgb_dum_preds = xgb_dum_gs.predict(Zdum_test)

In [70]:
precision_score(ydum_test, xgb_dum_preds), accuracy_score(ydum_test, xgb_dum_preds)

(0.7007299270072993, 0.6956521739130435)

# Suzanne Pipeline

In [82]:
X = df.drop(columns = ['WnvPresent', 'Address', 
                           'Species', 'Street', 'Trap', 
                           'AddressNumberAndStreet', 'WetBulb'
                           , 'Heat', 'CodeSum'],
                            axis = 1)
y = df['WnvPresent']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.33, 
                                                   random_state = 42)

# Instantiating Preprocessing and Model 
lr = LogisticRegression()
ss = StandardScaler()

# Setting up Pipeline 

lr_pipe = Pipeline([
    ('ss', ss),
    ('lr', lr)
])

# Setting up Parameter Dictionary 
gs_lr_params = {
    'lr__penalty': ['l1', 'l2'], 
    'lr__C': [0.5, 1.0, 1.2]
}

# Instantiating and Fitting my Grid Search
gs_lr = GridSearchCV(lr_pipe, param_grid=gs_lr_params)
gs_lr.fit(X_train, y_train);

print("Best Params:", gs_lr.best_params_)
print("Best Train Score:", gs_lr.best_score_ )
print("Best Test Score:", gs_lr.score(X_test, y_test) )

Best Params: {'lr__C': 0.5, 'lr__penalty': 'l1'}
Best Train Score: 0.7330623306233063
Best Test Score: 0.7472527472527473
