In [15]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from transformers import *

p = '/Users/numlaut/dsi/projects/7.0 project 4/West_Nile_Kaggle/assets/input/'
spraydf =pd.read_csv(p+'spray.csv')
traindf =pd.read_csv(p+'train.csv')
coefdf = pd.read_csv(p+'train_w_coefs.csv')
ktestdf = pd.read_csv(p+'test.csv')
weatherdf = pd.read_csv(p+'weatherengineered.csv')

In [16]:
import time
def date_to_utc(x):
    return int(time.mktime(time.strptime(x,'%Y-%m-%d')))

def utc_to_date(x):
    return time.strftime('%Y-%m-%d',time.gmtime(x))

def submit(yhat,name='model'):
    submit = pd.DataFrame(yhat,columns=['WnvPresent'])
    submit.index +=1
    submit.index.name = 'Id'
    submit.to_csv(str('./results_'+name+'.csv'))
    return submit['WnvPresent'].value_counts()

In [17]:
print(traindf.columns)

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')


In [18]:
# date_to_utc defined above
utc = ColumnMapper(func=date_to_utc,column='date',name='utc',drop=False) # keep 'date' column and utc

In [19]:
## building weatherdf ##
#selection station 1 data
#mask = (weatherdf['Station']==1)

# compile with renaming
weather = weatherdf.rename(columns={'Date':'date'})

#weather processing
pipe = []
pipe.append(('utc',utc))

#transform w/pipe
prep_weather = Pipeline(pipe)
weather = prep_weather.fit_transform(weather)

In [20]:
weather.shape

(1472, 86)

In [21]:
## building train df ##

# define renamed columns
rn = {}
rn['Date']='date'
rn['Latitude']='lat'
rn['Longitude']='long'
rn['AddressNumberAndStreet']='address'
rn['Species']='species'

# define selected columns
feats = 'date address species lat long'.split()

# define y
target = 'WnvPresent'.split()

# compile with renaming
train = traindf.rename(columns=rn)[ feats + target ] #y included
ktest = ktestdf.rename(columns=rn)[feats]

In [22]:
train.shape

(10506, 6)

In [23]:
## define transformations ##

# loc
from collections import namedtuple
coord = namedtuple('coordinate',['lat','long'])
loc = ColumnApplier(lambda row: coord(row['lat'],row['long']), name='loc', axis=1)

# date_to_utc defined above
utc = ColumnMapper(func=date_to_utc,column='date',name='utc',drop=False) # keep 'date' column and utc

#merge w/weather from weatherdf by DATE
merge_weather = DfMerger(weather,on=['utc','date'],how='left',copy=True,validate='m:1')

#dummy trap locations
dummy_address = DummyEncoder(column='address')

#dummy species
dummy_species = DummyEncoder(column='species')

#drop stuff we hate
drop_us = 'lat long loc utc date 2007 2008 2009 2010 2011 2012 2013 2014'.split()
drop_columns = ColumnSelector(columns=drop_us,drop=True)

In [24]:
#feature prep
pipe = []
pipe.append(('loc',loc))
pipe.append(('utc',utc))
pipe.append(('merge_weather',merge_weather))
pipe.append(('dummy_species',dummy_species))
pipe.append(('dummy_address',dummy_address))
pipe.append(('drop_columns',drop_columns))
prep_features = Pipeline(pipe)

In [25]:
#build sets
t = prep_features.fit_transform(train) #for training
k = prep_features.transform(ktest) #for kaggle submission
Xk = k

#define X and y
y = t[target].values.ravel()
X = t.drop(target,axis=1)

feats = list(train.columns)#recatalogue cols for later use if necessary

#train test split
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,stratify=y)

AttributeError: 'DummyEncoder' object has no attribute 'dummies'

In [281]:
Xtrain.shape

(7879, 221)

In [292]:
train.columns

Index(['date', 'address', 'species', 'lat', 'long', 'WnvPresent', 'loc',
       'utc'],
      dtype='object')

In [297]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [384]:
def scores(model):
    try:
        print('ROC-AUC w/df: %f6, %f6'%
              (roc_auc_score(ytrain,model.decision_function(Xtrain)),
              roc_auc_score(ytest,model.decision_function(Xtest)))
             )
    except AttributeError:
        print('ROC-AUC w/ proba[1]: %f6, %f6'%
              (roc_auc_score(ytrain,model.predict_proba(Xtrain)),
              roc_auc_score(ytest,model.predict_proba(Xtest)))
             )
    pass

scores(lr)

ROC-AUC w/df: 0.8692996, 0.8376166


In [385]:
lr = LogisticRegression().fit(Xtrain,ytrain)
lr.score(Xtrain,ytrain)

scores(lr)

yhat = lr.predict_proba(Xk)[:,1]
catch = submit(yhat,'predictproba')

ROC-AUC w/df: 0.8719546, 0.8367136


In [476]:
lr = LogisticRegression(penalty='l2',C=.1).fit(Xtrain,ytrain)
lr.score(Xtrain,ytrain)

scores(lr)

yhat = lr.predict_proba(Xk)[:,1]
catch = submit(yhat,'l2-0-1')

ROC-AUC w/df: 0.8429066, 0.8177496


In [414]:
print(pd.DataFrame(abs(lr.coef_),columns=Xtrain.columns).T.sort_values(0,ascending=False).head(20))

                                              0
08                                     0.810293
4100  N OAK PARK AVE, Chicago, IL      0.586223
22                                     0.560808
1000  W OHARE AIRPORT, Chicago, IL     0.551954
CULEX PIPIENS                          0.539615
HighRisk                               0.517421
4600  N MILWAUKEE AVE, Chicago, IL     0.471284
CULEX TERRITANS                        0.465625
17                                     0.444994
8200  S KOSTNER AVE, Chicago, IL       0.427131
15                                     0.425523
21                                     0.390900
Thunderstorm                           0.390366
06                                     0.388504
29                                     0.388213
August                                 0.381017
27                                     0.378531
5800  N WESTERN AVE, Chicago, IL       0.372694
1000  S STONY ISLAND AVE, Chicago, IL  0.351656
18                                     0

In [387]:
lr = LogisticRegression(penalty='l1',C=1).fit(Xtrain,ytrain)
lr.score(Xtrain,ytrain)

scores(lr)

yhat = lr.predict_proba(Xk)[:,1]
catch = submit(yhat,'l1-1')

ROC-AUC w/df: 0.8692466, 0.8375786


In [388]:
lr = LogisticRegression(penalty='l1',C=.1).fit(Xtrain,ytrain)
lr.score(Xtrain,ytrain)

scores(lr)

yhat = lr.predict_proba(Xk)[:,1]
#catch = submit(yhat,'l2-0-1')

ROC-AUC w/df: 0.7996496, 0.7803326


In [389]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier().fit(Xtrain,ytrain)

In [426]:
print(roc_auc_score(ytrain,tree.predict(Xtrain)))
print(roc_auc_score(ytest,tree.predict(Xtest)))

0.845845151774404
0.5849767964551271


In [467]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_features=10,max_depth=21).fit(Xtrain,ytrain)
print(roc_auc_score(ytrain,tree.predict(Xtrain)))
print(roc_auc_score(ytest,tree.predict(Xtest)))

0.6159548792297479
0.5275865984243716


In [529]:
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
bag = BaggingClassifier(n_estimators=15,max_features=140,bootstrap_features=True).fit(Xtrain,ytrain)

In [527]:
bag.score(Xtrain,ytrain)
bag.score(Xtest,ytest)

yhat = bag.predict_proba(Xk)[:,1]
catch = submit(yhat,'bag_15est_140bootfeat')

In [528]:
print(roc_auc_score(ytrain,bag.predict(Xtrain)))
print(roc_auc_score(ytest,bag.predict(Xtest)))

0.5710267498373579
0.5310016245392772


In [564]:
forest = RandomForestClassifier(n_estimators=15,max_features=140,min_impurity_decrease=.00009).fit(Xtrain,ytrain)
print(roc_auc_score(ytrain,forest.predict(Xtrain)))
print(roc_auc_score(ytest,forest.predict(Xtest)))

yhat = bag.predict_proba(Xk)[:,1]
#catch = submit(yhat,'forest_15est_140feat_minimpur00009')

0.6552367504276043
0.5358373946815262


In [745]:
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout
from keras.utils.np_utils import to_categorical

In [572]:
Xtrain.shape[1]

221

In [719]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(Xtrain)

In [805]:
nn = Sequential()
nn.add(Dense(48, input_dim=Xtrain.shape[1]))
nn.add(Activation('relu'))
nn.add(Dropout(9))
#nn.add(Dense(2))
#nn.add(Activation('relu'))
#nn.add(Activation('relu'))
nn.add(Dense(2))
nn.add(Activation('sigmoid'))


nn.compile(optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['acc'])

In [806]:
nn.fit(ss.transform(Xtrain.values), to_categorical(ytrain), epochs=20, batch_size=32, validation_data=(ss.transform(Xtest.values),to_categorical(ytest)))

Train on 7879 samples, validate on 2627 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a4dfe7e48>

In [807]:
print(roc_auc_score(to_categorical(ytrain),nn.predict_proba(ss.transform(Xtrain))),roc_auc_score(to_categorical(ytest),nn.predict_proba(ss.transform(Xtest))))

0.8977193786975532 0.7715746385545676


In [803]:
yhat = nn.predict(ss.transform(Xk))
catch = submit(yhat[:,1],'nn_48l_2l_48dense_9drop')

In [798]:
yhat[:,1].shape

(116293,)

In [847]:
nn = Sequential()
nn.add(Dense(360, input_dim=Xtrain.shape[1]))
nn.add(Activation('relu'))
nn.add(Dropout(.50))
nn.add(Dense(48))
nn.add(Activation('relu'))
nn.add(Dropout(.50))
#nn.add(Dense(2))
#nn.add(Activation('relu'))
#nn.add(Activation('relu'))
nn.add(Dense(2))
nn.add(Activation('sigmoid'))


nn.compile(optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['acc'])

In [848]:
nn.fit(ss.transform(Xtrain.values), to_categorical(ytrain), epochs=50, batch_size=32, validation_data=(ss.transform(Xtest.values),to_categorical(ytest)))

Train on 7879 samples, validate on 2627 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a4fd7b898>

In [850]:
print(roc_auc_score(to_categorical(ytrain),nn.predict_proba(ss.transform(Xtrain))),roc_auc_score(to_categorical(ytest),nn.predict_proba(ss.transform(Xtest))))

0.9904630612773062 0.7628915052317151


In [851]:
yhat = nn.predict(ss.transform(Xk))
catch = submit(yhat[:,1],'nn_2l_360_48')