In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from six import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [4]:
from our_functions import display_acc_and_f1_score

In [5]:
# Load the cleaned dataset from a path relative to the notebook's working directory.
# NOTE(review): df.info() below lists an 'Unnamed: 0' column, which looks like a row
# index written out by an earlier to_csv call; it is kept in X as a feature — confirm
# that this is intended.
df = pd.read_csv('data/cleaned_data.csv')

In [6]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0.1,Unnamed: 0,sample_pk,commod,commtype,lab,pestcode,lod,annotate,avg_detect,extract,determin,concentration
0,50238,239,AJ,RE,WA1,083,0.005,Q,O,805,35,80000.0
1,249096,1183,BR,FR,FL1,AFU,0.01,V,O,805,52,110000.0
2,251475,1196,BR,FR,FL1,AFU,0.01,V,O,805,52,130000.0
3,257567,1230,BR,FR,FL1,144,0.005,V,O,805,35,350000.0
4,264693,1269,BR,FR,FL1,180,0.01,V,O,805,52,260000.0


In [7]:
# Check dtypes and non-null counts for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4887 entries, 0 to 4886
Data columns (total 12 columns):
Unnamed: 0       4887 non-null int64
sample_pk        4887 non-null int64
commod           4887 non-null object
commtype         4887 non-null object
lab              4887 non-null object
pestcode         4887 non-null object
lod              4887 non-null float64
annotate         4887 non-null object
avg_detect       4887 non-null object
extract          4887 non-null int64
determin         4887 non-null int64
concentration    4887 non-null float64
dtypes: float64(2), int64(4), object(6)
memory usage: 458.2+ KB


## Split data

In [8]:
# Target is the `annotate` column; the features are every other column of df.
y = df['annotate']
X = df.drop(columns=['annotate'])

In [9]:
# Hold out 20% of the rows; the fixed random_state makes the split reproducible.
# In the cells visible here, X_train / X_test are not used again and
# y_train / y_test are reassigned by the later split of the encoded data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=20,
)

## One-hot-encode data. Note that splitting the data before encoding left the train and test sets with different sets of categories, so their encoded dimensions did not match; we therefore had to one-hot-encode first.

In [10]:
# Object-dtype feature columns that get one-hot encoded.
categoricals = [
    'commod',
    'commtype',
    'lab',
    'pestcode',
    'avg_detect',
]

In [11]:
# Fit on the categorical columns of the full X (before any train/test split) so
# every category is known to the encoder. drop='first' omits the dummy column
# for the first category of each feature.
encoder = OneHotEncoder(categories='auto', drop='first').fit(X[categoricals])

In [12]:
# Show the category values learned for each encoded column, in `categoricals` order.
encoder.categories_

[array(['AJ', 'BR', 'BU', 'CE', 'CF', 'CR', 'GB', 'GJ', 'IA', 'MU', 'NE',
        'PC', 'PU', 'RS', 'RZ', 'SS', 'WG', 'WR', 'WS', 'WU'], dtype=object),
 array(['FR', 'FZ', 'PU', 'RE'], dtype=object),
 array(['CA1', 'FL1', 'MI1', 'MN1', 'MT1', 'NY1', 'OH1', 'TX1', 'WA1'],
       dtype=object),
 array(['014', '024', '026', '028', '032', '034', '042', '052', '070',
        '083', '114', '125', '129', '134', '143', '144', '149', '151',
        '157', '159', '160', '165', '170', '171', '172', '173', '178',
        '180', '200', '204', '222', '223', '230', '237', '249', '255',
        '264', '283', '305', '318', '333', '351', '382', '387', '512',
        '537', '539', '540', '597', '604', '607', '612', '624', '626',
        '633', '651', '666', '679', '699', '701', '714', '717', '718',
        '720', '731', '758', '780', '781', '784', '785', '808', '900',
        '901', '902', '906', '907', '908', '910', '911', '930', '942',
        '945', '948', '956', '964', '967', 'A25', 'A30', 'A42', 'A5

In [13]:
# Column labels of the form `<column>_<category>`.
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out, so use whichever this installation provides.
if hasattr(encoder, 'get_feature_names_out'):
    ohe_columns = encoder.get_feature_names_out(categoricals)
else:
    ohe_columns = encoder.get_feature_names(categoricals)

# Dense one-hot frame; it carries X's index explicitly so the later
# X.join(ohe) lines up row-for-row instead of relying on both frames
# happening to share a default RangeIndex.
ohe = pd.DataFrame(encoder.transform(X[categoricals]).toarray(),
                   index=X.index,
                   columns=ohe_columns)
ohe.head()

Unnamed: 0,commod_BR,commod_BU,commod_CE,commod_CF,commod_CR,commod_GB,commod_GJ,commod_IA,commod_MU,commod_NE,...,pestcode_B64,pestcode_B72,pestcode_B75,pestcode_B77,pestcode_B79,pestcode_B80,pestcode_B82,pestcode_B84,pestcode_B85,avg_detect_R
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Confirm the size and dtypes of the encoded frame.
ohe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4887 entries, 0 to 4886
Columns: 179 entries, commod_BR to avg_detect_R
dtypes: float64(179)
memory usage: 6.7 MB


In [15]:
# Rebuild X from df instead of from X itself so this cell can be re-run safely:
# `X = X.join(ohe).drop(...)` consumed its own output, and a second execution
# failed because the dummy columns were already present in X.
# The result is unchanged: the non-categorical feature columns followed by the
# one-hot columns.
X = df.drop(['annotate'] + categoricals, axis=1).join(ohe)
X.head()

Unnamed: 0.1,Unnamed: 0,sample_pk,lod,extract,determin,concentration,commod_BR,commod_BU,commod_CE,commod_CF,...,pestcode_B64,pestcode_B72,pestcode_B75,pestcode_B77,pestcode_B79,pestcode_B80,pestcode_B82,pestcode_B84,pestcode_B85,avg_detect_R
0,50238,239,0.005,805,35,80000.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,249096,1183,0.01,805,52,110000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,251475,1196,0.01,805,52,130000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,257567,1230,0.005,805,35,350000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,264693,1269,0.01,805,52,260000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
features = encoder.get_feature_names()
# `e_X` was not defined in any cell visible in this notebook, so this cell raised
# NameError. Build the dense encoded matrix here; it is taken from df because X
# no longer holds the raw categorical columns at this point.
e_X = encoder.transform(df[categoricals]).toarray()
one_hot_encoded_frame = pd.DataFrame(e_X, columns=features)

In [None]:
# Check the size and dtypes of the encoded frame.
one_hot_encoded_frame.info()

In [None]:
# Show the last rows with the index exposed as an ordinary column.
one_hot_encoded_frame.reset_index().tail()

In [None]:
# By this point X already contains the one-hot columns and no longer has the raw
# categorical columns (see the earlier `X = ... .join(ohe)` cell). Dropping them
# unconditionally raised KeyError, and joining the dummies a second time would
# duplicate every encoded feature.
# errors='ignore' makes the drop a no-op when the columns are already gone.
X_dropped = X.drop(categoricals, axis=1, errors='ignore')
encoded_X = X_dropped

In [None]:
# Replace any missing values with 0 before modelling.
encoded_X = encoded_X.fillna(0)

In [None]:
# Check the final feature matrix's size and dtypes.
encoded_X.info()

## Split encoded data

In [None]:
# 80/20 split of the encoded features, using the same test_size and random_state
# as the earlier split of the raw frame. y_train / y_test are reassigned here.
enc_X_train, enc_X_test, y_train, y_test = train_test_split(
    encoded_X,
    y,
    test_size=0.20,
    random_state=20,
)

## Nearest neighbors model

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# NOTE(review): the features are passed unscaled (StandardScaler is imported but
# not used in the visible cells), so large-valued columns such as `concentration`
# will likely dominate the distance computation — confirm whether scaling is wanted.
knc = KNeighborsClassifier()
knc.fit(X=enc_X_train, y=y_train)

In [None]:
# Inspect the training labels.
y_train

In [None]:
# Predicted labels for the held-out test rows.
k_preds = knc.predict(enc_X_test)

In [None]:
# Inspect the true test labels for comparison with the predictions.
y_test

In [None]:
# Inspect the k-NN predictions.
k_preds

In [None]:
# Mean accuracy of the k-NN model on the test split.
knc.score(enc_X_test, y_test)

## Logistic regression model

In [None]:
import statsmodels.api as sm

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Multinomial (softmax) logistic regression fitted with the L-BFGS solver on the
# encoded training split.
mul_r = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
)
mul_r.fit(X=enc_X_train, y=y_train)

In [None]:
# Mean accuracy of the logistic regression on the test split.
mul_r.score(enc_X_test, y_test)

In [None]:
# Mean accuracy on the training split, for comparison with the test score.
mul_r.score(enc_X_train, y_train)

In [None]:
# Predicted labels for the test split.
mul_r.predict(enc_X_test)

In [None]:
# Predicted labels for the training split.
mul_r.predict(enc_X_train)

In [None]:
# NOTE(review): only y_train is passed here (the trailing comma adds no argument).
# The helper is imported from our_functions and its signature is not visible in this
# notebook; it presumably also needs the model's predictions — TODO confirm and
# complete the call.
display_acc_and_f1_score(y_train, )

In [None]:
# Report train and test accuracy of the multinomial logistic regression.
# Fixes relative to the previous version of this cell:
#   * Python 2 `print` statements are a SyntaxError under Python 3;
#   * `metrics` was never imported, so use the `accuracy_score` function that the
#     import cell already brings in;
#   * `mul_lr` was a typo for the fitted model `mul_r`.
train_accuracy = accuracy_score(y_train, mul_r.predict(enc_X_train))
test_accuracy = accuracy_score(y_test, mul_r.predict(enc_X_test))
print("Multinomial Logistic regression Train Accuracy :: ", train_accuracy)
print("Multinomial Logistic regression Test Accuracy :: ", test_accuracy)