# Computing a dataframe with empirical probalities of WNV | Trap # & species

In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
import time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.cluster import KMeans, k_means
from sklearn.metrics import silhouette_score

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 999)


%matplotlib inline

In [21]:
kaggle_train = pd.read_csv('../data/train.csv')
kaggle_test = pd.read_csv('../data/test.csv')


In [22]:
X_k_train = kaggle_train[['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'NumMosquitos','WnvPresent']].copy()
X_cols = list(X_k_train.columns.drop(['WnvPresent','NumMosquitos']))
X_cols

['Date', 'Species', 'Trap', 'Latitude', 'Longitude']

In [23]:
X_k_train['Date'] = pd.to_datetime(kaggle_train['Date'])

In [24]:
X_k_train.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,1,0
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,1,0
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,1,0
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,1,0
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,4,0


In [36]:
spec = list(set(X_k_train['Species']))
n = len(set(X_k_train['Trap']))

df = pd.DataFrame(0, index = range(904), columns = spec)

df.head()

Unnamed: 0,CULEX TARSALIS,CULEX TERRITANS,CULEX PIPIENS,CULEX RESTUANS,CULEX ERRATICUS,CULEX SALINARIUS,CULEX PIPIENS/RESTUANS
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [29]:
X_k_train['Trap'][0][1:4]

'002'

In [30]:
def strip_trap(trap_in):
    return (int(trap_in[1:4]))

In [32]:
X_k_train['Trap'] = X_k_train['Trap'].map(strip_trap)

In [35]:
max(X_k_train['Trap'])

903

In [37]:
def add_to_df(df_in, species, trap_in):
    df_in.loc[trap_in,species] += 1
    return(df_in)

for each in range(len(X_k_train)):  
    df = add_to_df(df, X_k_train.loc[each,'Species'], X_k_train.loc[each,'Trap'])


df.head()       # number of occurance of species / trap in the training data

Unnamed: 0,CULEX TARSALIS,CULEX TERRITANS,CULEX PIPIENS,CULEX RESTUANS,CULEX ERRATICUS,CULEX SALINARIUS,CULEX PIPIENS/RESTUANS
0,0,0,0,0,0,0,0
1,0,1,6,0,0,0,8
2,0,1,41,59,0,1,83
3,0,1,26,34,0,1,60
4,0,0,5,2,0,0,5


In [38]:
def add_to_df2(df_in, species, Trap, Wnv):
    df_in.loc[Trap,species] += Wnv
    return(df_in)

df2 = pd.DataFrame(0, index = range(904), columns = spec)
for each in range(len(X_k_train)):  
    df2 = add_to_df2(df2, X_k_train.loc[each,'Species'], X_k_train.loc[each,'Trap'],X_k_train.loc[each,'WnvPresent'])


df2.head()           # number of WNV Present occurances by Species and Number in trap

Unnamed: 0,CULEX TARSALIS,CULEX TERRITANS,CULEX PIPIENS,CULEX RESTUANS,CULEX ERRATICUS,CULEX SALINARIUS,CULEX PIPIENS/RESTUANS
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,7,0,0,0,11
3,0,0,5,0,0,0,9
4,0,0,0,0,0,0,0


In [39]:
df3 = df2.copy()
for i in range(len(df2)):
    for j in df2.columns:
        if df.loc[i,j] == 0:
            df3.loc[i,j] = 0
        else:
            df3.loc[i,j] = df2.loc[i,j] / df.loc[i,j]
            
df3.head()     # Empirical probabilities of WNV present given Species and Number of Mosquitos in trap

Unnamed: 0,CULEX TARSALIS,CULEX TERRITANS,CULEX PIPIENS,CULEX RESTUANS,CULEX ERRATICUS,CULEX SALINARIUS,CULEX PIPIENS/RESTUANS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.170732,0.0,0.0,0.0,0.13253
3,0.0,0.0,0.192308,0.0,0.0,0.0,0.15
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
df3.describe()

Unnamed: 0,CULEX TARSALIS,CULEX TERRITANS,CULEX PIPIENS,CULEX RESTUANS,CULEX ERRATICUS,CULEX SALINARIUS,CULEX PIPIENS/RESTUANS
count,904.0,904.0,904.0,904.0,904.0,904.0,904.0
mean,0.0,0.0,0.011517,0.001647,0.0,0.0,0.006614
std,0.0,0.0,0.042329,0.009998,0.0,0.0,0.025386
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.333333,0.125,0.0,0.0,0.214286


In [41]:
print('probabilities by species')
df2.sum()/df.sum()                     # probability of WNV given species

probabilities by species


CULEX TARSALIS            0.000000
CULEX TERRITANS           0.000000
CULEX PIPIENS             0.088922
CULEX RESTUANS            0.017883
CULEX ERRATICUS           0.000000
CULEX SALINARIUS          0.000000
CULEX PIPIENS/RESTUANS    0.055135
dtype: float64

In [46]:
CP = df3.sort_values(by = 'CULEX PIPIENS', ascending = False)[0:20].index

In [48]:
CR = df3.sort_values(by = 'CULEX RESTUANS', ascending = False)[0:20].index

In [49]:
CRP = df3.sort_values(by = 'CULEX PIPIENS/RESTUANS', ascending = False)[0:20].index

In [51]:
traps_with_highest_WNV = set(CP).union(set(CR), set(CRP))

In [53]:
len(traps_with_highest_WNV)

42

In [54]:
traps_with_highest_WNV


{2,
 3,
 5,
 6,
 9,
 11,
 13,
 14,
 15,
 16,
 27,
 28,
 35,
 45,
 49,
 61,
 82,
 86,
 89,
 90,
 95,
 96,
 97,
 107,
 142,
 143,
 154,
 160,
 215,
 221,
 223,
 225,
 226,
 227,
 228,
 230,
 231,
 232,
 233,
 235,
 900,
 903}

In [59]:
def put_t_back(obs_in):
    obs_in = str(obs_in)
    if len(obs_in) == 1:
        obs_out = 'T00'+obs_in
    if len(obs_in) == 2:
        obs_out = 'T0'+obs_in
    if len(obs_in) == 3:
        obs_out = 'T'+obs_in
    return(obs_out)

hotspot_traps = []
for each in traps_with_highest_WNV:
    hotspot_traps.append(put_t_back(each))

hotspot_traps

['T002',
 'T003',
 'T900',
 'T005',
 'T006',
 'T903',
 'T009',
 'T011',
 'T013',
 'T014',
 'T142',
 'T143',
 'T016',
 'T015',
 'T154',
 'T027',
 'T028',
 'T160',
 'T035',
 'T045',
 'T049',
 'T061',
 'T082',
 'T086',
 'T215',
 'T089',
 'T090',
 'T221',
 'T095',
 'T223',
 'T225',
 'T226',
 'T227',
 'T228',
 'T096',
 'T230',
 'T231',
 'T232',
 'T233',
 'T097',
 'T235',
 'T107']