In [41]:
import pandas as pd
import numpy as np

# Trap Distance Matrices

In [42]:
trap_dist = pd.read_csv('./data/trap_distances.csv')

In [43]:
trap_dist.head()

Unnamed: 0,InputID,TargetID,Distance
0,T002,T051,16968.51836
1,T002,T047,12957.56269
2,T002,T043,13932.91805
3,T002,T013,3675.98196
4,T002,T219,10560.04005


In [44]:
trap_dist.rename(index=str, columns={'InputID':'Trap1', 'TargetID':'Trap2'});

In [45]:
trap_dist['Distance'].describe()

count    22650.000000
mean     16578.515328
std       9675.228351
min        107.751410
25%       8733.918890
50%      14982.559290
75%      23487.790640
max      47231.600130
Name: Distance, dtype: float64

In [46]:
trap_dist['neighbor'] = np.where(trap_dist['Distance'] < 1500, 1, 0)

In [47]:
trap_dist['neighbor'].value_counts()

0    22454
1      196
Name: neighbor, dtype: int64

In [48]:
trap_dist.shape

(22650, 4)

In [49]:
neighbors = trap_dist[trap_dist['neighbor'] == 1]

In [50]:
neighbors.head()

Unnamed: 0,InputID,TargetID,Distance,neighbor
79,T002,T002B,1023.77361,1
210,T007,T236,138.5751,1
328,T015,T009,1322.86214,1
523,T045,T232,1012.82312,1
648,T046,T049,576.40827,1


# Build trap history table 

In [51]:
df = pd.read_csv('./data/train.csv')
df.drop(['Address', 'Block', 'Street', 'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy', 'Species'], inplace = True, axis = 1)
df.sort_values('Trap', inplace=True)

df_history = df.groupby(['Trap', 'Date'])['NumMosquitos', 'WnvPresent'].sum()
df_history.sort_index(inplace=True)

df.head()

Unnamed: 0,Date,Trap,NumMosquitos,WnvPresent
359,2007-07-11,T001,1,0
2269,2007-08-21,T001,1,0
3102,2007-09-12,T001,1,0
1896,2007-08-15,T001,5,0
1895,2007-08-15,T001,5,0


In [52]:
traps = pd.read_csv('./data/trap_lists.csv')
traps.drop(['field_1', 'Latitude', 'Longitude'], axis = 1, inplace = True)
traps.sort_values('Trap', inplace=True)
traps.drop_duplicates(inplace=True)
print(traps.shape)
traps.head()

(149, 1)


Unnamed: 0,Trap
44,T001
0,T002
140,T002A
141,T002B
45,T003


In [53]:
extras_list = ['T002A',  'T002B',  'T065A', 'T090A', 'T090B', 'T090C', 'T128A',
               'T200A', 'T200B', 'T218', 'T218A', 'T218B', 'T218C', 'T234']

In [58]:
df_history.loc['T001'].shape

(10, 2)

In [59]:
def make_trap_history(trap):
    trap_dict = {}
    
    if trap in extras_list:
        one_hot = 0
        percent = 0
        avg = 0
        total_obs = 0
    
    else:
        #base values
        total_mosq = df_history.loc[trap]['NumMosquitos'].sum()
        total_pos = df_history.loc[trap]['WnvPresent'].sum()
        total_obs = df_history.loc[trap].shape[0]
        
        #one hot val
        if total_pos > 0:
            one_hot = 1
        else:
            one_hot = 0
        

        #percent positive val
        percent = total_pos / total_obs


        #average mosquitos per check
        avg = total_mosq / total_obs
        
    trap_dict['any_positive'] = one_hot
    trap_dict['percent_pos'] = percent
    trap_dict['avg_mosq_per_obs'] = avg
    trap_dict['n_obs'] = total_obs
    
    return trap_dict

In [60]:
traps_list = list(traps['Trap'])
dict_list = []
for trap in traps_list:
    dict_list.append(make_trap_history(trap))

df_trap_history = pd.DataFrame(dict_list, index = traps_list)
df_trap_history['trap_id'] = df_trap_history.index

print(df_trap_history.shape)
df_trap_history.sort_index(inplace=True)
df_trap_history.head(19)

(149, 5)


Unnamed: 0,any_positive,avg_mosq_per_obs,n_obs,percent_pos,trap_id
T001,0,2.5,10,0.0,T001
T002,1,55.373134,67,0.268657,T002
T002A,0,0.0,0,0.0,T002A
T002B,0,0.0,0,0.0,T002B
T003,1,21.03125,64,0.21875,T003
T004,0,6.555556,9,0.0,T004
T005,1,5.555556,9,0.222222,T005
T006,1,7.133333,15,0.266667,T006
T007,0,5.133333,15,0.0,T007
T008,1,44.466667,60,0.166667,T008


In [61]:
df_trap_history[80:100]

Unnamed: 0,any_positive,avg_mosq_per_obs,n_obs,percent_pos,trap_id
T096,1,34.0,17,0.352941,T096
T097,1,14.333333,9,0.111111,T097
T099,0,4.6,30,0.0,T099
T100,0,21.727273,11,0.0,T100
T102,1,15.175439,57,0.035088,T102
T103,1,249.0,12,0.5,T103
T107,1,7.166667,12,0.166667,T107
T114,1,36.627119,59,0.152542,T114
T115,1,328.30303,66,0.621212,T115
T128,1,58.157895,57,0.192982,T128


In [62]:
make_trap_history('T009')

{'any_positive': 1,
 'percent_pos': 0.16666666666666666,
 'avg_mosq_per_obs': 35.925925925925924,
 'n_obs': 54}

# Neighbor Score

In [63]:
neighbors.sort_values('InputID', inplace=True)
print(neighbors.shape)
neighbors.head()

(196, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,InputID,TargetID,Distance,neighbor
6620,T001,T218,671.40894,1
6693,T001,T218B,857.80845,1
6695,T001,T218A,281.66476,1
6676,T001,T228,766.19898,1
79,T002,T002B,1023.77361,1


In [64]:
neighbor_lookup = neighbors.groupby(['InputID', 'TargetID'])['Distance', 'neighbor'].sum()
neighbor_lookup.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Distance,neighbor
InputID,TargetID,Unnamed: 2_level_1,Unnamed: 3_level_1
T001,T218,671.40894,1
T001,T218A,281.66476,1
T001,T218B,857.80845,1
T001,T228,766.19898,1
T002,T002B,1023.77361,1


In [65]:
df_trap_history.loc['T001']['avg_mosq_per_obs']

2.5

In [68]:
has_neighbor_list = list(neighbors.InputID.unique())

def neighbor_history_lookup(trap):
    neighbor_score = 0
    if trap in has_neighbor_list: 
        neighbors = list(neighbor_lookup.loc[trap].index)
        for n in neighbors:
            distance = neighbor_lookup.loc[trap, n]['Distance']
            weight = 1 / distance

            pos_value = 10 * df_trap_history.loc[n]['any_positive']
            neigh_onehot_score = weight * pos_value

            mosq_value = df_trap_history.loc[n]['avg_mosq_per_obs']
            neigh_mosq_score = mosq_value * weight

            posrate_value = df_trap_history.loc[n]['percent_pos']
            neigh_posrate_score = 100 *posrate_value * weight

    else:
        neigh_mosq_score = 0
        neigh_onehot_score = 0
        neigh_posrate_score = 0
    
    scores = (neigh_mosq_score, neigh_onehot_score, neigh_posrate_score)
    return scores

In [70]:
neighbor_history_lookup('T001')

(0.02797358635655366, 0.013051439979729546, 0.02610287995945909)

In [71]:
neigh_mosq_scores = []
neigh_onehot_scores = []
neigh_posrate_scores = []
for trap in traps_list:
    scores = neighbor_history_lookup(trap)
    print(scores)
    neigh_mosq_scores.append(scores[0])
    neigh_onehot_scores.append(scores[1])
    neigh_posrate_scores.append(scores[2])
    
print(len(neigh_mosq_scores), len(neigh_onehot_scores), len(neigh_posrate_scores))   
    

(0.02797358635655366, 0.013051439979729546, 0.02610287995945909)
(0.0, 0.0, 0.0)
(0.0039902348770579745, 0.00687971530527237, 0.00687971530527237)
(0.01380791679622928, 0.007225072742212996, 0.004013929301229442)
(0.008120317216069423, 0.0, 0.0)
(0.0028538054852942545, 0.0, 0.0)
(0.0, 0.0, 0.0)
(0.01450196704176538, 0.010088324898619395, 0.037831218369822735)
(0.14242704573378015, 0.07216303650511528, 0.037980545529008036)
(0, 0, 0)
(0.028897758808936972, 0.009023500018403427, 0.02030287504140771)
(0, 0, 0)
(0.003698080642729703, 0.0, 0.0)
(0.013129598965958854, 0.011517192075402503, 0.007678128050268335)
(0.021785756375202796, 0.006802734231132801, 0.015306152020048803)
(0.02715772478447824, 0.007559366692586727, 0.012598944487644544)
(0.0759714295307361, 0.025056540082696605, 0.0701583122315505)
(0.0069109123699528634, 0.011915366155091143, 0.011915366155091143)
(0.003091243176011688, 0.0, 0.0)
(0.00714312102657627, 0.0, 0.0)
(0.01200975117112609, 0.007190712360863311, 0.005046113937

In [72]:
df_trap_history.shape

(149, 5)

In [73]:
df_trap_history['neighbor_mosq_score'] = neigh_mosq_scores
df_trap_history['neighbor_onehot_score'] = neigh_onehot_scores
df_trap_history['neighbor_posrate_score'] = neigh_posrate_scores

In [74]:
df_trap_history.drop('trap_id', inplace=True, axis =1)


In [75]:
df_trap_history['neighbor_onehot_score'].sum()

0.9371677581793686

In [76]:
df_trap_history.drop_duplicates(inplace=True)
df_trap_history.to_csv('./data/trap_history.csv')