# Problem Statement

__The intention of the project is to analyze weather data and GIS data and to predict whether or not West Nile virus is present for a given time, location, and species.__

* Dataset: mosquito trap results, weather data, spray operations
* Initial features: date, location, species, distance from spray, time elapsed since the last spray, distance from weather station, weather conditions at closest station
* Target: boolean variable indicating whether infectious mosquitos are present
* Success criteria: finish :)

In [76]:
import pandas as pd
from geopy.distance import vincenty
import numpy as np
import datetime as dt

In [123]:
# Load the training and test tables from the input folder.
df_train, df_test = map(
    pd.read_csv,
    ('west_nile/input/train.csv', 'west_nile/input/test.csv'),
)

In [124]:
# Display the first row of the training frame to inspect its columns.
df_train.head(1)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [125]:
# Reference coordinates of the two weather stations, as [latitude, longitude];
# weather_station() measures each row's distance to these two points.
station_1 = [41.995, -87.933]
station_2 = [41.786, -87.752]

In [133]:
def weather_station(row):
    """Return the number (1 or 2) of the weather station closest to a row.

    Parameters
    ----------
    row : pandas.Series
        One row of the train or test frame. It must carry ``Latitude`` and
        ``Longitude`` entries.
        NOTE(review): the test frame's columns are not shown in this
        notebook; assumed to use the same two column names -- confirm.

    Returns
    -------
    int
        2 when station 2 is strictly closer than station 1, otherwise 1
        (ties go to station 1).
    """
    # Look the coordinates up by column name. Positional lookups (row[8],
    # row[9]) only hit Latitude/Longitude for one specific column layout and
    # silently read other columns when the layout differs.
    location = [row['Latitude'], row['Longitude']]
    if vincenty(location, station_1).miles > vincenty(location, station_2).miles:
        return 2
    return 1

In [132]:
# Display the first five rows of the training frame.
# NOTE(review): the recorded output already contains a weather_station column,
# so this cell was last executed after the assignment cell further down.
df_train.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,weather_station
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,1
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,1
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,1
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,1
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,1


In [127]:
# Tag every training row with the number (1 or 2) of its nearest weather station.
df_train['weather_station'] = df_train.apply(weather_station, axis=1)

In [134]:
# Tag every test row with the number (1 or 2) of its nearest weather station.
df_test['weather_station'] = df_test.apply(weather_station, axis=1)

In [9]:
# Load the spray records (one row per spray point: date, time, latitude,
# longitude) and preview the first rows.
df_sprays = pd.read_csv('west_nile/input/spray.csv')
df_sprays.head()

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


In [10]:
# Average coordinates per trap, indexed by trap id. The two columns are
# selected with a list (double brackets): selecting several groupby columns
# with a bare tuple of names is rejected by recent pandas versions.
groupped_traps = df_train.groupby(['Trap'])[['Latitude', 'Longitude']].mean()

In [11]:
# Display the first three traps with their averaged coordinates.
groupped_traps.head(3)

Unnamed: 0_level_0,Latitude,Longitude
Trap,Unnamed: 1_level_1,Unnamed: 2_level_1
T001,41.953705,-87.733974
T002,41.95469,-87.800991
T003,41.964242,-87.757639


### assigning sprays to traps

# Trap ids, in the order they appear in the per-trap coordinate table.
trap_list = list(groupped_traps.index)

def spray_distance(row):
    """Flag whether a spray record lies within half a mile of the current trap.

    The trap position is read from the module-level variables ``trap_lat``
    and ``trap_lon``, which the calling loop sets before each
    ``DataFrame.apply``.

    Parameters
    ----------
    row : pandas.Series
        One row of ``df_sprays``, carrying ``Latitude`` and ``Longitude``.

    Returns
    -------
    int
        1 when the spray point is less than 0.5 miles from the trap, else 0.
    """
    # Read the coordinates by column name rather than by position, so the
    # lookup does not depend on column order or on pandas' deprecated
    # positional fallback for label-indexed Series.
    spray_loc = [row['Latitude'], row['Longitude']]
    trap_loc = [trap_lat, trap_lon]
    return 1 if vincenty(spray_loc, trap_loc).miles < 0.5 else 0

# Build one 0/1 column per trap: publish the trap's coordinates through the
# globals that spray_distance reads, then score every spray record against it.
for x in trap_list:
    trap_lat = groupped_traps.at[x, 'Latitude']
    trap_lon = groupped_traps.at[x, 'Longitude']
    df_sprays[x] = df_sprays.apply(spray_distance, axis=1)

# Persist the spray/trap proximity table.
df_sprays.to_csv('sprays_traps.csv')

In [44]:
# Load the spray/trap proximity table from 'sprays_traps.csv'.
df_spraytrap = pd.read_csv('sprays_traps.csv')

In [45]:
# Display the first row of the training frame.
df_train.head(1)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,weather_station
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,1


In [46]:
# Remove the 'Unnamed: 0' column in place: it is the old row index that
# to_csv wrote out and read_csv loaded back as a regular column.
df_spraytrap.drop('Unnamed: 0', axis=1, inplace=True)

In [47]:
# Display the first row of the spray/trap table.
df_spraytrap.head(1)

Unnamed: 0,Date,Time,Latitude,Longitude,T001,T002,T003,T004,T005,T006,...,T230,T231,T232,T233,T235,T236,T237,T238,T900,T903
0,2011-08-29,6:56:58 PM,42.391623,-88.089163,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Use the spray date as the row index (modifies the frame in place).
df_spraytrap.set_index('Date', inplace=True)

In [49]:
# Parse the index into datetimes so the table can be resampled by day later.
df_spraytrap.index = pd.to_datetime(df_spraytrap.index)

In [50]:
# Order the rows chronologically (in place) and preview the first two.
df_spraytrap.sort_index(inplace=True, ascending=True)
df_spraytrap.head(2)

Unnamed: 0_level_0,Time,Latitude,Longitude,T001,T002,T003,T004,T005,T006,T007,...,T230,T231,T232,T233,T235,T236,T237,T238,T900,T903
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-08-29,6:56:58 PM,42.391623,-88.089163,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2011-08-29,6:57:08 PM,42.391348,-88.089163,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Drop the raw spray time and coordinates in place; only the remaining
# (per-trap) columns are kept.
df_spraytrap.drop(['Time', 'Latitude', 'Longitude'], axis=1, inplace=True)

In [52]:
# Collapse the table to one row per calendar day, keeping each column's daily
# maximum. Days that have no rows at all come out as NaN.

df_spraytrap = df_spraytrap.resample('D').max()

In [53]:
# Replace the NaN values with 0 (in place).
df_spraytrap.replace(np.nan, 0, inplace=True)

In [111]:
# Computing, for every day and trap, the number of days elapsed since the
# trap was last sprayed. Start by rebuilding the list of trap ids.

trap_list = list(groupped_traps.index)

def lap_spray(row):
    """Return the number of days between a row's date and trap ``x``'s last spray.

    Reads two module-level names: ``df_spraytrap`` (the date-indexed table)
    and ``x`` (the trap column currently being processed).

    Parameters
    ----------
    row : pandas.Series
        One row of ``df_spraytrap``; ``row.name`` is that row's date.

    Returns
    -------
    int
        Days between the row's date and the last date in index order, not
        later than the row's date, on which column ``x`` equals 1. When there
        is no such date, 900 plus the number of days since the first date in
        the table.
    """
    sprayed_on = df_spraytrap[df_spraytrap[x] == 1].index
    today = row.name
    first_day = df_spraytrap.index[0]
    # Fallback used when the trap has no spray on or before this date.
    fallback = 900 + (today - first_day).days
    # Spray dates that are not after this row's date, kept in index order.
    earlier = [day for day in sprayed_on if not today < day]
    # The last qualifying date wins (the most recent one for a sorted index).
    return (today - earlier[-1]).days if earlier else fallback

# Overwrite each trap's column with the day counts returned by lap_spray.
# The loop variable has to be named x because lap_spray reads it as a global;
# a column is only replaced after apply() has finished reading it.
for x in trap_list:
    df_spraytrap[x] = df_spraytrap.apply(lap_spray, axis=1)
    
# Save the resulting per-day, per-trap table.
df_spraytrap.to_csv('sprays_traps_laps.csv')

In [114]:
# Display the table of day counts per date and trap.
df_spraytrap

Unnamed: 0_level_0,T001,T002,T003,T004,T005,T006,T007,T008,T009,T011,...,T230,T231,T232,T233,T235,T236,T237,T238,T900,T903
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-08-29,900,900,900,900,900,900,900,900,900,900,...,900,900,900,900,900,900,900,900,900,900
2011-08-30,901,901,901,901,901,901,901,901,901,901,...,901,901,901,901,901,901,901,901,901,901
2011-08-31,902,902,902,902,902,902,902,902,902,902,...,902,902,902,902,902,902,902,902,902,902
2011-09-01,903,903,903,903,903,903,903,903,903,903,...,903,903,903,903,903,903,903,903,903,903
2011-09-02,904,904,904,904,904,904,904,904,904,904,...,904,904,904,904,904,904,904,904,904,904
2011-09-03,905,905,905,905,905,905,905,905,905,905,...,905,905,905,905,905,905,905,905,905,905
2011-09-04,906,906,906,906,906,906,906,906,906,906,...,906,906,906,906,906,906,906,906,906,906
2011-09-05,907,907,907,907,907,907,907,907,907,907,...,907,907,907,907,907,907,907,907,907,907
2011-09-06,908,908,908,908,908,908,908,908,908,908,...,908,908,908,908,908,908,908,908,908,908
2011-09-07,909,909,909,909,909,0,909,909,909,909,...,909,909,909,909,909,909,909,909,909,909


In [115]:
# Display the first five rows of the weather table.
# NOTE(review): df_weather is not assigned in any cell visible here, so a
# top-to-bottom run would fail at this point -- confirm where it is loaded.
df_weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [135]:
# Save the training frame (now including the weather_station column) to disk.
df_train.to_csv('train2.csv')

In [136]:
# Save the test frame (now including the weather_station column) to disk.
df_test.to_csv('test2.csv')