In [1]:
import pandas as pd
import numpy as np
import scipy.stats.stats as stats
import pandas.core.algorithms as algos
from sklearn import preprocessing, model_selection
from library.sb_utils import save_file
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#read the cleaned trap data (CSV, path relative to this notebook) into pandas
filepath = '../data/data_cleaned.csv'
df = pd.read_csv(filepath)
#print column names, non-null counts and dtypes as a quick sanity check of the load
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 81 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Species                10506 non-null  object 
 1   NumMosquitos           10506 non-null  int64  
 2   WnvPresent             10506 non-null  int64  
 3   IsDowntownCore         10506 non-null  int64  
 4   IsNorthWestZone        10506 non-null  int64  
 5   IsNorthEastZone        10506 non-null  int64  
 6   TrapZone               10506 non-null  object 
 7   IsInSeason             10506 non-null  int64  
 8   Month                  10506 non-null  int64  
 9   Week                   10506 non-null  int64  
 10  IsPeakSeason           10506 non-null  int64  
 11  Tmax                   10506 non-null  int64  
 12  Tmin                   10506 non-null  int64  
 13  Tavg                   10506 non-null  int64  
 14  Tdepart                10506 non-null  int64  
 15  Td

In [3]:
#define the categorical and continuous variables
#categorical_columns: features handled as discrete levels; later in this notebook they are
#grouped value-by-value for WOE (discrete_bins) and one-hot encoded
categorical_columns = ['Species','IsDowntownCore','IsNorthWestZone','IsNorthEastZone','TrapZone','IsInSeason','Month',
           'Week', 'IsPeakSeason', 'IsSprayed', 'IsOptimalTemp','PrecipConditions','PrecipWeekly_Score',
           'MoistureConditions','WindConditions','InSprayBounds', 'IsDayAfterSpray','IsNearSprayZone','IsRecentlySprayed',
           'WeatherCode_BR','WeatherCode_DZ','WeatherCode_HZ','WeatherCode_NONE','WeatherCode_RA','WeatherCode_TS',
           'WeatherCode_TSRA']

#continuous_columns: numeric features; later in this notebook they are quantile-binned for WOE
#(continuous_bins) and standardised with StandardScaler
continuous_columns = ['PrecipWeekly', 'Tavg', 'Tmax','Tmin','Tdepart','Tdew_point','Twet_bulb','HeatDegDay','CoolDegDay', 
           'PrecipTotal','NumMosquitos', 'DaysSincePrecip', 'DaylightMinutes', 'RelHumidity', 'Wind_AvgSpeed', 
           'TrapSprayDistance', 'DaysSinceSpray','Prev_Check','Tavg_7days', 'Tavg_14days', 'Tavg_21days', 
           'Tavg_28days','Wind_AvgSpeed_7days','Wind_AvgSpeed_14days','Wind_AvgSpeed_21days', 'Wind_AvgSpeed_28days',
           'PrecipTotal_7days', 'PrecipTotal_14days', 'PrecipTotal_21days', 'PrecipTotal_28days', 'RelHumidity_7days', 
           'RelHumidity_14days','RelHumidity_21days', 'RelHumidity_28days', 'Tavg_lag7', 'Tavg_lag14','Tavg_lag21', 
           'Tavg_lag28', 'Wind_AvgSpeed_lag7', 'Wind_AvgSpeed_lag14','Wind_AvgSpeed_lag21', 'Wind_AvgSpeed_lag28', 
           'PrecipTotal_lag7','PrecipTotal_lag14', 'PrecipTotal_lag21', 'PrecipTotal_lag28','DaylightMinutes_lag7', 
           'DaylightMinutes_lag14','DaylightMinutes_lag21', 'DaylightMinutes_lag28', 'RelHumidity_lag7',
           'RelHumidity_lag14', 'RelHumidity_lag21', 'RelHumidity_lag28']

## Weight of Evidence

In [4]:
#get the bins for continuous variables
def continuous_bins(X,y,max_bins=20,min_bins=2):
    """Bin a continuous feature so that the per-bin target mean is monotonic.

    Starting from ``max_bins`` quantile bins, the number of bins is reduced one
    at a time until the Spearman correlation between the per-bin mean of ``X``
    and the per-bin mean of ``y`` is +/-1. If no bin count above ``min_bins``
    achieves that, the feature is cut at ``min_bins + 1`` evenly spaced
    quantile edges instead.

    Parameters
    ----------
    X : pandas.Series
        Values of the continuous feature.
    y : pandas.Series
        Target values aligned with ``X``.
    max_bins : int, default 20
        Number of quantile bins tried first.
    min_bins : int, default 2
        Number of bins used by the fallback cut.

    Returns
    -------
    pandas.core.groupby.DataFrameGroupBy
        A frame with columns ``X``, ``y`` and ``Bin`` grouped by ``Bin``.
    """
    # Local import of the public scipy entry point (the ``scipy.stats.stats``
    # namespace is deprecated), so this block does not depend on it.
    from scipy.stats import spearmanr

    # A rank correlation over a handful of bins is either exactly +/-1 or far
    # from it, so this tolerance only absorbs floating-point round-off.
    tolerance = 1e-9

    n = max_bins
    while n >= min_bins + 1:
        # observed=False keeps every bin as a group regardless of pandas' default
        df_bins = pd.DataFrame({'X':X,'y':y,'Bin':pd.qcut(X,n,duplicates='drop')}) \
            .groupby('Bin', observed=False)
        x_means = df_bins['X'].mean()
        y_means = df_bins['y'].mean()

        # With fewer than two bins (duplicate edges were dropped) the correlation
        # is undefined; treat it like a NaN result and keep searching.
        if len(x_means) > 1:
            spearman_r, _ = spearmanr(x_means, y_means)
        else:
            spearman_r = np.nan

        # NaN compares False here, so an undefined correlation never ends the search
        if np.abs(spearman_r) >= 1 - tolerance:
            return df_bins
        n -= 1

    #no monotonic binning found: manually create bins from evenly spaced quantiles
    #(np.nanquantile is the public replacement for the private pandas helper; NaNs are ignored)
    bins = np.nanquantile(np.asarray(X, dtype=float), np.linspace(0, 1, min_bins + 1))
    if len(np.unique(bins)) == 2:
        #the edges collapsed to two distinct values: add an extra edge (1) and halve
        #the original lowest edge, which sits at index 1 after the insert
        bins = np.insert(bins,0,1)
        bins[1] = bins[1]-(bins[1]/2)
    return pd.DataFrame({'X':X, 'y':y, "Bin": pd.cut(X, np.unique(bins),include_lowest=True)}) \
        .groupby('Bin', observed=False)

In [5]:
#get the bins for discrete variables
def discrete_bins(X,y):
    """Pair a discrete feature with the target and group by each feature value.

    Parameters
    ----------
    X : pandas.Series
        Values of the discrete feature; each distinct value becomes one group.
    y : pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    pandas.core.groupby.DataFrameGroupBy
        A two-column frame (``X``, ``y``) grouped by ``X``.
    """
    paired = pd.DataFrame(dict(X=X, y=y))
    return paired.groupby(by='X')

In [6]:
#get the weight of evidence table
def get_woe_table(df_bins, is_continuous, feature_name):
    """Build the Weight of Evidence (WOE) / Information Value (IV) table for one feature.

    Parameters
    ----------
    df_bins : pandas.core.groupby.DataFrameGroupBy
        Grouped frame with a ``y`` column (and an ``X`` column when
        ``is_continuous`` is true), one group per bin. ``y`` is summed to count
        positives, so it is assumed to be a 0/1 indicator -- TODO confirm.
    is_continuous : bool
        True: ``Minimum``/``Maximum`` are the min/max of ``X`` in each bin.
        False: both are the group key itself.
    feature_name : str
        Written to the ``Feature`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty bin with the counts, rates, proportions, ``WOE``
        and ``IV`` columns, on a fresh 0..n-1 index.
    """
    count_total = df_bins['y'].count()
    count_positive = df_bins['y'].sum()
    count_negative = count_total - count_positive

    #for continuous data the bin is described by the range of X it holds
    if is_continuous:
        min_value = df_bins['X'].min()
        max_value = df_bins['X'].max()
    #for discrete data the bin is the group key itself
    else:
        min_value = count_total.index
        max_value = min_value

    df_woe_table =  pd.DataFrame({
        'Feature':feature_name,
        'Minimum':min_value,
        'Maximum':max_value,
        'Total':count_total,
        'TotalPositive':count_positive,
        'TotalNegative':count_negative,
        'PositiveRate': count_positive/count_total,
        'NegativeRate': count_negative/count_total,
        'ProportionOfPositives': count_positive / count_positive.sum(),
        'ProportionOfNegatives': count_negative / count_negative.sum(),
    })

    #bins without positives or without negatives give log(0) or a division by zero;
    #that is expected and handled below, so keep the RuntimeWarnings out of the output
    with np.errstate(divide='ignore', invalid='ignore'):
        df_woe_table['WOE'] = np.log(
            df_woe_table['ProportionOfPositives']/df_woe_table['ProportionOfNegatives'])
    df_woe_table['IV'] = (df_woe_table['ProportionOfPositives'] - df_woe_table['ProportionOfNegatives']) \
        * df_woe_table['WOE']

    #handle infinity values, only in the two derived columns so that genuine
    #infinite feature values in Minimum/Maximum are not overwritten
    #NOTE(review): +inf and -inf both become 1, so the sign of an infinite WOE is lost;
    #kept as-is to preserve existing results
    derived = ['WOE', 'IV']
    df_woe_table[derived] = df_woe_table[derived].replace([np.inf, -np.inf], 1)

    #drop any rows where there were no values in the bin
    df_woe_table = df_woe_table[df_woe_table['Total'] != 0]

    return df_woe_table.reset_index(drop=True)

    

In [7]:
#get WOE and IV values for each feature
#the per-feature tables are collected in a list and concatenated once at the end,
#instead of re-concatenating a growing frame on every iteration
woe_tables = []

#categorical features: one bin per distinct value
for column in categorical_columns:
    df_bins = discrete_bins(df[column],df['WnvPresent'])
    woe_table = get_woe_table(df_bins, False, column)
    woe_tables.append(woe_table)

#continuous features: quantile bins chosen by continuous_bins
for column in continuous_columns:
    df_bins = continuous_bins(df[column],df['WnvPresent'])
    woe_table = get_woe_table(df_bins, True, column)
    woe_tables.append(woe_table)

df_woe = pd.concat(woe_tables).reset_index(drop=True)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
#keep, for each feature, the largest IV found among its bins
max_iv = df_woe.groupby('Feature')['IV'].max()
df_woe = max_iv.reset_index()

#show the features from highest to lowest IV
df_woe.sort_values('IV', ascending=False)

Unnamed: 0,Feature,IV
34,PrecipWeekly_Score,1.000000
45,Species,1.000000
60,TrapZone,1.000000
23,PrecipConditions,1.000000
21,Month,1.000000
...,...,...
24,PrecipTotal,0.001895
43,RelHumidity_lag28,0.000823
29,PrecipTotal_lag14,0.000390
57,Tmax,0.000072


## Train/Test Split

In [9]:
#split the frame into the target vector (y) and the feature matrix (X, every other column)
y = df['WnvPresent']
X = df.drop(columns=['WnvPresent'])

In [10]:
#70/30 split stratified on the target; the seed is fixed so that Restart & Run All
#reproduces the same split (and therefore the same saved CSV files)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=42)

## Encode Categorical Features

In [11]:
#Fit the encoder to the training data
#scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name;
#try the current spelling first and fall back for older installations
try:
    encoder = preprocessing.OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = preprocessing.OneHotEncoder(sparse=False, drop='first')
encoder.fit(X_train[categorical_columns])

#`get_feature_names` was replaced by `get_feature_names_out` (removed in scikit-learn 1.2);
#the names are computed once and reused for both splits
if hasattr(encoder, 'get_feature_names_out'):
    encoded_names = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_names = encoder.get_feature_names(categorical_columns)

#Transform the training data
encoded_columns = pd.DataFrame(encoder.transform(X_train[categorical_columns]),
                               columns=encoded_names, index=X_train.index)
X_train =  pd.concat([X_train,encoded_columns], axis=1).drop(columns=categorical_columns)


#Transform the test data (with the encoder fitted on the training data)
encoded_columns = pd.DataFrame(encoder.transform(X_test[categorical_columns]),
                               columns=encoded_names, index=X_test.index)
X_test = pd.concat([X_test,encoded_columns], axis=1).drop(columns=categorical_columns)

In [12]:
#list the training columns after encoding: the original categorical columns have been
#replaced by their one-hot encoded columns
X_train.columns.to_list()

['NumMosquitos',
 'Tmax',
 'Tmin',
 'Tavg',
 'Tdepart',
 'Tdew_point',
 'Twet_bulb',
 'HeatDegDay',
 'CoolDegDay',
 'PrecipTotal',
 'Wind_AvgSpeed',
 'DaysSincePrecip',
 'PrecipWeekly',
 'DaylightMinutes',
 'RelHumidity',
 'Tavg_7days',
 'Tavg_14days',
 'Tavg_21days',
 'Tavg_28days',
 'Wind_AvgSpeed_7days',
 'Wind_AvgSpeed_14days',
 'Wind_AvgSpeed_21days',
 'Wind_AvgSpeed_28days',
 'PrecipTotal_7days',
 'PrecipTotal_14days',
 'PrecipTotal_21days',
 'PrecipTotal_28days',
 'RelHumidity_7days',
 'RelHumidity_14days',
 'RelHumidity_21days',
 'RelHumidity_28days',
 'Tavg_lag7',
 'Tavg_lag14',
 'Tavg_lag21',
 'Tavg_lag28',
 'Wind_AvgSpeed_lag7',
 'Wind_AvgSpeed_lag14',
 'Wind_AvgSpeed_lag21',
 'Wind_AvgSpeed_lag28',
 'PrecipTotal_lag7',
 'PrecipTotal_lag14',
 'PrecipTotal_lag21',
 'PrecipTotal_lag28',
 'DaylightMinutes_lag7',
 'DaylightMinutes_lag14',
 'DaylightMinutes_lag21',
 'DaylightMinutes_lag28',
 'RelHumidity_lag7',
 'RelHumidity_lag14',
 'RelHumidity_lag21',
 'RelHumidity_lag28',
 'T

## Scale Numerical Data

In [13]:
#Scale Numerical Data

#Learn the scaling parameters from the training rows only
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[continuous_columns])

#Training set: scaled continuous columns first, then the untouched remaining columns
scaled = pd.DataFrame(
    scaler.transform(X_train[continuous_columns]),
    columns=continuous_columns,
    index=X_train.index,
)
X_train = pd.concat([scaled, X_train.drop(columns=continuous_columns)], axis=1)

#Test set: same layout, using the scaler fitted on the training rows
scaled = pd.DataFrame(
    scaler.transform(X_test[continuous_columns]),
    columns=continuous_columns,
    index=X_test.index,
)
X_test = pd.concat([scaled, X_test.drop(columns=continuous_columns)], axis=1)


In [14]:
#show every row of the summary (this display option stays set for the rest of the session)
pd.set_option('display.max_rows', None)

#one row per column of X_train, largest maximum first
train_summary = X_train.describe().T
train_summary.sort_values('max', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Prev_Check,7354.0,-4.076454e-16,1.000068,-1.711834,-0.467674,-0.052954,0.154406,14.254891
PrecipTotal,7354.0,5.981376e-17,1.000068,-0.41496,-0.41496,-0.41496,-0.152834,12.593013
PrecipTotal_lag7,7354.0,7.757144000000001e-17,1.000068,-0.841298,-0.756983,-0.298523,0.344376,8.802185
PrecipTotal_lag28,7354.0,-1.607476e-16,1.000068,-1.016283,-0.707357,-0.147099,0.151355,8.565684
PrecipTotal_lag21,7354.0,-1.843327e-17,1.000068,-0.879817,-0.69428,-0.194362,0.254019,8.551642
PrecipTotal_lag14,7354.0,-1.328146e-16,1.000068,-0.884963,-0.698288,-0.229078,0.315811,8.347881
PrecipTotal_7days,7354.0,-2.303932e-16,1.000068,-0.725092,-0.588215,-0.322065,0.111379,6.232827
HeatDegDay,7354.0,-5.677929e-16,1.000068,-0.327328,-0.327328,-0.327328,-0.327328,5.268138
PrecipTotal_14days,7354.0,-3.060133e-17,1.000068,-1.002606,-0.658221,-0.371234,0.263725,5.167618
PrecipWeekly,7354.0,-2.022621e-16,1.000068,-0.946672,-0.633001,-0.319329,0.176474,4.446452


## Multicollinearity

In [15]:
#one row per training column; the VIF values are attached in the next cell
df_VIF = pd.DataFrame({'Feature':X_train.columns})

#variance inflation factor of every column, in column order
#NOTE(review): the matrix is passed without an added constant column - confirm this is intended
vif_values = []
for column_position in range(X_train.shape[1]):
    vif_values.append(variance_inflation_factor(X_train.values, column_position))

  vif = 1. / (1. - r_squared_i)


In [16]:
#attach the VIF values to their features and show the largest first
df_VIF = df_VIF.assign(VIF=vif_values)
df_VIF.sort_values('VIF', ascending=False)

Unnamed: 0,Feature,VIF
34,Tavg_lag7,inf
32,RelHumidity_21days,inf
118,Week_40,inf
35,Tavg_lag14,inf
36,Tavg_lag21,inf
104,Week_26,inf
38,Wind_AvgSpeed_lag7,inf
39,Wind_AvgSpeed_lag14,inf
40,Wind_AvgSpeed_lag21,inf
105,Week_27,inf


## Save The Data

In [17]:
#Save the training set (encoded and scaled features) as X_train.csv in the data folder
#save_file is a project helper; per the recorded output it asks before overwriting an existing file
datapath = '../data'
save_file(X_train, 'X_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data\X_train.csv"


In [18]:
#Save the training labels (y_train, the WnvPresent target) as y_train.csv in the data folder
#save_file is a project helper; per the recorded output it asks before overwriting an existing file
datapath = '../data'
save_file(y_train, 'y_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data\y_train.csv"


In [19]:
#Save the test set (encoded and scaled features) as X_test.csv in the data folder
#save_file is a project helper; per the recorded output it asks before overwriting an existing file
datapath = '../data'
save_file(X_test, 'X_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data\X_test.csv"


In [20]:
#Save the test labels (y_test, the WnvPresent target) as y_test.csv in the data folder
#save_file is a project helper; per the recorded output it asks before overwriting an existing file
datapath = '../data'
save_file(y_test, 'y_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data\y_test.csv"
