In [1]:
import pandas as pd
from sklearn import preprocessing, model_selection
from library.sb_utils import save_file

In [2]:
# Load the cleaned trap data from CSV into a pandas DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
filepath = '../data/data_cleaned.csv'
df = pd.read_csv(filepath)
# Print column names, non-null counts and dtypes as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8169 entries, 0 to 8168
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Species        8169 non-null   object 
 1   Latitude       8169 non-null   float64
 2   Longitude      8169 non-null   float64
 3   NumMosquitos   8169 non-null   float64
 4   NumTrapGroups  8169 non-null   int64  
 5   WnvPresent     8169 non-null   int64  
 6   Month          8169 non-null   int64  
 7   DailyPrecip    8169 non-null   float64
 8   Wind_AvgSpeed  8169 non-null   float64
 9   IsSprayed      8169 non-null   int64  
 10  Tavg           8169 non-null   float64
 11  AnnualPrecip   8169 non-null   float64
dtypes: float64(7), int64(4), object(1)
memory usage: 766.0+ KB


## Train/Test Split

In [3]:
# Pull the WnvPresent label out as the target vector (y); every remaining
# column of df becomes part of the feature matrix (X). df itself is not modified.
y = df.loc[:, 'WnvPresent']
X = df.drop('WnvPresent', axis=1)

In [4]:
# Hold out 30% of the rows for testing (70% train).
# random_state pins the shuffle: without it every Restart & Run All produces a
# different partition, so the CSV files saved at the end of this notebook (and any
# model scores computed from them) would not be reproducible.
# NOTE(review): if WnvPresent is imbalanced, consider also passing stratify=y so
# both partitions keep the same class proportions - confirm against the data first.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.7, random_state=RANDOM_STATE
)

## Encode Categorical Features

In [5]:
#Encode Species and Month columns
columns = ['Species','Month']

#Fit the encoder to the training data only; the same fitted encoder is then
#applied to both the training and the test features.
# The dense/sparse keyword is deliberately NOT passed: it was spelled `sparse` in
# older scikit-learn, renamed to `sparse_output` in 1.2, and the old spelling was
# later removed. Densifying the result explicitly (see one_hot_encode) works on
# every version.
encoder = preprocessing.OneHotEncoder(drop='first')
encoder.fit(X_train[columns])


def one_hot_encode(features, fitted_encoder, categorical_columns):
    """Return a copy of `features` with categorical columns one-hot encoded.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing every column named in `categorical_columns`.
    fitted_encoder : sklearn.preprocessing.OneHotEncoder
        Encoder that has already been fitted on `categorical_columns`.
    categorical_columns : list of str
        Columns to encode; they are dropped from the result and replaced by
        the encoder's indicator columns, appended on the right.

    Returns
    -------
    pandas.DataFrame
        Same index as `features`; original non-categorical columns followed by
        the indicator columns.
    """
    encoded = fitted_encoder.transform(features[categorical_columns])
    # With the default settings transform() returns a sparse matrix; convert it
    # to a dense array so it can be wrapped in a regular DataFrame.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    # get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and only fall back on older installations.
    if hasattr(fitted_encoder, 'get_feature_names_out'):
        names = fitted_encoder.get_feature_names_out(categorical_columns)
    else:
        names = fitted_encoder.get_feature_names(categorical_columns)

    # Re-use the original index so the concat aligns rows correctly (the
    # train/test split leaves each frame with a shuffled subset of the index).
    indicator_frame = pd.DataFrame(encoded, columns=names, index=features.index)
    return pd.concat([features, indicator_frame], axis=1).drop(columns=categorical_columns)


#Transform the training data
X_train = one_hot_encode(X_train, encoder, columns)

#Transform the test data
X_test = one_hot_encode(X_test, encoder, columns)

## Scale Numerical Data

In [6]:
#Scale the numerical feature columns
columns = ['Latitude','Longitude','NumMosquitos','NumTrapGroups','DailyPrecip',
          'Wind_AvgSpeed','IsSprayed','Tavg','AnnualPrecip']

#Fit the scaler on the training rows only; the same fitted scaler is then
#applied to both the training and the test features.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[columns])

#Training set: build the scaled frame with its labels and index in one step,
#then put it in front of the columns that were not scaled.
scaled = pd.DataFrame(
    scaler.transform(X_train[columns]),
    index=X_train.index,
    columns=columns,
)
X_train = pd.concat([scaled, X_train.drop(columns, axis=1)], axis='columns')

#Test set: identical treatment, reusing the scaler fitted above.
scaled = pd.DataFrame(
    scaler.transform(X_test[columns]),
    index=X_test.index,
    columns=columns,
)
X_test = pd.concat([scaled, X_test.drop(columns, axis=1)], axis='columns')


In [7]:
#Save the training set
# X_train is the encoded and scaled training feature matrix built above.
# save_file is a project helper imported from library.sb_utils; the recorded output
# shows it reports the path it writes to. Its overwrite behaviour is not visible
# here - TODO confirm before re-running against existing files.
datapath = '../data'
save_file(X_train, 'X_train.csv', datapath)

Writing file.  "../data\X_train.csv"


In [8]:
#Save the training labels
# y_train holds the WnvPresent values for the rows that went into the training split.
# It is written next to the feature files through the project helper save_file
# (library.sb_utils).
datapath = '../data'
save_file(y_train, 'y_train.csv', datapath)

Writing file.  "../data\y_train.csv"


In [9]:
#Save the test set
# X_test is the held-out feature matrix, transformed with the encoder and scaler
# that were fitted on the training data. Written through the project helper
# save_file (library.sb_utils).
datapath = '../data'
save_file(X_test, 'X_test.csv', datapath)

Writing file.  "../data\X_test.csv"


In [10]:
#Save the test labels
# y_test holds the WnvPresent values for the rows that went into the test split.
# Written through the project helper save_file (library.sb_utils).
datapath = '../data'
save_file(y_test, 'y_test.csv', datapath)

Writing file.  "../data\y_test.csv"
