## Imports

In [11]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

## Global Values

In [12]:
# Number of Tornado events
NUMBER_TORNADOS = 111
# Number of Random events
NUMBER_RANDOM = 110
# Number of days of data per Tornado/Event (Max: 57)
NUMBER_DAYS = 14
# CSV file that contains the data
FILE_NAME = "all_data.csv"

## Loading Data

In [13]:

# Fields of the data set (weather fields are written with the column names
# the cells below actually read; contributor attribution kept as-is):
#   Field 0:  'datatime'   (NOTE(review): possibly 'datetime' - confirm against the CSV header)
#   Field 1:  'temperature'
#   Field 2:  'wind_speed'
#   Field 3:  'surface_solar_radiation'  -Alvaro
#   Field 4:  'relative_humidity'        -Abdullah
#   Field 5:  'surface_pressure'         -Frazier
#   Field 6:  'total_precipitation'      -Simon
#   Field 7:  'city'
#   Field 8:  'event_id'
#   Field 9:  'latitude'
#   Field 10: 'longitude'
#   Field 11: 'outcome'

# Raw CSV contents, kept untouched in `data`.
data = pd.read_csv(FILE_NAME)
# Working copy without the 'Unnamed: 0' column
# (presumably an index column written when the CSV was saved - confirm).
df = data.drop('Unnamed: 0', axis=1)
# Distinct event identifiers found in the data.
event_ids = set(df.loc[:, 'event_id'].to_numpy())

## Create Time-Series Features

In [14]:
# Determine slope of each weather attribute over time for each Tornado/Event.
#
# For every event a straight line is fitted (scipy.stats.linregress) to the
# NUMBER_DAYS rows that precede the event's final 7 rows, with x = 0..NUMBER_DAYS-1.
# The fitted slope is appended to the matching *_change list, one entry per event.
temp_change = []
wind_change = []
rain_change = []
radiation_change = []
pressure_change = []
humidity_change = []
x = list(range(NUMBER_DAYS))

# Positional window used for the fit. It is derived from NUMBER_DAYS so that it
# always contains exactly len(x) rows (a hard-coded [-21:-7] only works for
# NUMBER_DAYS == 14) and matches the tail(NUMBER_DAYS + 7) / tail(7) window
# used when the per-event means are computed.
fit_window = slice(-(NUMBER_DAYS + 7), -7)

# Source column -> list that collects its per-event slope.
slope_targets = [
    ('temperature', temp_change),
    ('wind_speed', wind_change),
    ('total_precipitation', rain_change),
    ('surface_solar_radiation', radiation_change),
    ('surface_pressure', pressure_change),
    ('relative_humidity', humidity_change),
]

# Iterate in sorted order: `event_ids` is a set, so its own iteration order is
# arbitrary, whereas the slope lists are later assigned positionally to a frame
# produced by groupby('event_id'), whose rows are sorted by event_id.
for i in sorted(event_ids):
    event = df.loc[df['event_id'] == i]
    for column, slopes in slope_targets:
        # .iloc makes the negative slice explicitly positional.
        slopes.append(stats.linregress(x, event[column].iloc[fit_window]).slope)



## Dataframe with mean values of each Tornado/Event

In [16]:
# Per-event mean of every numeric column over the NUMBER_DAYS rows that
# precede each event's final 7 rows.
t1 = df.groupby('event_id').tail(NUMBER_DAYS + 7)  # last NUMBER_DAYS + 7 rows of each event
t2 = df.groupby('event_id').tail(7)                # last 7 rows of each event (to be excluded)
cond = t1.index.isin(t2.index)
# Keep only the rows outside the final 7. Filtering into a new frame avoids
# mutating the groupby().tail() result in place, so the cell can be re-run safely.
t1 = t1.loc[~cond]
# numeric_only=True: since pandas 2.0 the default no longer skips non-numeric
# columns (e.g. 'city', presumably text) and mean() raises TypeError on them.
averages = t1.groupby('event_id').mean(numeric_only=True)


## Add Slope Features

In [17]:
# Attach each list of per-event slopes to `averages` as a new column.
# The lists are assigned positionally, so each must hold one value per row of
# `averages`, in the same row order.
slope_columns = {
    't_slope': temp_change,
    'w_slope': wind_change,
    'r_slope': rain_change,
    'rd_slope': radiation_change,
    'p_slope': pressure_change,
    'h_slope': humidity_change,
}
for column_name, slope_values in slope_columns.items():
    averages[column_name] = slope_values

In [18]:
# Sanity check: display the Python type of `averages` as the cell output.
type(averages)

pandas.core.frame.DataFrame

## Model Prep

In [20]:
# Columns fed to the models: five per-event means followed by the six slopes.
# NOTE(review): the mean of 'surface_pressure' is not included although its
# slope ('p_slope') is - confirm this is intentional.
feature_columns = [
    'temperature', 'wind_speed', 'surface_solar_radiation', 'relative_humidity',
    'total_precipitation',
    't_slope', 'w_slope', 'r_slope', 'rd_slope', 'p_slope', 'h_slope',
]
X = averages[feature_columns]
# Target column.
y = averages['outcome']
# Hold out 5% of the events for testing; random_state=0 keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

## Train Model with Logistic Regression

In [21]:
# Fit a logistic regression on the training split (up to 10000 solver iterations).
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train, y_train)

# Accuracy on the held-out split, plus the predicted labels for the same rows.
score = logisticRegr.score(x_test, y_test)
predictions = logisticRegr.predict(x_test)

print("Model Accuracy: ", score)

Model Accuracy:  0.8387096774193549


In [20]:
# Show the coefficient array (`coef_`) of the fitted logistic regression.
coefficients = logisticRegr.coef_
print("regression coefficients: \n", coefficients)


regression coefficients: 
 [[-0.03495819 -0.35864962  0.01046594 -1.65658159  1.69566366 -0.28218309
  -0.93126735  0.08048918 -0.01537305  0.00232122  0.08539507]]


## Determine Individual Feature Strength

In [25]:
# Returns array with model accuracy based on individual feature:
# for each feature column, a separate logistic regression is fitted on that
# column alone and scored on the test split; the rounded accuracies are
# collected in `scores`, in column order.
scores = []

# Convert once, outside the loop.
train_values = x_train.to_numpy()
test_values = x_test.to_numpy()

# Loop over however many feature columns the training data has, instead of a
# hard-coded 11, so the cell keeps working if the feature list changes.
for i in range(train_values.shape[1]):
    # Dedicated names: reusing `logisticRegr` / `score` here would overwrite the
    # model fitted on all features (and its accuracy) with the last
    # single-feature model.
    single_feature_model = LogisticRegression()
    # reshape(-1, 1): the estimator expects a 2-D array, here with one column.
    single_feature_model.fit(train_values[:, i].reshape(-1, 1), y_train)
    feature_score = single_feature_model.score(test_values[:, i].reshape(-1, 1), y_test)
    scores.append(round(feature_score, 2))
print(scores)

[0.87, 0.77, 0.9, 0.77, 0.77, 0.77, 0.77, 0.77, 0.77, 0.77, 0.77]


## Train Model with Naive Bayes

In [26]:
# Fit a Gaussian naive Bayes classifier on the same training split.
gnb = GaussianNB().fit(x_train, y_train)

# Accuracy on the held-out split, plus the predicted labels for the same rows.
score = gnb.score(x_test, y_test)
predictions = gnb.predict(x_test)

print("Model Accuracy: ", score)

Model Accuracy:  0.8064516129032258


## Testing