## Imports

In [1]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

## Global Values

In [78]:
NUMBER_TORNADOS = 111 #Number of Tornado Events
NUMBER_RANDOM = 110 #Number of Random Events
NUMBER_DAYS = 14 #Number of days of data per Tornado/Event (Max: 57)
FILE_NAME = "all_data.csv" #CSV file that contains the data

## Loading Data

In [79]:

#Field 0: 'datatime'
#Field 1: 'temperature'
#Field 2: 'windspeed'
#Field 3: 'surface solar radiation' -Alvaro
#Field 4: 'relative humidity' -Abdullah
#Field 5: 'surface pressure' -Frazier
#Field 6: 'total precipitation' -Simon
#Field 7: 'city'
#Field 8: 'event_id'
#Field 9: 'latitude'
#Field 10: 'longitude'
#Field 11: 'outcome'

data = pd.read_csv (FILE_NAME)
df = data.drop(columns=['Unnamed: 0'])
event_ids = set(df['event_id'].to_numpy())

## Create Timeserie Fetaures

In [80]:
# Determine slope of each weather attribute over time for each Tornado/Event
temp_change = []
wind_change = []
rain_change = []
radiation_change = []
pressure_change = []
humidity_change = []
x = list(range(NUMBER_DAYS))

for i in event_ids:
    event = df.loc[df['event_id'] == i]
    temp = event['temperature']
    wind = event['wind_speed']
    radiation = event['surface_solar_radiation']
    humidity = event['relative_humidity']
    pressure = event['surface_pressure']
    rain = event['total_precipitation']

    t_slope, intercept, r_value, p_value, std_err = stats.linregress(x, temp[-21:-7])
    w_slope, intercept, r_value, p_value, std_err = stats.linregress(x, wind[-21:-7])
    r_slope, intercept, r_value, p_value, std_err = stats.linregress(x, rain[-21:-7])
    rd_slope, intercept, r_value, p_value, std_err = stats.linregress(x, radiation[-21:-7])
    p_slope, intercept, r_value, p_value, std_err = stats.linregress(x, pressure[-21:-7])
    h_slope, intercept, r_value, p_value, std_err = stats.linregress(x, humidity[-21:-7])

    temp_change.append(t_slope)
    wind_change.append(w_slope)
    rain_change.append(r_slope)
    radiation_change.append(rd_slope)
    pressure_change.append(p_slope)
    humidity_change.append(h_slope)


## Dataframe with mean values of each Tornado/Event

In [81]:
t1 = df.groupby('event_id').tail(NUMBER_DAYS+7)
t2 = df.groupby('event_id').tail(7)
cond = t1.index.isin(t2.index)
t1.drop(t1[cond].index, inplace = True)
averages = t1.groupby('event_id').mean()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Add Slope Features

In [82]:
averages['t_slope'] = temp_change
averages['w_slope'] = wind_change
averages['r_slope'] = rain_change
averages['rd_slope'] = radiation_change
averages['p_slope'] = pressure_change
averages['h_slope'] = humidity_change

In [83]:
averages

Unnamed: 0_level_0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation,latitude,longitude,outcome,t_slope,w_slope,r_slope,rd_slope,p_slope,h_slope
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,16.062143,4.642857,197.428571,0.742143,100071.240000,0.120714,,,0.0,0.637868,-0.387692,-0.036857,12.821978,187.289495,-0.015275
1,-4.708571,3.628571,117.857143,0.652143,98396.950714,0.012857,,,0.0,0.794593,-0.027648,0.000044,1.164835,-60.263626,0.008857
2,25.928571,3.630714,278.285714,0.657857,97179.799286,0.057857,,,0.0,0.223868,0.137780,-0.018000,5.780220,53.467143,-0.015407
3,7.548571,3.083571,83.642857,0.731429,99236.604286,0.153571,,,0.0,0.444440,0.158308,0.025341,-2.951648,-116.959165,0.000440
4,10.167857,3.908571,205.285714,0.547857,98173.071429,0.022143,,,0.0,-0.071758,-0.035077,-0.000901,2.769231,91.087341,-0.023231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899721,20.472143,2.072143,223.928571,0.669286,100406.642857,0.164286,31.0750,-89.9209,1.0,0.908615,-0.234440,-0.026769,15.931868,-5.463736,-0.010725
902831,21.210000,2.022143,223.785714,0.637857,100283.142857,0.025000,31.4641,-89.7579,1.0,-0.812989,-0.056462,0.028418,-16.883516,89.025495,0.017780
902837,21.170714,2.005714,222.785714,0.632857,100284.428571,0.025000,31.5475,-89.6194,1.0,-0.459582,0.003626,-0.026198,3.052747,69.540132,-0.015451
902843,21.121429,1.986429,221.357143,0.626429,100326.285714,0.027857,31.6661,-89.4004,1.0,-0.580835,-0.113143,0.000110,-7.826374,-8.503297,0.013033


## Model Prep

In [84]:
X = averages[['temperature','wind_speed','surface_solar_radiation','relative_humidity', 'total_precipitation', 't_slope', 'w_slope', 'r_slope', 'rd_slope', 'p_slope', 'h_slope']]
y = averages['outcome']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Train Model with Logistic Regression

In [85]:
logisticRegr = LogisticRegression(max_iter=10000)
logisticRegr.fit(x_train, y_train)
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.7272727272727273


In [86]:
print("regression coefficients: \n", logisticRegr.coef_)


regression coefficients: 
 [[-0.03507505 -0.35890873  0.01047717 -1.66308192  1.70169945 -0.28426589
  -0.93042453  0.08051527 -0.01533377  0.00231602  0.08526796]]


## Determine Individual Feature Strength

In [87]:
# Returns array with model accuracy based on individial feature
scores = []
for i in range(11):

    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train.to_numpy()[:,i].reshape(-1,1), y_train)
    score = logisticRegr.score(x_test.to_numpy()[:,i].reshape(-1,1), y_test)
    scores.append(round(score,2))
print(scores)

[0.74, 0.69, 0.76, 0.69, 0.69, 0.69, 0.69, 0.69, 0.69, 0.69, 0.69]


## Train Model with Naive Bayes

In [88]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
predictions = gnb.predict(x_test)
score = gnb.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.7520661157024794


## Testing