## Imports

In [2]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Global Values

In [3]:
NUMBER_TORNADOS = 111 #Number of Tornado Events
NUMBER_RANDOM = 110 #Number of Random Events
NUMBER_DAYS = 14 #Number of days of data per Tornado/Event (Max: 57)
FILE_NAME = "all_weather.csv" #CSV file that contains the data

## Loading Data

In [4]:

#Field 0: 'datatime'
#Field 1: 'temperature'
#Field 2: 'windspeed'
#Field 3: 'surface solar radiation' -Alvaro
#Field 4: 'relative humidity' -Abdullah
#Field 5: 'surface pressure' -Frazier
#Field 6: 'total precipitation' -Simon
#Field 7: 'city'
#Field 8: 'event_id'
#Field 9: 'latitude'
#Field 10: 'longitude'
#Field 11: 'outcome'

data = pd.read_csv (FILE_NAME)
df = data.drop(columns=['Unnamed: 0'])
event_ids = set(df['event_id'].to_numpy())

In [19]:
df

Unnamed: 0,datetime,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation,city,event_id,latitude,longitude,outcome
0,2017-04-30,12.18,5.76,181,0.60,98916.0,0.03,Birmingham,0,,,0.0
1,2017-05-01,11.01,3.41,191,0.79,99275.0,0.21,Birmingham,0,,,0.0
2,2017-05-02,10.88,3.09,184,0.76,100772.0,0.03,Birmingham,0,,,0.0
3,2017-05-03,9.70,5.75,189,0.72,101186.0,0.00,Birmingham,0,,,0.0
4,2017-05-04,10.98,5.86,248,0.66,101182.0,0.00,Birmingham,0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12592,2020-07-04,26.76,3.30,331,0.61,97023.0,0.00,,898307,46.11,-95.9,1.0
12593,2020-07-05,24.55,1.61,233,0.69,97004.0,0.11,,898307,46.11,-95.9,1.0
12594,2020-07-06,24.38,1.65,287,0.68,96966.0,0.01,,898307,46.11,-95.9,1.0
12595,2020-07-07,25.59,3.91,267,0.68,96431.0,0.50,,898307,46.11,-95.9,1.0


In [None]:
# Determine slope of each weather attribute over time for each Tornado/Event
temp_change = []
wind_change = []
rain_change = []
radiation_change = []
pressure_change = []
humidity_change = []
x = list(range(NUMBER_DAYS))

for i in event_ids:
    event = df.loc[df['event_id'] == i]
    if (i == 0 ):
        print(event)

## Create Timeserie Fetaures

In [5]:
# Determine slope of each weather attribute over time for each Tornado/Event
temp_change = []
wind_change = []
rain_change = []
radiation_change = []
pressure_change = []
humidity_change = []
x = list(range(NUMBER_DAYS))

for i in event_ids:
    event = df.loc[df['event_id'] == i]
    temp = event['temperature']
    wind = event['wind_speed']
    radiation = event['surface_solar_radiation']
    humidity = event['relative_humidity']
    pressure = event['surface_pressure']
    rain = event['total_precipitation']

    t_slope, intercept, r_value, p_value, std_err = stats.linregress(x, temp.tail(NUMBER_DAYS))
    w_slope, intercept, r_value, p_value, std_err = stats.linregress(x, wind.tail(NUMBER_DAYS))
    r_slope, intercept, r_value, p_value, std_err = stats.linregress(x, rain.tail(NUMBER_DAYS))
    rd_slope, intercept, r_value, p_value, std_err = stats.linregress(x, radiation.tail(NUMBER_DAYS))
    p_slope, intercept, r_value, p_value, std_err = stats.linregress(x, pressure.tail(NUMBER_DAYS))
    h_slope, intercept, r_value, p_value, std_err = stats.linregress(x, humidity.tail(NUMBER_DAYS))

    temp_change.append(t_slope)
    wind_change.append(w_slope)
    rain_change.append(r_slope)
    radiation_change.append(rd_slope)
    pressure_change.append(p_slope)
    humidity_change.append(h_slope)


## Dataframe with mean values of each Tornado/Event

In [6]:
averages = df.groupby('event_id').tail(NUMBER_DAYS+7)
averages = averages.groupby('event_id').mean()

## Add Slope Features

In [7]:
averages['t_slope'] = temp_change
averages['w_slope'] = wind_change
averages['r_slope'] = rain_change
averages['rd_slope'] = radiation_change
averages['p_slope'] = pressure_change
averages['h_slope'] = humidity_change

## Model Prep

In [8]:
X = averages[['temperature','wind_speed','surface_solar_radiation','relative_humidity', 'total_precipitation', 't_slope', 'w_slope', 'r_slope', 'rd_slope', 'p_slope', 'h_slope']]
y = averages['outcome']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Train Model with Logistic Regression

In [9]:
logisticRegr = LogisticRegression(max_iter=10000)
logisticRegr.fit(x_train, y_train)
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.8888888888888888


In [10]:
print("regression coefficients: \n", logisticRegr.coef_)


regression coefficients: 
 [[ 0.07529001  0.62336569  0.00274314 -0.00452992  2.54417718  1.16982022
   1.54257202  1.29218123 -0.06951692 -0.00859638  0.13963608]]


## Determine Individual Feature Strength

In [11]:
# Returns array with model accuracy based on individial feature
scores = []
for i in range(11):

    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train.to_numpy()[:,i].reshape(-1,1), y_train)
    score = logisticRegr.score(x_test.to_numpy()[:,i].reshape(-1,1), y_test)
    scores.append(round(score,2))
print(scores)

[0.71, 0.49, 0.69, 0.62, 0.84, 0.87, 0.78, 0.87, 0.69, 0.82, 0.49]
