## Imports

In [7]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

## Global Values

In [8]:
NUMBER_TORNADOS = 111 #Number of Tornado Events
NUMBER_RANDOM = 110 #Number of Random Events
NUMBER_DAYS = 14 #Number of days of data per Tornado/Event (Max: 57)
FILE_NAME = "/Users/abdullaahrobins/Documents/GitHub/CS506Spring2021Repository/NaturalDisasterProject/Deliverable_3/all_weather.csv" #CSV file that contains the data

## Loading Data

In [9]:

#Field 0: 'datatime'
#Field 1: 'temperature'
#Field 2: 'windspeed'
#Field 3: 'surface solar radiation' -Alvaro
#Field 4: 'relative humidity' -Abdullah
#Field 5: 'surface pressure' -Frazier
#Field 6: 'total precipitation' -Simon
#Field 7: 'city'
#Field 8: 'event_id'
#Field 9: 'latitude'
#Field 10: 'longitude'
#Field 11: 'outcome'

data = pd.read_csv (FILE_NAME)
df = data.drop(columns=['Unnamed: 0'])
event_ids = set(df['event_id'].to_numpy())

## Create Timeserie Fetaures

In [10]:
# Determine slope of each weather attribute over time for each Tornado/Event
temp_change = []
wind_change = []
rain_change = []
radiation_change = []
pressure_change = []
humidity_change = []
x = list(range(NUMBER_DAYS))

for i in event_ids:
    event = df.loc[df['event_id'] == i]
    temp = event['temperature']
    wind = event['wind_speed']
    radiation = event['surface_solar_radiation']
    humidity = event['relative_humidity']
    pressure = event['surface_pressure']
    rain = event['total_precipitation']

    t_slope, intercept, r_value, p_value, std_err = stats.linregress(x, temp.tail(NUMBER_DAYS))
    w_slope, intercept, r_value, p_value, std_err = stats.linregress(x, wind.tail(NUMBER_DAYS))
    r_slope, intercept, r_value, p_value, std_err = stats.linregress(x, rain.tail(NUMBER_DAYS))
    rd_slope, intercept, r_value, p_value, std_err = stats.linregress(x, radiation.tail(NUMBER_DAYS))
    p_slope, intercept, r_value, p_value, std_err = stats.linregress(x, pressure.tail(NUMBER_DAYS))
    h_slope, intercept, r_value, p_value, std_err = stats.linregress(x, humidity.tail(NUMBER_DAYS))

    temp_change.append(t_slope)
    wind_change.append(w_slope)
    rain_change.append(r_slope)
    radiation_change.append(rd_slope)
    pressure_change.append(p_slope)
    humidity_change.append(h_slope)


## Dataframe with mean values of each Tornado/Event

In [11]:
averages = df.groupby('event_id').tail(NUMBER_DAYS+7)
averages = averages.groupby('event_id').mean()

In [24]:
averages

Unnamed: 0_level_0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation,latitude,longitude,outcome,t_slope,w_slope,r_slope,rd_slope,p_slope,h_slope
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,17.162857,4.305238,196.000000,0.741429,100111.523810,0.087619,,,0.0,0.130132,0.057209,0.001319,-5.514286,-52.676923,0.004681
1,-1.370476,3.534762,111.952381,0.677143,98104.285714,0.052381,,,0.0,1.525736,0.005846,0.022593,-5.868132,-98.468132,0.015231
2,27.081905,3.657619,294.333333,0.599524,97206.952381,0.042381,,,0.0,0.356681,0.065714,-0.000505,3.881319,9.841758,-0.015319
3,5.958095,2.959524,70.857143,0.744762,99537.619048,0.172857,,,0.0,-0.250615,-0.015099,0.012374,-4.498901,94.314286,0.012154
4,10.205714,3.782857,199.809524,0.564286,98089.809524,0.319524,,,0.0,0.293363,0.027143,0.169978,-7.276923,-133.791209,0.023736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899721,19.664762,2.309524,217.571429,0.674762,100453.428571,0.260000,31.0750,-89.9209,1.0,-0.096154,0.005297,0.021604,-12.679121,-112.210989,0.008549
902831,21.298571,2.420000,206.238095,0.664762,100154.190476,0.401429,31.4641,-89.7579,1.0,0.604945,0.158132,0.073780,-3.756044,-41.553846,0.013714
902837,21.240952,2.400952,205.333333,0.660476,100155.380952,0.364762,31.5475,-89.6194,1.0,0.484879,0.070835,0.040967,-2.516484,-93.907692,0.013055
902843,21.176667,2.375238,204.428571,0.653333,100196.761905,0.299524,31.6661,-89.4004,1.0,0.176879,0.326813,0.354989,2.512088,-54.837363,-0.006440


## Add Slope Features

In [12]:
averages['t_slope'] = temp_change
averages['w_slope'] = wind_change
averages['r_slope'] = rain_change
averages['rd_slope'] = radiation_change
averages['p_slope'] = pressure_change
averages['h_slope'] = humidity_change

## Model Prep

In [13]:
X = averages[['temperature','wind_speed','surface_solar_radiation','relative_humidity', 'total_precipitation', 't_slope', 'w_slope', 'r_slope', 'rd_slope', 'p_slope', 'h_slope']]
y = averages['outcome']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Train Model with Logistic Regression

In [30]:
logisticRegr = LogisticRegression(max_iter=10000)
logisticRegr.fit(x_train, y_train)
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.8888888888888888


In [15]:
print("regression coefficients: \n", logisticRegr.coef_)


regression coefficients: 
 [[ 0.07573569  0.61778142  0.00266155 -0.00757722  2.53907333  1.18770639
   1.53761794  1.28692345 -0.06994059 -0.0085332   0.1390187 ]]


## Determine Individual Feature Strength

In [16]:
# Returns array with model accuracy based on individial feature
scores = []
for i in range(11):

    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train.to_numpy()[:,i].reshape(-1,1), y_train)
    score = logisticRegr.score(x_test.to_numpy()[:,i].reshape(-1,1), y_test)
    scores.append(round(score,2))
print(scores)

[0.71, 0.49, 0.69, 0.62, 0.84, 0.87, 0.78, 0.87, 0.69, 0.82, 0.49]


## Train Model with Naive Bayes

In [17]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
predictions = gnb.predict(x_test)
score = gnb.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.9333333333333333


## Testing

In [19]:
test_data = pd.read_csv ('historical_data_2.csv')
test_data.columns = ['datetime', 'temperature', 'wind_speed', 'surface_solar_radiation', 'relative_humidity', 'surface_pressure', 'total_precipitation', 'latitude', 'longitude', 'event_id']
event_ids = set(test_data['event_id'].to_numpy())

In [20]:
test_averages = test_data.groupby('event_id').tail(NUMBER_DAYS+7)
test_averages = test_averages.groupby('event_id').mean()

In [25]:


# Determine slope of each weather attribute over time for each Tornado/Event
temp_change = []
wind_change = []
rain_change = []
radiation_change = []
pressure_change = []
humidity_change = []
x = list(range(NUMBER_DAYS))

for i in event_ids:
    event = test_data.loc[test_data['event_id'] == i]
    temp = event['temperature']
    wind = event['wind_speed']
    radiation = event['surface_solar_radiation']
    humidity = event['relative_humidity']
    pressure = event['surface_pressure']
    rain = event['total_precipitation']

    t_slope, intercept, r_value, p_value, std_err = stats.linregress(x, temp.tail(NUMBER_DAYS))
    w_slope, intercept, r_value, p_value, std_err = stats.linregress(x, wind.tail(NUMBER_DAYS))
    r_slope, intercept, r_value, p_value, std_err = stats.linregress(x, rain.tail(NUMBER_DAYS))
    rd_slope, intercept, r_value, p_value, std_err = stats.linregress(x, radiation.tail(NUMBER_DAYS))
    p_slope, intercept, r_value, p_value, std_err = stats.linregress(x, pressure.tail(NUMBER_DAYS))
    h_slope, intercept, r_value, p_value, std_err = stats.linregress(x, humidity.tail(NUMBER_DAYS))

    temp_change.append(t_slope)
    wind_change.append(w_slope)
    rain_change.append(r_slope)
    radiation_change.append(rd_slope)
    pressure_change.append(p_slope)
    humidity_change.append(h_slope)


In [26]:
test_averages['t_slope'] = temp_change
test_averages['w_slope'] = wind_change
test_averages['r_slope'] = rain_change
test_averages['rd_slope'] = radiation_change
test_averages['p_slope'] = pressure_change
test_averages['h_slope'] = humidity_change

In [27]:
test_averages

Unnamed: 0_level_0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation,latitude,longitude,t_slope,w_slope,r_slope,rd_slope,p_slope,h_slope
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
203448,5.507143,2.455238,112.952381,0.691905,101282.675714,0.300000,32.4869,-94.1689,0.575297,0.013473,0.013978,-7.630769,-47.839275,0.029956
203449,5.455238,2.483810,112.523810,0.693333,101349.630000,0.290000,32.5182,-94.0432,0.915736,0.198967,0.274088,-8.769231,-137.808549,0.014725
218501,11.898095,2.414286,173.952381,0.649524,98169.105238,0.327143,36.0049,-80.0396,0.935560,-0.361187,0.084484,1.534066,-60.043736,-0.000066
219252,17.779048,2.793333,227.666667,0.572381,98247.882381,0.128095,35.4603,-92.7155,0.958681,-0.339033,0.110791,1.021978,-56.355868,-0.002901
219961,18.789524,3.106667,232.238095,0.561429,100469.000476,0.399048,35.4460,-91.2711,0.999165,-0.330615,0.169802,1.028571,-52.653780,-0.005648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675354,15.397143,2.918571,129.476190,0.681429,101186.580000,0.172381,30.6210,-90.9030,0.852088,0.330747,-0.008484,-3.481319,-116.386813,0.024220
676496,14.755714,2.882381,103.428571,0.784762,100997.754286,0.153333,31.2738,-89.3478,0.566879,0.149868,0.162418,-3.347253,-34.329736,0.014615
677489,10.656190,3.286667,136.047619,0.624762,99681.615714,0.028095,37.8371,-89.6619,0.017077,-0.013429,-0.000637,-6.582418,81.201912,0.008396
677492,10.712857,3.430476,132.761905,0.619048,99857.850476,0.033333,37.9392,-89.1503,0.616769,-0.015077,0.260549,1.446154,11.564615,0.003253


In [29]:
X_test = test_averages[['temperature','wind_speed','surface_solar_radiation','relative_humidity', 'total_precipitation', 't_slope', 'w_slope', 'r_slope', 'rd_slope', 'p_slope', 'h_slope']]


In [31]:
predictions0 = gnb.predict(X_test)
predictions1 = logisticRegr.predict(X_test)


In [35]:
sum(predictions1)/ len(predictions1)

0.7162162162162162