In [527]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [528]:

FILE_NAME = "all_new_data.csv" # CSV file that contains the data
# Upper bound on the number of rows pandas prints; passed to the
# 'display.max_rows' option in the display-settings cell.
NUMBER_OF_ROWS = 296

## Processing Random Weather Data and Tornado Data

In [529]:
# Notebook-wide pandas display settings, applied in the order listed.
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', NUMBER_OF_ROWS),
    ('display.min_rows', 20),
):
    pd.set_option(option_name, option_value)

In [530]:
# Load the raw table. The file carries an extra 'Unnamed: 0' column
# (presumably a saved index - confirm); it is dropped later, per event.
df = pd.read_csv(FILE_NAME)

In [531]:
df.columns

Index(['Unnamed: 0', 'datetime', 'temperature', 'wind_speed',
       'surface_solar_radiation', 'relative_humidity', 'surface_pressure',
       'total_precipitation', 'latitude', 'longitude', 'event_id', 'outcome',
       'city'],
      dtype='object')

In [532]:
# NOTE: 'Unnamed: 0' is not dropped here; it is removed per event in the
# feature loop that follows.
# Get the set of distinct event ids. A set does not keep the order in which
# the ids appear in the file; the loop below iterates in the set's own order.
event_ids = set(df['event_id'].to_numpy())

In [537]:
## Produces a list of dfs
# Each df holds the expanding (cumulative) means of one event's numeric
# features; labels[k] is the outcome recorded for list_of_dfs[k].
labels = []
list_of_dfs = []
# Use IDs to loop over events
for i in event_ids:
    # Get data for that event
    event = df.loc[df['event_id'] == i]
    # Extract the label (0 or 1) from the event's first row and record it
    label = event['outcome'].iloc[0]
    labels.append(label)
    # Keep only the numeric measurements: drop the identifier/label columns,
    # then drop whatever non-numeric columns remain (e.g. 'datetime', 'city').
    # Older pandas silently skipped non-numeric columns in expanding().mean();
    # newer releases raise instead, so they are filtered out explicitly.
    fundamental_features = (
        event
        .drop(columns=['latitude', 'longitude', 'event_id', 'outcome', 'Unnamed: 0'])
        .select_dtypes(include='number')
    )
    # Expanding mean: row k holds the mean of rows 0..k of this event
    event_df = fundamental_features.expanding().mean()
    # Take the last 21 rows, then keep the FIRST 14 of those, i.e. the final
    # 7 rows of the event are excluded.
    # NOTE(review): the original comment said "the last 14 days of this",
    # which head(14) does not do - confirm which window is intended.
    event_df = event_df.tail(21).head(14)
    # Add to list
    list_of_dfs.append(event_df)

In [538]:
list_of_dfs[0]

Unnamed: 0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation
16908,13.474054,3.458378,193.351351,0.75,100303.218649,0.097838
16909,13.443421,3.545,192.947368,0.751053,100246.238158,0.114474
16910,13.447692,3.614359,193.128205,0.749231,100230.75,0.114103
16911,13.47275,3.68625,191.15,0.75225,100209.90675,0.11475
16912,13.509512,3.7,191.634146,0.752439,100197.407317,0.112927
16913,13.568095,3.755238,189.119048,0.754524,100194.999524,0.112143
16914,13.616279,3.82093,190.27907,0.753953,100189.02,0.11
16915,13.639091,3.859091,189.840909,0.753182,100192.401364,0.107955
16916,13.679333,3.823333,189.066667,0.753778,100202.954,0.106222
16917,13.782609,3.796957,190.630435,0.752391,100206.658043,0.10413


In [539]:
len(list_of_dfs)

494

In [540]:
len(list_of_dfs[0])

14

In [559]:
def generateRollingAvgFeatures(df):
    ''' Flatten one data frame into a single list of normalized features.

    For every column except the last one, the values are split into
    consecutive, non-overlapping blocks of 3 rows (rows 0-2, 3-5, ...).
    Each complete block is averaged and the averages of a column are then
    divided by their sum, so that they add up to 1. A trailing block with
    fewer than 3 rows is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; its last column is skipped.

    Returns
    -------
    list of float
        The normalized block averages of all processed columns, concatenated
        in column order (len(df) // 3 values per column). When the block
        averages of a column sum to exactly zero the normalization is
        undefined; NaN is emitted for each of that column's values instead
        of raising ZeroDivisionError, so callers can impute them.
    '''
    all_lists = []
    # Iterate over every column but the last
    for column in df.columns[:-1]:
        values = df[column].tolist()
        # `end` is the last row of each complete 3-row block: 2, 5, 8, ...
        block_means = [
            (values[end] + values[end - 1] + values[end - 2]) / 3
            for end in range(2, len(values), 3)
        ]
        # Normalize the block means of this column so they express a pattern
        # rather than a magnitude. The sum is computed once, not per element.
        total = sum(block_means)
        if total == 0:
            all_lists += [float('nan')] * len(block_means)
        else:
            all_lists += [float(mean) / total for mean in block_means]
    return all_lists

In [560]:
# Build one flat feature list per event frame.
# list_of_lists[k] is computed from list_of_dfs[k], so it stays aligned with labels[k].
processing_data = list_of_dfs.copy()
# Counts the processed frames; not read anywhere in this cell
flag = 0
# NOTE(review): `df` is rebound here and again by the loop below, so the raw
# table loaded from the CSV is no longer reachable through `df` afterwards.
# The column-name cell further down reads `df.columns`, i.e. it relies on
# `df` being the last frame of this loop.
df = pd.DataFrame()
list_of_lists = []
for df in processing_data:
    # Always true: every frame is processed
    if (1):
        #label = getEventType()
        # `x` is only an alias of the current frame; not used afterwards in this cell
        x = df
        # TODO: Should this return a list that then becomes a row in the dataframe?
        # One flat list of features for the current event
        current = generateRollingAvgFeatures(df)
        list_of_lists.append(current)
        flag +=1
        # TODO: the original note here was left unfinished ("We need to add this df to the ...")

# Pass the DF
# For each DF run computations and generate columns -> Features

In [561]:
current

[0.2569755022352547,
 0.25388317807327515,
 0.24801345877409153,
 0.2411278609173785,
 0.23934891467808264,
 0.24982097751620877,
 0.2525179090777166,
 0.258312198727992,
 0.26254342002513426,
 0.25210624164103246,
 0.24498965797458183,
 0.2403606803592514,
 0.2462597474917572,
 0.25029457063990074,
 0.25147886778620504,
 0.25196681408213706,
 0.2500541389046858,
 0.24989746394743304,
 0.2499893135382105,
 0.25005908360967066]

In [562]:
len(current)

20

In [563]:
len(list_of_lists)

494

In [564]:
# Column names for the flattened features: '<feature>_<k>' for every feature
# column (all but the last) and each of the 4 block averages kept per column.
# The names are read from list_of_dfs[0] instead of the loop variable `df`
# left over from the feature-building cell, so this cell does not depend on
# which frame `df` happens to reference when it runs. All frames in
# list_of_dfs are built from the same column selection.
nums = range(4)
cols = [f'{v}_{i}' for v in list_of_dfs[0].columns[:-1] for i in nums]

In [565]:
# One row per event, one column per flattened feature.
# Replace NaNs by 0. fillna returns a new frame, so its result has to be
# assigned; calling it on its own only changes what is displayed and
# leaves processed_data untouched.
processed_data = pd.DataFrame(list_of_lists, columns=cols).fillna(0)
processed_data

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3
0,0.247059,0.248193,0.250545,0.254203,0.237609,0.249329,0.257427,0.255635,0.252496,0.249218,0.248034,0.250252,0.249370,0.250360,0.250548,0.249721,0.250108,0.249960,0.249945,0.249987
1,0.248389,0.250853,0.252399,0.248359,0.246724,0.250328,0.250599,0.252349,0.235229,0.246926,0.253674,0.264171,0.254025,0.249891,0.248971,0.247113,0.249972,0.250065,0.249990,0.249973
2,0.248714,0.249002,0.250939,0.251345,0.247649,0.247772,0.255132,0.249447,0.249055,0.249366,0.249887,0.251692,0.249090,0.250832,0.250991,0.249087,0.249993,0.250001,0.249968,0.250037
3,0.243703,0.251550,0.251349,0.253399,0.240673,0.248020,0.255639,0.255668,0.245685,0.247398,0.251042,0.255875,0.250275,0.250446,0.249941,0.249338,0.249964,0.249990,0.249988,0.250058
4,0.258789,0.254504,0.246219,0.240488,0.243934,0.245430,0.253919,0.256717,0.254875,0.253048,0.249837,0.242241,0.250310,0.250530,0.249444,0.249716,0.250090,0.250075,0.249949,0.249887
5,0.231652,0.240903,0.258666,0.268779,0.247090,0.247964,0.252977,0.251969,0.238568,0.244325,0.253356,0.263752,0.252222,0.252555,0.250511,0.244712,0.250000,0.249991,0.249950,0.250059
6,1.723502,0.491503,-0.562400,-0.652605,0.248844,0.252165,0.250756,0.248235,0.244923,0.249283,0.251675,0.254120,0.251294,0.249193,0.248779,0.250734,0.250080,0.250054,0.249965,0.249902
7,0.249758,0.249307,0.249591,0.251343,0.256248,0.248259,0.247485,0.248008,0.254356,0.253813,0.247564,0.244267,0.249186,0.248947,0.250115,0.251752,0.249786,0.249975,0.250126,0.250113
8,0.240292,0.241492,0.250167,0.268049,0.247432,0.255925,0.252438,0.244205,0.245143,0.244358,0.250825,0.259674,0.251473,0.251305,0.249400,0.247822,0.249992,0.250023,0.250016,0.249970
9,0.251453,0.250905,0.249471,0.248171,0.241662,0.246976,0.251116,0.260246,0.250305,0.248481,0.250527,0.250687,0.251416,0.253311,0.249338,0.245934,0.250133,0.250084,0.249942,0.249840


In [566]:
len(processed_data)

494

In [567]:
# Class balance: labels equal to 0 count as zeros, every other label as ones.
zeros = sum(1 for outcome in labels if outcome == 0)
ones = len(labels) - zeros

print(ones)
print(zeros)

296
198


In [568]:
# Attach the outcome of each event. labels and the rows of processed_data are
# both built in the iteration order of event_ids, so they line up by position.
# NOTE: this mutates processed_data in place; later cells see the extra column.
processed_data['label'] = labels

In [569]:
# Shuffle the rows. A fixed seed keeps the shuffle, and therefore the
# train/test split and the scores derived from it, reproducible across runs.
processed_data_shuffled = processed_data.sample(frac=1, random_state=0)

In [570]:
processed_data_shuffled

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3,label
45,0.300411,0.267761,0.229212,0.202617,0.248536,0.250392,0.249031,0.252041,0.254800,0.250260,0.246737,0.248204,0.250014,0.250488,0.250385,0.249113,0.249821,0.249921,0.250069,0.250189,0.0
276,0.244851,0.249876,0.250053,0.255219,0.246089,0.249583,0.250982,0.253345,0.245632,0.247601,0.252413,0.254354,0.251993,0.250671,0.249204,0.248132,0.250232,0.250019,0.249972,0.249778,1.0
355,0.236444,0.245770,0.255741,0.262045,0.252074,0.252107,0.250906,0.244913,0.248373,0.249373,0.249952,0.252302,0.248894,0.248128,0.251327,0.251651,0.250091,0.250070,0.249961,0.249878,1.0
182,0.251805,0.250492,0.249393,0.248310,0.252602,0.251122,0.244897,0.251379,0.254011,0.250919,0.249184,0.245886,0.243792,0.249618,0.252159,0.254430,0.249956,0.249980,0.250031,0.250033,0.0
58,0.250618,0.251261,0.248755,0.249365,0.248874,0.245581,0.252100,0.253445,0.249199,0.249354,0.251363,0.250084,0.249748,0.250498,0.249584,0.250170,0.249938,0.249937,0.250063,0.250061,0.0
48,0.246739,0.249234,0.250456,0.253571,0.252806,0.250295,0.246268,0.250631,0.248743,0.249839,0.250025,0.251394,0.250603,0.249589,0.250726,0.249081,0.249948,0.249962,0.250025,0.250065,1.0
222,0.240833,0.243319,0.252948,0.262900,0.252771,0.252547,0.250442,0.244239,0.248276,0.247014,0.249942,0.254768,0.245246,0.248860,0.252425,0.253469,0.249975,0.249990,0.249998,0.250038,1.0
413,0.242490,0.246555,0.252789,0.258166,0.256646,0.249250,0.245650,0.248454,0.244875,0.246032,0.251420,0.257673,0.250649,0.251452,0.249726,0.248173,0.250013,0.250024,0.250020,0.249944,1.0
12,0.261560,0.253984,0.244316,0.240140,0.245901,0.243359,0.251950,0.258789,0.252959,0.251822,0.249746,0.245473,0.253596,0.251510,0.248090,0.246804,0.249893,0.249945,0.250076,0.250086,0.0
160,0.247804,0.250267,0.250805,0.251124,0.255373,0.252798,0.247813,0.244016,0.248608,0.251771,0.249636,0.249985,0.251745,0.247883,0.249194,0.251178,0.249985,0.249939,0.250023,0.250054,0.0


## Test Model

### Logistic Regression, no normalization

In [586]:
# Features are every column but 'label'; the target is the 'label' column.
X = processed_data_shuffled.drop(columns='label')
y = processed_data_shuffled['label']
# Hold out 5% of the rows for testing, with a fixed seed for the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

In [587]:
# Fit logistic regression on the un-normalized features (fit returns the estimator).
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train, y_train)
# Class predictions for the held-out rows
predictions = logisticRegr.predict(x_test)
# Mean accuracy on the held-out rows
score = logisticRegr.score(x_test, y_test)
print("Model Accuracy: ", score)

Model Accuracy:  0.68


In [588]:
# Fitted coefficients of the logistic regression (the model was fit on x_train,
# so there is presumably one weight per feature column, in X's column order - confirm).
print("regression coefficients: \n", logisticRegr.coef_)

regression coefficients: 
 [[-0.50835255 -0.16189889  0.15687956  0.51335472 -0.0328907   0.03017016
   0.01811818 -0.01541479 -0.41344248 -0.12599665  0.17830204  0.36111994
   0.01347084  0.01233384 -0.01618112 -0.0096407   0.00482674  0.00105364
  -0.00257058 -0.00332696]]


### Naive Bayes, no normalization

In [574]:
# Fit Gaussian Naive Bayes on the same split (fit returns the estimator).
gnb = GaussianNB().fit(x_train, y_train)
# Class predictions for the held-out rows
predictions = gnb.predict(x_test)
# Mean accuracy on the held-out rows
score = gnb.score(x_test, y_test)
print("Model Accuracy: ", score)

Model Accuracy:  0.68


## Helper Function: Computing Features from Ratios

In [575]:
# Builds ratio features: takes two column-name fragments and a dataframe, and
# returns a new dataframe holding the element-wise ratios of the matching columns.

def create_ratio_columns(numerator, denominator, df):
    ''' Compute element-wise ratios between two groups of columns of df.

    Columns are selected by substring match on their names (df.filter(like=...))
    and paired by position: the i-th numerator column is divided by the i-th
    denominator column. Denominator columns beyond the number of numerator
    columns are ignored. df itself is not modified.

    Parameters
    ----------
    numerator : str
        Fragment contained in the names of the numerator columns.
    denominator : str
        Fragment contained in the names of the denominator columns.
    df : pandas.DataFrame
        Frame holding both groups of columns.

    Returns
    -------
    pandas.DataFrame
        One column per numerator column, named '{numerator}/{denominator}:{i}'.

    Raises
    ------
    ValueError
        If no column name contains `numerator`.
    IndexError
        If fewer columns match `denominator` than `numerator` (raised by the
        positional lookup).
    '''
    # Filter based on names
    numerator_cols = df.filter(like=numerator)
    denominator_cols = df.filter(like=denominator)
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" when nothing matches the numerator.
    if numerator_cols.shape[1] == 0:
        raise ValueError(
            f"no column name contains {numerator!r}; cannot build ratio columns"
        )
    # One ratio Series per numerator column, paired by position
    ratios = []
    for i in range(numerator_cols.shape[1]):
        ratio = numerator_cols.iloc[:, i].div(denominator_cols.iloc[:, i])
        ratio.name = f'{numerator}/{denominator}:{i}'
        ratios.append(ratio)

    return pd.concat(ratios, axis=1)
    

In [576]:
# Ratio features: 'temp' and 'wind' are name fragments, matched as substrings,
# so they select the temperature_* and wind_speed_* columns of processed_data.
ratio_df = create_ratio_columns('temp', 'wind', processed_data)
ratio_df.head()

Unnamed: 0,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3
0,1.039774,0.995442,0.973266,0.994398
1,1.00675,1.002097,1.007181,0.984189
2,1.004302,1.004965,0.983565,1.007607
3,1.012586,1.014232,0.983219,0.991125
4,1.0609,1.036971,0.969675,0.936781


In [577]:
# Append the ratio columns to the feature table, side by side (axis=1).
# processed_data already carries the 'label' column at this point, so the
# ratio columns end up after it.
new = pd.concat([processed_data, ratio_df], axis=1)

In [578]:
new

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3,label,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3
0,0.247059,0.248193,0.250545,0.254203,0.237609,0.249329,0.257427,0.255635,0.252496,0.249218,0.248034,0.250252,0.249370,0.250360,0.250548,0.249721,0.250108,0.249960,0.249945,0.249987,0.0,1.039774,0.995442,0.973266,0.994398
1,0.248389,0.250853,0.252399,0.248359,0.246724,0.250328,0.250599,0.252349,0.235229,0.246926,0.253674,0.264171,0.254025,0.249891,0.248971,0.247113,0.249972,0.250065,0.249990,0.249973,0.0,1.006750,1.002097,1.007181,0.984189
2,0.248714,0.249002,0.250939,0.251345,0.247649,0.247772,0.255132,0.249447,0.249055,0.249366,0.249887,0.251692,0.249090,0.250832,0.250991,0.249087,0.249993,0.250001,0.249968,0.250037,0.0,1.004302,1.004965,0.983565,1.007607
3,0.243703,0.251550,0.251349,0.253399,0.240673,0.248020,0.255639,0.255668,0.245685,0.247398,0.251042,0.255875,0.250275,0.250446,0.249941,0.249338,0.249964,0.249990,0.249988,0.250058,1.0,1.012586,1.014232,0.983219,0.991125
4,0.258789,0.254504,0.246219,0.240488,0.243934,0.245430,0.253919,0.256717,0.254875,0.253048,0.249837,0.242241,0.250310,0.250530,0.249444,0.249716,0.250090,0.250075,0.249949,0.249887,0.0,1.060900,1.036971,0.969675,0.936781
5,0.231652,0.240903,0.258666,0.268779,0.247090,0.247964,0.252977,0.251969,0.238568,0.244325,0.253356,0.263752,0.252222,0.252555,0.250511,0.244712,0.250000,0.249991,0.249950,0.250059,0.0,0.937520,0.971523,1.022489,1.066715
6,1.723502,0.491503,-0.562400,-0.652605,0.248844,0.252165,0.250756,0.248235,0.244923,0.249283,0.251675,0.254120,0.251294,0.249193,0.248779,0.250734,0.250080,0.250054,0.249965,0.249902,0.0,6.926024,1.949130,-2.242821,-2.628983
7,0.249758,0.249307,0.249591,0.251343,0.256248,0.248259,0.247485,0.248008,0.254356,0.253813,0.247564,0.244267,0.249186,0.248947,0.250115,0.251752,0.249786,0.249975,0.250126,0.250113,0.0,0.974672,1.004224,1.008513,1.013446
8,0.240292,0.241492,0.250167,0.268049,0.247432,0.255925,0.252438,0.244205,0.245143,0.244358,0.250825,0.259674,0.251473,0.251305,0.249400,0.247822,0.249992,0.250023,0.250016,0.249970,0.0,0.971146,0.943603,0.991003,1.097639
9,0.251453,0.250905,0.249471,0.248171,0.241662,0.246976,0.251116,0.260246,0.250305,0.248481,0.250527,0.250687,0.251416,0.253311,0.249338,0.245934,0.250133,0.250084,0.249942,0.249840,0.0,1.040515,1.015907,0.993450,0.953602


In [341]:
## Plot data
## Take any 2 columns and plot how they relate to each other in a line
def create_line(df_column1, df_column2):
    ''' Placeholder for a line plot relating two data frame columns.

        Not implemented yet: the body consists of this docstring only, so
        calling the function does nothing and returns None.
        The stated goal is to visualize feature behavior for further
        exploration.
    '''
    
    # TODO: Find how windspeed and humidity grow together with time.

In [346]:
## Preparing Model

 ## Find 10 Non Tornadoes 

In [580]:
# Keep only the rows labelled 0 (the non-tornado class)
is_non_tornado = processed_data_shuffled['label'] == 0
reduced_dataset = processed_data_shuffled[is_non_tornado]

In [589]:
# Same split recipe as before, applied to the label-0 subset only.
# This rebinds X, y and the four split variables used by the earlier model cells.
X = reduced_dataset.drop(columns='label')
y = reduced_dataset['label']
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

In [590]:
# Run the already-fitted logistic regression on held-out rows of the label-0
# subset: every true label here is 0, so each predicted 1 is a false positive.
# NOTE(review): the model was fitted on a split of the full shuffled data, and
# this subset is drawn from that same data, so some of these rows may have
# been part of its training set - confirm before reading this as a test result.
predictions = logisticRegr.predict(x_test)

In [591]:
predictions

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## Drawing Board

In [342]:
# Scratch frame for the small experiments below (filled column by column,
# then used to look at what expanding().mean() returns).
dfb = pd.DataFrame()

In [52]:
type(dfb)

pandas.core.frame.DataFrame

In [53]:
dfb['b'] = [0, 1, 2, 3, 4]

In [54]:
dfb['c'] = [0, 1, 2, 3, 4]

In [55]:
dfb

Unnamed: 0,b,c
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [56]:
dfb.expanding().mean()

Unnamed: 0,b,c
0,0.0,0.0
1,0.5,0.5
2,1.0,1.0
3,1.5,1.5
4,2.0,2.0
