In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [5]:

FILE_NAME = "all_new_data.csv" #CSV file that contains the data (loaded with pd.read_csv below)
NUMBER_OF_ROWS = 296 # passed to pandas' display.max_rows option below

## Processing Random Weather Data and Tornado Data

In [6]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', NUMBER_OF_ROWS)
pd.set_option('display.min_rows', 20)

In [7]:
df = pd.read_csv(FILE_NAME)

In [8]:
df.columns

Index(['Unnamed: 0', 'datetime', 'temperature', 'wind_speed',
       'surface_solar_radiation', 'relative_humidity', 'surface_pressure',
       'total_precipitation', 'latitude', 'longitude', 'event_id', 'outcome',
       'city'],
      dtype='object')

In [9]:
# Disabled: df = data.drop(columns=['Unnamed: 0'])  -- that column is dropped per event further down instead
# Get the set of distinct event ids present in the data
event_ids = set(df['event_id'].to_numpy())

In [10]:
## Produces a list of dfs
# Each df holds the expanding (cumulative) means of the fundamental features
# of one event; labels[k] is the outcome recorded for list_of_dfs[k].

# Columns that are identifiers / targets rather than weather features
NON_FEATURE_COLUMNS = ['latitude', 'longitude', 'event_id', 'outcome', 'Unnamed: 0']
# Window selection: keep the last TAIL_ROWS rows of each event, then the first
# HEAD_ROWS of those (the original notes describe this as "the last 3 weeks"
# minus the final week -- assumes one row per day, TODO confirm).
TAIL_ROWS = 21
HEAD_ROWS = 14

labels = []
list_of_dfs = []
# Use IDs to loop over events
for event_id in event_ids:
    # Get data for that event
    event = df.loc[df['event_id'] == event_id]
    # Record the label of the event (0 or 1), taken from its first row
    labels.append(event['outcome'].iloc[0])
    # We only want the numeric features we are processing.  Non-numeric
    # columns (e.g. 'datetime', 'city') must be removed explicitly: older
    # pandas dropped them silently inside expanding().mean(), pandas 2.x
    # raises instead.
    fundamental_features = (
        event
        .drop(columns=NON_FEATURE_COLUMNS)
        .select_dtypes(include='number')
    )
    # Expanding mean: row k is the mean of rows 0..k of this event
    event_df = fundamental_features.expanding().mean()
    # Keep only the selected window of the event's history
    event_df = event_df.tail(TAIL_ROWS).head(HEAD_ROWS)
    # Add to list
    list_of_dfs.append(event_df)

In [11]:
list_of_dfs[0]

Unnamed: 0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation
16908,13.474054,3.458378,193.351351,0.75,100303.218649,0.097838
16909,13.443421,3.545,192.947368,0.751053,100246.238158,0.114474
16910,13.447692,3.614359,193.128205,0.749231,100230.75,0.114103
16911,13.47275,3.68625,191.15,0.75225,100209.90675,0.11475
16912,13.509512,3.7,191.634146,0.752439,100197.407317,0.112927
16913,13.568095,3.755238,189.119048,0.754524,100194.999524,0.112143
16914,13.616279,3.82093,190.27907,0.753953,100189.02,0.11
16915,13.639091,3.859091,189.840909,0.753182,100192.401364,0.107955
16916,13.679333,3.823333,189.066667,0.753778,100202.954,0.106222
16917,13.782609,3.796957,190.630435,0.752391,100206.658043,0.10413


In [12]:
len(list_of_dfs)

494

In [13]:
len(list_of_dfs[0])

14

In [79]:
def generateRollingAvgFeatures(df, window=3):
    ''' Turns a whole data frame into one flat row of normalized block-average features.

    For every column except the last one, the values are split into
    consecutive, non-overlapping blocks of `window` rows starting at the
    first row (leftover rows at the end are ignored).  Each block is
    averaged, and the block averages of that column are then z-scored
    (mean 0, population standard deviation 1) so that only the shape of the
    trend is kept.  The per-column results are concatenated in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns.  The last column is skipped.
    window : int, default 3
        Number of consecutive rows averaged into one feature point.

    Returns
    -------
    list of float
        ``len(df) // window`` values for each processed column.  A column
        whose block averages are all identical contributes zeros (instead of
        the NaNs that dividing by a zero standard deviation would produce).

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    '''
    if window < 1:
        raise ValueError("window must be a positive integer")

    all_lists = []
    # Iterate over every column but the last
    for column in df.columns[:-1]:
        values = df[column].to_numpy(dtype=float)
        n_blocks = len(values) // window
        if n_blocks == 0:
            # Not enough rows for even one block: the column adds no features
            continue
        # One mean per complete block of `window` rows
        block_means = values[:n_blocks * window].reshape(n_blocks, window).mean(axis=1)

        # Here we normalize the block means of this column to set up a pattern
        centered = block_means - block_means.mean()
        std = block_means.std()
        if std == 0:
            # Constant series: there is no trend to encode, avoid 0/0 -> NaN
            norm = np.zeros(n_blocks)
        else:
            norm = centered / std
        all_lists += norm.tolist()
    return all_lists

In [80]:
# Turn every per-event frame into one flat list of features;
# each list later becomes one row of the processed data frame.
processing_data = list_of_dfs.copy()
list_of_lists = []
# NOTE(review): the loop variable is still called `df` (it rebinds the
# notebook-level name of the raw data) and `current` stays bound to the last
# feature row, because the cells that follow read both names.
for df in processing_data:
    # This returns a row of features for the event
    current = generateRollingAvgFeatures(df)
    list_of_lists.append(current)

In [81]:
current

[1.1530232174239068,
 0.641874136782381,
 -0.3283674893351349,
 -1.4665298648711642,
 -1.549873229198926,
 -0.02605012979116434,
 0.3663889411419507,
 1.2095344178481395,
 1.499778190295632,
 0.25183684117913646,
 -0.599071200724095,
 -1.152543830750685,
 -1.667212800281316,
 0.1313044882256524,
 0.6592034355036444,
 0.876704876552038,
 0.829240730109786,
 -1.5705354880495443,
 -0.16368357335718117,
 0.9049783312969394]

In [82]:
len(current)

20

In [83]:
len(list_of_lists)

494

In [84]:
# Column names "<feature>_<k>", in the order the feature rows were built:
# every column of the per-event frames except the last, each followed by its
# block averages in sequence.
feature_names = list_of_dfs[0].columns[:-1]
# Number of block averages per feature, derived from the rows themselves
nums = range(max((len(row) for row in list_of_lists), default=0) // len(feature_names))
cols = [f'{v}_{i}' for v in feature_names for i in nums]

In [85]:
# One row per event, one column per (feature, block) pair.
# Replace NaNs by 0.  fillna returns a new frame, so the result has to be
# kept -- calling it without assignment leaves processed_data unchanged.
processed_data = pd.DataFrame(list_of_lists, columns=cols).fillna(0)
processed_data

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3
0,-1.076066,-0.661297,0.199521,1.537842,-1.596692,-0.086413,0.957058,0.726047,1.521186,-0.476660,-1.198011,0.153484,-1.324223,0.756978,1.153259,-0.586014,1.683832,-0.624051,-0.856771,-0.203009
1,0.939129,-0.497244,-1.398473,0.956587,-1.602543,0.160400,0.293146,1.148997,-1.405262,-0.292440,0.349540,1.348162,1.590765,-0.042888,-0.406741,-1.141136,-0.724939,1.703510,-0.272817,-0.705754
2,-1.112887,-0.863835,0.812928,1.163795,-0.771625,-0.731295,1.684310,-0.181391,-0.925666,-0.620768,-0.110541,1.656976,-0.996492,0.911028,1.085176,-0.999713,-0.265811,0.057275,-1.292930,1.501465
3,-1.691682,0.416287,0.362380,0.913015,-1.499096,-0.318268,0.906362,0.911002,-1.104986,-0.666424,0.266861,1.504550,0.648743,1.054297,-0.138281,-1.564759,-1.012213,-0.298143,-0.351803,1.662159
4,1.235848,0.633330,-0.531690,-1.337489,-1.116156,-0.840759,0.721006,1.235909,1.009420,0.631141,-0.033828,-1.606732,0.708987,1.209636,-1.270408,-0.648215,1.054008,0.875013,-0.601978,-1.327042
5,-1.260756,-0.625114,0.595463,1.290407,-1.155826,-0.808791,1.182384,0.782233,-1.199623,-0.595534,0.352167,1.442990,0.705484,0.811097,0.162145,-1.678726,-0.004002,-0.223275,-1.287130,1.514407
6,1.531201,0.250959,-0.844212,-0.937949,-0.741676,1.389647,0.484984,-1.132955,-1.496036,-0.211382,0.493461,1.213957,1.239936,-0.773545,-1.170059,0.703668,1.123620,0.759341,-0.500965,-1.381996
7,-0.305406,-0.874564,-0.515840,1.695811,1.726882,-0.481264,-0.695195,-0.550424,1.024580,0.896696,-0.573002,-1.348274,-0.738983,-0.955892,0.104513,1.590362,-1.562138,-0.182797,0.916693,0.828242
8,-0.874920,-0.766832,0.015019,1.626733,-0.569896,1.314869,0.540981,-1.285954,-0.793940,-0.922302,0.134838,1.581404,0.983536,0.870988,-0.400759,-1.453764,-0.402239,1.099559,0.743596,-1.440916
9,1.135168,0.706697,-0.413123,-1.428742,-1.226381,-0.444814,0.164149,1.507045,0.344067,-1.711763,0.593879,0.773816,0.517655,1.210312,-0.241792,-1.486175,1.149726,0.728479,-0.501141,-1.377064


In [86]:
len(processed_data)

494

In [87]:
# Class balance: a label equal to 0 counts as a zero, anything else as a one
zeros = sum(1 for label_value in labels if label_value == 0)
ones = len(labels) - zeros

print(ones)
print(zeros)

296
198


In [88]:
processed_data['label'] = labels

In [89]:
# Shuffle the rows; a fixed seed keeps the shuffle (and every split and score
# computed from it) reproducible across kernel restarts.
processed_data_shuffled = processed_data.sample(frac=1, random_state=0)

In [90]:
processed_data_shuffled

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3,label
269,-1.639534,0.019950,0.808313,0.811272,-1.235871,-0.605752,0.468099,1.373525,0.410965,1.437234,-0.754473,-1.093726,-0.420027,-1.292428,0.270157,1.442298,1.379541,0.527414,-0.968715,-0.938240,1.0
229,-1.316335,0.390438,1.381147,-0.455251,-0.087975,0.935546,0.750486,-1.598057,0.085311,-1.208425,-0.413568,1.536682,-1.010388,1.578376,0.120073,-0.688060,1.404915,0.482911,-1.018298,-0.869528,1.0
338,0.311189,-1.325857,-0.395574,1.410243,-0.910509,-0.992839,0.519309,1.384038,-0.604682,-1.169902,0.299450,1.475134,0.040220,1.488594,-0.210406,-1.318408,1.562695,0.156254,-0.691881,-1.027068,1.0
286,0.993672,1.004396,-0.937121,-1.060947,-1.668808,0.993450,0.331204,0.344153,1.147725,0.201928,0.255541,-1.605195,-0.805011,1.129868,-1.168060,0.843203,1.589191,-0.523913,0.029757,-1.095035,1.0
207,-1.686060,0.192456,0.698130,0.795474,-1.643651,0.913517,0.679193,0.050941,-1.370989,-0.424924,0.493497,1.302416,-1.604653,0.484212,1.090755,0.029686,1.290975,0.547617,-0.504995,-1.333596,1.0
109,-1.553451,0.061869,0.260738,1.230844,1.427168,0.297434,-0.422075,-1.302527,-1.660927,0.119232,0.631752,0.909942,1.450154,0.387176,-0.746533,-1.090796,-1.604654,0.571432,1.048014,-0.014793,1.0
234,1.430898,0.281635,-0.405253,-1.307280,1.338369,0.120730,-1.481117,0.022018,1.471970,0.334100,-0.690035,-1.116035,0.837218,0.656216,0.189573,-1.683006,1.047945,0.855391,-1.375207,-0.528130,1.0
319,-1.646015,0.085720,0.598453,0.961842,-1.367732,-0.273741,0.226111,1.415362,-1.378241,-0.353591,0.378163,1.353669,0.861413,0.902524,-0.215754,-1.548182,1.339762,0.470693,-0.490131,-1.320324,1.0
221,-1.022854,-0.734508,0.219096,1.538267,-1.419471,-0.334533,0.467780,1.286224,-1.570707,-0.091382,0.563408,1.098681,1.460645,0.261676,-0.464393,-1.257927,-1.371210,-0.478595,0.625037,1.224768,1.0
32,1.257966,0.486103,-0.297432,-1.446637,-1.327750,-0.604327,0.914035,1.018042,1.611242,0.026876,-0.643754,-0.994363,-1.487253,0.431197,-0.194647,1.250704,1.517578,0.273868,-0.802678,-0.988768,0.0


## Test Model

### Logistic Regression, no normalization

In [91]:
# Separate the features from the target column, then hold out 5% of the rows for testing
X = processed_data_shuffled.drop(columns='label')
y = processed_data_shuffled['label']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [92]:
# Fit the classifier on the training split, then predict and score on the held-out split
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train, y_train)
predictions = logisticRegr.predict(x_test)
score = logisticRegr.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.8


In [93]:
print("regression coefficients: \n", logisticRegr.coef_)

regression coefficients: 
 [[-0.05464849 -0.10043899  0.07184241  0.08324507 -0.13910814  0.16882574
  -0.00244953 -0.02726807 -0.48934405  0.10434709  0.58619384 -0.20119688
  -0.20445029  0.17614125  0.10885707 -0.08054804  0.27717856 -0.00569029
  -0.39756355  0.12607529]]


### Naive Bayes, no normalization

In [94]:
# Same train/test split as above, with a Gaussian naive Bayes classifier
gnb = GaussianNB().fit(x_train, y_train)
predictions = gnb.predict(x_test)
score = gnb.score(x_test, y_test)
print("Model Accuracy: ",score)

Model Accuracy:  0.8


## Helper Function: Computing Features from Ratios

In [95]:
# Takes two column-name fragments and a dataframe, and returns a new dataframe holding the element-wise ratios of the matching columns

def create_ratio_columns(numerator, denominator, df):
    ''' Build element-wise ratio columns between two families of columns.

        numerator, denominator are substrings of column names: every column
            of df whose name contains the substring belongs to that family
            (selection is done with DataFrame.filter(like=...)).
        df is the df where the columns are; it is not modified.

        The i-th numerator column is divided by the i-th denominator column
        (position within each family, in df's column order) and the result
        is named '<numerator>/<denominator>:<i>'.  Extra denominator columns
        are ignored.  Division follows pandas semantics, so a zero
        denominator yields inf (NaN for 0/0).

        Returns a new DataFrame with one ratio column per numerator column
        and the same index as df; it has no columns when nothing matches
        numerator.

        Raises IndexError if fewer columns match denominator than numerator.
    '''
    # filter based on names
    f1 = df.filter(like=numerator)
    f2 = df.filter(like=denominator)

    n_ratios = f1.shape[1]
    if f2.shape[1] < n_ratios:
        raise IndexError(
            f'{n_ratios} column(s) match {numerator!r} but only '
            f'{f2.shape[1]} match {denominator!r}'
        )
    if n_ratios == 0:
        # pd.concat refuses an empty list; an empty frame is the natural result
        return pd.DataFrame(index=df.index)

    # Pair the two families by position and name each ratio after its position
    ratios = [
        f1.iloc[:, i].div(f2.iloc[:, i]).rename(f'{numerator}/{denominator}:{i}')
        for i in range(n_ratios)
    ]
    return pd.concat(ratios, axis=1)
    

In [96]:
ratio_df = create_ratio_columns('temp', 'wind', processed_data)
ratio_df.head()

Unnamed: 0,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3
0,0.673935,7.652709,0.208473,2.118102
1,-0.586024,-3.100023,-4.770567,0.832541
2,1.442265,1.181241,0.482647,-6.415955
3,1.128468,-1.307975,0.399818,1.002209
4,-1.107237,-0.753284,-0.737427,-1.082191


In [97]:
new = pd.concat([processed_data, ratio_df], axis=1)

In [98]:
new

Unnamed: 0,temperature_0,temperature_1,temperature_2,temperature_3,wind_speed_0,wind_speed_1,wind_speed_2,wind_speed_3,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,surface_solar_radiation_3,relative_humidity_0,relative_humidity_1,relative_humidity_2,relative_humidity_3,surface_pressure_0,surface_pressure_1,surface_pressure_2,surface_pressure_3,label,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3
0,-1.076066,-0.661297,0.199521,1.537842,-1.596692,-0.086413,0.957058,0.726047,1.521186,-0.476660,-1.198011,0.153484,-1.324223,0.756978,1.153259,-0.586014,1.683832,-0.624051,-0.856771,-0.203009,0.0,0.673935,7.652709,0.208473,2.118102
1,0.939129,-0.497244,-1.398473,0.956587,-1.602543,0.160400,0.293146,1.148997,-1.405262,-0.292440,0.349540,1.348162,1.590765,-0.042888,-0.406741,-1.141136,-0.724939,1.703510,-0.272817,-0.705754,0.0,-0.586024,-3.100023,-4.770567,0.832541
2,-1.112887,-0.863835,0.812928,1.163795,-0.771625,-0.731295,1.684310,-0.181391,-0.925666,-0.620768,-0.110541,1.656976,-0.996492,0.911028,1.085176,-0.999713,-0.265811,0.057275,-1.292930,1.501465,0.0,1.442265,1.181241,0.482647,-6.415955
3,-1.691682,0.416287,0.362380,0.913015,-1.499096,-0.318268,0.906362,0.911002,-1.104986,-0.666424,0.266861,1.504550,0.648743,1.054297,-0.138281,-1.564759,-1.012213,-0.298143,-0.351803,1.662159,1.0,1.128468,-1.307975,0.399818,1.002209
4,1.235848,0.633330,-0.531690,-1.337489,-1.116156,-0.840759,0.721006,1.235909,1.009420,0.631141,-0.033828,-1.606732,0.708987,1.209636,-1.270408,-0.648215,1.054008,0.875013,-0.601978,-1.327042,0.0,-1.107237,-0.753284,-0.737427,-1.082191
5,-1.260756,-0.625114,0.595463,1.290407,-1.155826,-0.808791,1.182384,0.782233,-1.199623,-0.595534,0.352167,1.442990,0.705484,0.811097,0.162145,-1.678726,-0.004002,-0.223275,-1.287130,1.514407,0.0,1.090783,0.772899,0.503612,1.649645
6,1.531201,0.250959,-0.844212,-0.937949,-0.741676,1.389647,0.484984,-1.132955,-1.496036,-0.211382,0.493461,1.213957,1.239936,-0.773545,-1.170059,0.703668,1.123620,0.759341,-0.500965,-1.381996,0.0,-2.064514,0.180592,-1.740701,0.827878
7,-0.305406,-0.874564,-0.515840,1.695811,1.726882,-0.481264,-0.695195,-0.550424,1.024580,0.896696,-0.573002,-1.348274,-0.738983,-0.955892,0.104513,1.590362,-1.562138,-0.182797,0.916693,0.828242,0.0,-0.176854,1.817223,0.742008,-3.080919
8,-0.874920,-0.766832,0.015019,1.626733,-0.569896,1.314869,0.540981,-1.285954,-0.793940,-0.922302,0.134838,1.581404,0.983536,0.870988,-0.400759,-1.453764,-0.402239,1.099559,0.743596,-1.440916,0.0,1.535228,-0.583200,0.027762,-1.265001
9,1.135168,0.706697,-0.413123,-1.428742,-1.226381,-0.444814,0.164149,1.507045,0.344067,-1.711763,0.593879,0.773816,0.517655,1.210312,-0.241792,-1.486175,1.149726,0.728479,-0.501141,-1.377064,0.0,-0.925624,-1.588749,-2.516749,-0.948042


In [99]:
## Plot data
## Take any 2 columns and plot how they relate to each other in a line
def create_line(df_column1, df_column2):
    ''' Placeholder for a line plot relating two dataframe columns.

    Intended to help visualize the behavior of the features for further
    exploration.
    NOTE(review): not implemented yet -- the body consists of this docstring
    only, so calling the function does nothing and returns None.
    '''
    
    # TODO: find how wind speed and humidity grow together with time.

In [100]:
## Preparing Model

 ## Find 10 Non Tornadoes 

In [101]:
reduced_dataset = processed_data_shuffled[processed_data_shuffled['label']== 0]

In [102]:
# Split the label-0 subset with the same settings as the full data set.
# NOTE(review): this rebinds x_train / x_test / y_train / y_test from the
# earlier split, and reduced_dataset is a subset of the frame the classifier
# was fitted on, so most of these "test" rows were probably seen in training.
X = reduced_dataset.drop(columns='label')
y = reduced_dataset['label']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [103]:
predictions = logisticRegr.predict(x_test)

In [104]:
predictions

array([1., 0., 0., 1., 1., 0., 0., 0., 1., 1.])

## Drawing Board

In [105]:
dfb = pd.DataFrame()

In [41]:
type(dfb)

pandas.core.frame.DataFrame

In [42]:
dfb['b'] = [0, 1, 2, 3, 4]

In [43]:
dfb['c'] = [0, 1, 2, 3, 4]

In [44]:
dfb

Unnamed: 0,b,c
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [45]:
dfb.expanding().mean()

Unnamed: 0,b,c
0,0.0,0.0
1,0.5,0.5
2,1.0,1.0
3,1.5,1.5
4,2.0,2.0


In [76]:
x = [0, 1, 2, 3, 4]

In [78]:
(np.average(x))

2.0