In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [41]:
NUMBER_TORNADOS = 111 #Number of Tornado Events
NUMBER_RANDOM = 110 #Number of Random Events
NUMBER_DAYS = 14 #Number of days of data per Tornado/Event (Max: 57)
FILE_NAME = "historical_data_2.csv" #CSV file that contains the data
NUMBER_OF_ROWS = 296

In [42]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', NUMBER_OF_ROWS)
pd.set_option('display.min_rows', 20)

In [3]:
#Field 0: 'datatime'
#Field 1: 'temperature'
#Field 2: 'windspeed'
#Field 3: 'surface solar radiation' -Alvaro
#Field 4: 'relative humidity' -Abdullah
#Field 5: 'surface pressure' -Frazier
#Field 6: 'total precipitation' -Simon
#Field 7: 'city'
#Field 8: 'event_id'
#Field 9: 'latitude'
#Field 10: 'longitude'
#Field 11: 'outcome'

df = pd.read_csv(FILE_NAME)
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [4]:
df 

Unnamed: 0,datetime,temperature (degC),wind_speed (m/s),surface_solar_radiation (W/m^2),relative_humidity (0-1),surface_pressure (Pa),total_precipitation (mm of water equivalent),latitude,longitude,event_id
0,2009-11-25,9.91,2.76,160,0.47,101257.58,0.00,32.4869,-94.1689,203448
1,2009-11-26,9.35,2.11,145,0.60,101181.89,0.00,32.4869,-94.1689,203448
2,2009-11-27,13.11,3.30,111,0.63,100649.88,0.00,32.4869,-94.1689,203448
3,2009-11-28,16.85,3.27,71,0.83,100267.40,0.26,32.4869,-94.1689,203448
4,2009-11-29,12.53,3.04,98,0.79,100664.11,0.92,32.4869,-94.1689,203448
...,...,...,...,...,...,...,...,...,...,...
16867,2017-02-24,4.32,5.83,187,0.49,99717.88,0.00,37.9392,-89.1503,677492
16868,2017-02-25,2.52,1.58,155,0.46,100450.42,0.00,37.9392,-89.1503,677492
16869,2017-02-26,9.50,2.33,183,0.61,100341.69,0.04,37.9392,-89.1503,677492
16870,2017-02-27,15.56,4.96,54,0.77,99730.39,0.01,37.9392,-89.1503,677492


In [5]:
#df = data.drop(columns=['Unnamed: 0'])
# Get a seto of the event Id's
event_ids = set(df['event_id'].to_numpy())

In [6]:
## Produces a list of dfs
# Each df has the rolling means of the fundamental features
list_of_dfs = []
#Use IDs to loop over events 
for i in event_ids:
    # We initialize a df per event
    event_df = pd.DataFrame()
    # Get data for that event 
    event = df.loc[df['event_id'] == i]
    # Save the dates
    temp = event['datetime']
    # We only want the features we are processing
    fundamental_features = event.drop(columns=['datetime', 'latitude', 'longitude', 'event_id'	], axis = 1)
    # Compute the rolling mean
    event_df = fundamental_features.expanding().mean() 
    # Put the date
    event_df['Date'] = temp 
    # We want 3 weeks of data
    event_df = event_df.tail(21)
    # Add to list
    list_of_dfs.append(event_df)

In [7]:
len(list_of_dfs)
len(list_of_dfs[0])

21

In [8]:
list_of_dfs[0]

Unnamed: 0,temperature (degC),wind_speed (m/s),surface_solar_radiation (W/m^2),relative_humidity (0-1),surface_pressure (Pa),total_precipitation (mm of water equivalent),Date
16566,15.615946,3.614595,96.783784,0.821351,102196.950811,0.167027,2017-01-18
16567,15.732105,3.584737,96.842105,0.823684,102163.660789,0.164737,2017-01-19
16568,15.856923,3.569231,96.102564,0.825385,102122.925385,0.197436,2017-01-20
16569,15.94225,3.595,97.225,0.8235,102066.2055,0.19875,2017-01-21
16570,15.934634,3.692439,99.243902,0.818537,102029.591707,0.19439,2017-01-22
16571,15.898571,3.608333,101.214286,0.815952,102011.347143,0.189762,2017-01-23
16572,15.94814,3.616744,101.790698,0.815581,101990.16814,0.185581,2017-01-24
16573,15.949773,3.631591,103.0,0.810455,101981.88,0.182273,2017-01-25
16574,15.820889,3.652889,104.644444,0.803556,101989.729556,0.178222,2017-01-26
16575,15.696087,3.633478,104.326087,0.796304,101997.687174,0.174348,2017-01-27


In [24]:
def generateRollingAvgFeatures(df):
    ''' Turns a whole data frame into a line of rolling average features
    '''
    all_lists = []
    #Iterate over ever column
    for column in df.columns[:-1]:
        # Processed list
        # Turn the column of data into a list
        processing_list = df[str(column)].tolist()
        #Loop over the list 
        # Initialize a list to store the 7 processed values 
        processed_list = []
        for i in range(len(processing_list)):
            # Gets the right spots for computing
            if (((i + 1) % 3) == 0):
                feature_point = (processing_list[i] + processing_list[i -1] + processing_list[i-2]) / 3
                processed_list.append(feature_point)
                # TODO: Should the data be normalized within its own sample at this point ?
        # So we end up w 1 list
        all_lists += processed_list
    return all_lists 
        # TODO: We finished processing the data , we need to turn it into a row of features


In [30]:
# Loop over each Df
processing_data = list_of_dfs.copy()
flag = 0
df = pd.DataFrame()
list_of_lists = []
for df in processing_data:
    if (1):
        # This will return a row of features for each event 
        
        # TODO:Should this return a list that then becomes a row in the dataframe?
        current = generateRollingAvgFeatures(df)
        list_of_lists.append(current)
        flag +=1
        # We need to add this df to the 


# Pass the DF
# For each DF run computations and generate columns -> Features

In [31]:
len(current)

42

In [32]:
len(list_of_lists)

296

In [37]:
nums = range(7)
cols = [f'{v}_{i}' for v in df.columns[:-1] for i in nums]

In [38]:
processed_data = pd.DataFrame(list_of_lists, columns=cols)

In [44]:
processed_data.head(20)

Unnamed: 0,temperature (degC)_0,temperature (degC)_1,temperature (degC)_2,temperature (degC)_3,temperature (degC)_4,temperature (degC)_5,temperature (degC)_6,wind_speed (m/s)_0,wind_speed (m/s)_1,wind_speed (m/s)_2,wind_speed (m/s)_3,wind_speed (m/s)_4,wind_speed (m/s)_5,wind_speed (m/s)_6,surface_solar_radiation (W/m^2)_0,surface_solar_radiation (W/m^2)_1,surface_solar_radiation (W/m^2)_2,surface_solar_radiation (W/m^2)_3,surface_solar_radiation (W/m^2)_4,surface_solar_radiation (W/m^2)_5,surface_solar_radiation (W/m^2)_6,relative_humidity (0-1)_0,relative_humidity (0-1)_1,relative_humidity (0-1)_2,relative_humidity (0-1)_3,relative_humidity (0-1)_4,relative_humidity (0-1)_5,relative_humidity (0-1)_6,surface_pressure (Pa)_0,surface_pressure (Pa)_1,surface_pressure (Pa)_2,surface_pressure (Pa)_3,surface_pressure (Pa)_4,surface_pressure (Pa)_5,surface_pressure (Pa)_6,total_precipitation (mm of water equivalent)_0,total_precipitation (mm of water equivalent)_1,total_precipitation (mm of water equivalent)_2,total_precipitation (mm of water equivalent)_3,total_precipitation (mm of water equivalent)_4,total_precipitation (mm of water equivalent)_5,total_precipitation (mm of water equivalent)_6
0,15.734991,15.925152,15.906267,15.619788,15.590671,15.55653,15.648776,3.589521,3.631924,3.633741,3.649677,3.657262,3.594018,3.602049,96.576151,99.227729,103.145047,106.165581,110.27101,111.475609,111.630639,0.823473,0.81933,0.809864,0.790085,0.781539,0.779778,0.781052,102161.178995,102035.714783,101987.259232,102000.960726,102007.63994,102024.205581,102014.413769,0.1764,0.194301,0.182025,0.17069,0.160508,0.151734,0.150859
1,13.043718,13.463723,13.452982,13.562694,13.87771,14.252325,14.716715,2.357845,2.429818,2.504461,2.504743,2.469264,2.420871,2.489205,195.431089,196.793312,199.692201,203.536604,206.464514,211.033225,211.188598,0.673813,0.674275,0.672916,0.671291,0.671784,0.67039,0.673299,100112.874911,100122.933999,100122.178084,100150.548712,100154.902216,100132.962668,100094.190902,0.206674,0.201704,0.217802,0.20467,0.195884,0.187955,0.239943
2,8.224683,8.85313,9.08005,9.079007,9.459363,9.991274,10.371327,4.213247,4.287833,4.452769,4.547049,4.492278,4.403594,4.34753,231.441362,234.55692,229.064748,228.513054,233.950903,238.953359,240.031649,0.49407,0.489679,0.498487,0.502282,0.502122,0.502252,0.503701,94965.521452,94905.019305,94803.055335,94810.830151,94799.291695,94752.503554,94728.406602,0.047753,0.065085,0.071019,0.069947,0.066018,0.067264,0.099882
3,8.205205,8.831636,9.092504,9.093781,9.467477,10.024016,10.420423,4.258746,4.325136,4.447883,4.543053,4.499616,4.427516,4.374984,230.089841,232.97802,227.324998,226.633962,232.050663,237.029328,237.862279,0.491873,0.487235,0.495983,0.501075,0.500856,0.500494,0.501263,95277.507298,95217.433839,95113.965428,95119.206265,95108.502253,95061.360541,95037.029752,0.046732,0.062613,0.072147,0.071719,0.067818,0.067093,0.103647
4,8.318497,8.945491,9.241553,9.242445,9.616028,10.215082,10.636644,4.296162,4.359084,4.465864,4.555786,4.521349,4.480145,4.422796,228.338537,230.845461,225.509122,224.705057,230.143489,235.199184,235.98857,0.487919,0.483162,0.49166,0.497953,0.497658,0.496097,0.496145,95712.874243,95653.277991,95548.743597,95551.05314,95540.373323,95492.342939,95467.497595,0.046625,0.058124,0.07084,0.071434,0.067618,0.064661,0.11163
5,15.234227,15.725689,15.905757,15.993587,16.382112,16.778862,17.159001,2.886413,2.952579,2.936007,2.977736,3.007189,3.060642,3.169556,197.428779,200.76235,204.911886,208.230143,206.892349,210.016364,210.233988,0.704822,0.703289,0.699354,0.693826,0.698561,0.699695,0.698763,100737.058151,100719.363114,100693.669858,100671.449086,100660.816654,100655.392116,100604.946872,0.197282,0.190153,0.187672,0.180833,0.177451,0.170418,0.179333
6,13.885902,14.255321,14.253469,14.328641,14.591385,14.966385,15.458381,2.281571,2.337522,2.415926,2.440192,2.410721,2.362645,2.43133,194.196648,196.553097,200.020633,203.408148,206.139069,210.199739,210.815446,0.683019,0.683625,0.682611,0.681857,0.683513,0.68272,0.683742,100579.472956,100593.253581,100590.524373,100615.917085,100620.327987,100598.43345,100562.38748,0.206938,0.195202,0.197341,0.189768,0.180429,0.171359,0.164999
7,8.319908,8.946878,9.244729,9.245847,9.619294,10.220291,10.642771,4.295381,4.357784,4.464131,4.553799,4.519945,4.48014,4.423032,228.207359,230.683188,225.395791,224.577359,230.023457,235.073487,235.845895,0.488183,0.483406,0.491961,0.498379,0.498058,0.49635,0.496324,95733.584242,95674.024039,95569.450679,95571.58078,95560.944377,95512.896182,95488.032324,0.046539,0.05788,0.070762,0.071647,0.067818,0.06485,0.112334
8,15.297785,15.799603,15.941389,15.995497,16.358627,16.726287,17.089253,2.898456,2.953396,2.93684,2.940115,2.967491,3.003759,3.105612,203.390506,207.602662,211.226568,214.310663,212.26139,213.777545,213.10168,0.678067,0.676772,0.674481,0.671618,0.677821,0.681754,0.685914,100308.233394,100292.213948,100267.22505,100247.139867,100241.808045,100241.628411,100196.859396,0.257224,0.244915,0.243867,0.235455,0.237241,0.231437,0.231948
9,12.681554,12.417041,12.522798,12.764305,13.11511,13.458239,13.695519,3.227399,3.194605,3.245685,3.266217,3.222147,3.265694,3.279892,179.826306,176.593854,182.26429,187.876773,191.67045,193.454725,195.583178,0.549146,0.561392,0.553507,0.545846,0.550884,0.553285,0.552727,101128.825176,101063.492135,101018.910847,100998.048535,100994.568667,100964.081362,100950.104401,0.110426,0.121069,0.116209,0.112374,0.115794,0.114367,0.120038


##Helper Function

In [83]:
# Takes input two column names of a dataframe and returns the ratio between then returns the ratio of them as a new figure in the dataframe inputed

def create_ratio_columns(numerator, denominator, df):
    ''' numerator, denominator are column names 
        df is the df where the columns are 
    ''' 
    # filter based on names
    f1 = df.filter(like=numerator)
    f2 = df.filter(like=denominator)
    # Init list for each column of ratio
    ratios = list()
    # Iterating through one set of column to find appropriate data
    for i, c in enumerate(f1.columns):
        col = f'{numerator}/{denominator}:{i}'
        r = f1.iloc[:, i].div(f2.iloc[:, i])
        r.name = col
        # ASS 
        ratios.append(r)
        
    
    ratios = pd.concat(ratios, axis=1)
    
    return ratios
    

In [85]:
ratio_df = create_ratio_columns('temp', 'wind', processed_data)
ratio_df.head()

Unnamed: 0,temp/wind:0,temp/wind:1,temp/wind:2,temp/wind:3,temp/wind:4,temp/wind:5,temp/wind:6
0,4.383591,4.38477,4.37738,4.279772,4.262936,4.328451,4.34441
1,5.532051,5.541042,5.371608,5.414803,5.620181,5.887272,5.912216
2,1.952101,2.06471,2.039192,1.996681,2.105694,2.268891,2.385568
3,1.926672,2.041932,2.044232,2.001689,2.104063,2.264027,2.38182
4,1.936262,2.05215,2.069376,2.028727,2.126805,2.280078,2.404959


In [90]:
new = pd.concat([processed_data, ratio_df], axis=1)

In [None]:
new

In [None]:
## Plot data
## Take any 2 columns and plot how they relate to eachother in a line 
def create_line(df_column1, df_column2):
    ''' We want to be able to visualize the behavior to explore the features more 
    '''
    
    # Find how windspeed and humidity grow together with time. 

In [None]:
test1

## Drawing Board

In [51]:
dfb = pd.DataFrame()

In [52]:
type(dfb)

pandas.core.frame.DataFrame

In [53]:
dfb['b'] = [0, 1, 2, 3, 4]

In [54]:
dfb['c'] = [0, 1, 2, 3, 4]

In [55]:
dfb

Unnamed: 0,b,c
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [56]:
dfb.expanding().mean()

Unnamed: 0,b,c
0,0.0,0.0
1,0.5,0.5
2,1.0,1.0
3,1.5,1.5
4,2.0,2.0
