In [43]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [44]:

FILE_NAME = "all_new_data.csv" # CSV file that contains the data
# Used as pandas' 'display.max_rows' option in the display-configuration cell.
# NOTE(review): presumably the row count of a table the author wanted shown
# without truncation -- confirm where 296 comes from.
NUMBER_OF_ROWS = 296

In [45]:
# Configure how pandas renders frames in this notebook.
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', NUMBER_OF_ROWS),
    ('display.min_rows', 20),
):
    pd.set_option(option_name, option_value)

In [46]:
# Load the raw data from FILE_NAME. No index column is specified, which is why
# the frame carries an 'Unnamed: 0' column (visible in the column listing below).
df = pd.read_csv(FILE_NAME)

In [47]:
# Inspect the column names of the loaded frame.
df.columns

Index(['Unnamed: 0', 'datetime', 'temperature', 'wind_speed',
       'surface_solar_radiation', 'relative_humidity', 'surface_pressure',
       'total_precipitation', 'latitude', 'longitude', 'event_id', 'outcome',
       'city'],
      dtype='object')

In [48]:
# Collect the set of distinct event ids found in the 'event_id' column.
event_ids = {event_id for event_id in df['event_id'].to_numpy()}

In [49]:
## Produces a list of DataFrames, one per event, plus the matching labels.
# Each DataFrame holds the expanding (cumulative) mean of the numeric feature
# columns over the event's rows, truncated to the last ROWS_PER_EVENT rows,
# with the original 'datetime' values attached as a 'Date' column.
ROWS_PER_EVENT = 9  # trailing rows kept for every event
NON_FEATURE_COLUMNS = ['latitude', 'longitude', 'event_id', 'outcome', 'Unnamed: 0']

labels = []
list_of_dfs = []
# Use IDs to loop over events
for i in event_ids:
    # Get the rows that belong to this event
    event = df.loc[df['event_id'] == i]
    # The label is read from the event's first row and recorded in the same
    # order as the frames, so labels[k] belongs to list_of_dfs[k].
    label = event['outcome'].iloc[0]
    labels.append(label)
    # Keep only the numeric feature columns. Non-numeric columns that survive
    # the drop (e.g. 'datetime', 'city') must be removed explicitly: older
    # pandas silently discarded them inside expanding().mean(), while
    # pandas >= 2.0 raises instead.
    fundamental_features = (
        event.drop(columns=NON_FEATURE_COLUMNS)
        .select_dtypes(include='number')
    )
    # Expanding mean: row k holds the mean of rows 0..k of the event
    event_df = fundamental_features.expanding().mean()
    # Re-attach the dates (aligned on the row index)
    event_df['Date'] = event['datetime']
    # Keep only the trailing rows of the event
    event_df = event_df.tail(ROWS_PER_EVENT)
    # Add to list
    list_of_dfs.append(event_df)

In [50]:
# Sanity check: number of rows kept in the first event frame.
len(list_of_dfs[0])

9

In [51]:
# Preview the first event frame (feature columns followed by 'Date').
list_of_dfs[0]

Unnamed: 0,temperature,wind_speed,surface_solar_radiation,relative_humidity,surface_pressure,total_precipitation,Date
16920,14.055714,3.785102,194.387755,0.748163,100246.197551,0.097959,2017-06-17
16921,14.2236,3.7282,196.78,0.745,100263.057,0.096,2017-06-18
16922,14.414706,3.683725,198.313725,0.742941,100271.253922,0.094118,2017-06-19
16923,14.550962,3.676346,199.384615,0.742885,100277.904615,0.0925,2017-06-20
16924,14.696038,3.648302,201.037736,0.742642,100276.177358,0.090755,2017-06-21
16925,14.763148,3.652222,200.518519,0.742963,100269.123333,0.089815,2017-06-22
16926,14.789273,3.694545,198.854545,0.743818,100267.079636,0.089091,2017-06-23
16927,14.840179,3.716786,197.857143,0.744107,100261.642143,0.088036,2017-06-24
16928,14.855789,3.715965,196.368421,0.744386,100255.44614,0.086842,2017-06-25


In [120]:
def generateRollingAvgFeatures(df, window=3):
    '''Flatten a data frame into one row of normalized block-average features.

    Every column except the last one is processed independently:

    1. The column is cut into consecutive, non-overlapping blocks of
       ``window`` rows; leftover rows at the end that do not fill a whole
       block are ignored.
    2. Each block is averaged.
    3. The block averages of that column are z-score normalized
       (population standard deviation, ``ddof=0``).

    The normalized values of all columns are concatenated in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns, except the last, hold numeric values. The last
        column is skipped.
    window : int, default 3
        Number of consecutive rows averaged into one feature value.

    Returns
    -------
    list of float
        ``(len(df) // window)`` values per processed column. A column whose
        block averages are all identical contributes zeros instead of the
        NaNs a division by a zero standard deviation would produce. A frame
        with fewer than ``window`` rows yields an empty list.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    '''
    if window < 1:
        raise ValueError("window must be a positive integer")

    all_lists = []
    # Iterate over every column but the last one
    for column in df.columns[:-1]:
        values = np.asarray(df[column], dtype=float)
        n_blocks = len(values) // window
        if n_blocks == 0:
            # Not enough rows for a single block: nothing to emit.
            continue

        # One mean per non-overlapping block of `window` rows.
        block_means = values[:n_blocks * window].reshape(n_blocks, window).mean(axis=1)

        # Normalize the block means of this column to expose its pattern.
        std = block_means.std()
        if std == 0 or block_means.max() == block_means.min():
            # Constant column: there is no variation to normalize. The
            # max == min test also catches the case where rounding in the
            # mean leaves a tiny non-zero deviation for identical values.
            normalized = np.zeros(n_blocks)
        else:
            normalized = (block_means - block_means.mean()) / std
        all_lists.extend(normalized.tolist())
    return all_lists

In [121]:
# Turn every per-event frame into one flat row of features.
processing_data = list(list_of_dfs)  # shallow copy of the list of frames
list_of_lists, flag = [], 0          # feature rows / count of processed frames
df = pd.DataFrame()
# NOTE: the loop variable is deliberately still called `df`; after the loop
# it refers to the last event frame, and later code may read it.
for df in processing_data:
    x = df
    # One row of features for this event
    current = generateRollingAvgFeatures(df)
    list_of_lists.append(current)
    flag += 1

In [122]:
# Column names for the flattened feature rows: "<feature>_<k>" for each of the
# three block averages computed per feature.
nums = range(3)
# Read the feature names from the per-event frames themselves instead of from
# `df`: the preceding loop reuses `df` as its loop variable, so `df` is only
# correct here when that loop has just run. The last column ('Date') is not a
# feature and is skipped.
feature_names = list_of_dfs[0].columns[:-1] if list_of_dfs else []
cols = [f'{v}_{i}' for v in feature_names for i in nums]

In [123]:
processed_data = pd.DataFrame(list_of_lists, columns=cols)
# Replace NaNs by 0. fillna returns a new frame, so the result has to be
# assigned back; otherwise the NaNs stay in processed_data.
processed_data = processed_data.fillna(0)
processed_data

Unnamed: 0,temperature_0,temperature_1,temperature_2,wind_speed_0,wind_speed_1,wind_speed_2,surface_solar_radiation_0,surface_solar_radiation_1,surface_solar_radiation_2,relative_humidity_0,relative_humidity_1,relative_humidity_2,surface_pressure_0,surface_pressure_1,surface_pressure_2,total_precipitation_0,total_precipitation_1,total_precipitation_2
0,-1.367089,0.370016,0.997073,1.051816,-1.344603,0.292787,-1.048991,1.345905,-0.296914,1.223185,-1.226299,0.003115,-0.799683,1.409980,-0.610298,1.311708,-0.198070,-1.113638
1,-1.105687,-0.210761,1.316448,-1.368047,0.373643,0.994404,-1.392671,0.483377,0.909293,1.325134,-1.090369,-0.234765,1.234862,-0.020492,-1.214371,-0.297664,-1.048476,1.346140
2,-1.244064,0.039599,1.204465,1.378693,-0.962119,-0.416574,-1.182636,-0.080269,1.262905,1.202161,0.043984,-1.246144,-0.994070,-0.374095,1.368166,1.234288,-0.019315,-1.214973
3,-1.178814,-0.087201,1.266015,0.330547,-1.356094,1.025547,-1.413621,0.671357,0.742264,-0.033934,-1.207425,1.241359,1.097585,0.223528,-1.321114,-0.526624,-0.873350,1.399974
4,1.236914,-0.024713,-1.212201,1.366030,-0.999988,-0.366042,1.056613,0.285743,-1.342356,0.653310,-1.412882,0.759572,-1.166059,-0.109957,1.276016,0.741103,-1.413660,0.672556
5,-0.329585,-1.026228,1.355813,-0.196449,-1.114646,1.311096,-1.413589,0.670400,0.743189,1.413233,-0.752219,-0.661014,-1.128636,1.302313,-0.173677,-0.622561,-0.788408,1.410968
6,-1.269943,0.096052,1.173891,0.842112,0.562883,-1.404995,-1.375173,0.973374,0.401799,1.379586,-0.959156,-0.420431,0.808395,-1.409122,0.600727,-0.351073,-1.010870,1.361943
7,1.414200,-0.712532,-0.701668,1.054304,0.289140,-1.343444,1.145078,0.146208,-1.291286,-1.156558,-0.126547,1.283105,-1.214290,-0.020649,1.234939,1.114420,0.196813,-1.311233
8,-0.988009,-0.382282,1.370291,-1.330136,0.249072,1.081065,0.407532,0.969025,-1.376557,-1.126827,-0.176651,1.303478,1.056295,0.286211,-1.342506,-1.375006,0.401113,0.973893
9,1.238489,-0.027968,-1.210521,-0.541797,1.402200,-0.860402,1.191718,0.063577,-1.255295,0.278505,1.061508,-1.340013,-1.231062,0.012733,1.218329,1.219074,0.011265,-1.230338


In [124]:
# Attach the per-event labels as the 'label' column (same order as the rows).
processed_data = processed_data.assign(label=labels)

In [125]:
# Shuffle the rows. The seed makes the shuffle repeatable: the later
# train/test split picks rows by position, so an unseeded shuffle would change
# the reported accuracies on every run.
processed_data_shuffled = processed_data.sample(frac=1, random_state=0)

In [126]:
# Separate the features from the target and hold out 5% of the rows for testing.
X = processed_data_shuffled.drop(columns='label')
y = processed_data_shuffled['label']
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

In [127]:
# Fit a logistic-regression classifier and report its accuracy on the held-out rows.
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train, y_train)
score = logisticRegr.score(x_test, y_test)
predictions = logisticRegr.predict(x_test)
print("Model Accuracy: ", score)

Model Accuracy:  0.96


In [128]:
# Fit a Gaussian naive Bayes classifier on the same split and report its accuracy.
gnb = GaussianNB().fit(x_train, y_train)
score = gnb.score(x_test, y_test)
predictions = gnb.predict(x_test)
print("Model Accuracy: ", score)

Model Accuracy:  0.92


In [129]:
# Keep only the rows whose label is 0.
reduced_dataset = processed_data_shuffled.loc[processed_data_shuffled['label'] == 0]

In [130]:
# Re-split using only the label-0 rows; this rebinds X, y and the four split
# variables defined by the earlier split.
# NOTE(review): reduced_dataset is a subset of the frame the classifiers were
# trained on, and this is a fresh split, so x_test here may contain rows that
# were part of the earlier training set -- confirm this is intended.
X = reduced_dataset.drop(columns='label')
y = reduced_dataset['label']
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

In [131]:
# Run the already-fitted logistic-regression model on the current x_test
# (no refit happens here).
predictions = logisticRegr.predict(x_test)

In [132]:
# Show the predicted labels.
predictions

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])