In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

In [5]:
# Read the raw dataset from 'la_data_full.csv' (resolved relative to the working directory).
data = pd.read_csv(filepath_or_buffer='la_data_full.csv')

In [6]:
# First look at the loaded frame: preview rows, shape, per-column info,
# a tally of column dtypes, and the columns that contain missing values.
display(data.head(n=3))
print('Shape of original data: \n', data.shape)
print('\n')

data.info()
print('\n')

print('Number of columns of each data type: \n', data.dtypes.value_counts())
print('\n')

# Count nulls per column once, then keep only the columns that have at least one.
isnull_series = data.isnull().sum().loc[lambda null_counts: null_counts > 0]
print('Columns with null values: \n', isnull_series.sort_values(ascending=False))

Unnamed: 0,Start_Time,Start_Lat,Start_Lng,Temperature(F),Humidity(%),Pressure(in),Wind_Speed(mph),Precipitation(in),Amenity,Bump,...,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Target
0,2016-03-22 19:36:44,34.09256,-118.20622,64.0,24.0,30.0,9.2,,False,False,...,False,False,False,False,False,True,False,False,False,1
1,2016-03-22 20:59:43,33.94819,-118.27973,64.0,23.0,30.02,11.5,,False,False,...,False,False,False,False,False,False,False,False,False,1
2,2016-03-23 07:59:47,34.0233,-118.17288,55.9,53.0,30.11,,,False,False,...,True,False,False,False,False,False,False,False,False,1


Shape of original data: 
 (275824, 22)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275824 entries, 0 to 275823
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Start_Time         275824 non-null  object 
 1   Start_Lat          275824 non-null  float64
 2   Start_Lng          275824 non-null  float64
 3   Temperature(F)     275280 non-null  float64
 4   Humidity(%)        275270 non-null  float64
 5   Pressure(in)       275281 non-null  float64
 6   Wind_Speed(mph)    270198 non-null  float64
 7   Precipitation(in)  264554 non-null  float64
 8   Amenity            275824 non-null  bool   
 9   Bump               275824 non-null  bool   
 10  Crossing           275824 non-null  bool   
 11  Give_Way           275824 non-null  bool   
 12  Junction           275824 non-null  bool   
 13  No_Exit            275824 non-null  bool   
 14  Railway            275824 non-null  bool   
 15  Roundabout

In [7]:
# Derive hour-of-day, day-of-week and ISO week-of-year features from 'Start_Time'.
# 'Start_Time' is dropped once the features exist, so the transformation is guarded:
# re-running this cell after it has already succeeded leaves `data` untouched
# instead of raising KeyError on the already-removed column.
if 'Start_Time' in data.columns:
    print(data.shape)
    data.dropna(subset=['Start_Time'], inplace=True)  # a missing timestamp cannot yield time features
    data['Start_Time'] = pd.to_datetime(data['Start_Time'])
    print(data.shape)

    data['Hour'] = data['Start_Time'].dt.hour.astype(int)  # hour of the day (0-23)
    data['Day'] = data['Start_Time'].dt.dayofweek  # day of the week (Monday=0 ... Sunday=6)
    data['Num_week'] = data['Start_Time'].dt.isocalendar().week  # ISO week of the year, kept as a seasonal signal

    data.drop('Start_Time', axis=1, inplace=True)  # the raw timestamp is no longer needed
elif not {'Hour', 'Day', 'Num_week'}.issubset(data.columns):
    # Neither the source column nor its derived features exist: fail loudly rather than skip silently.
    raise KeyError("'Start_Time' is missing and Hour/Day/Num_week have not been derived")

data.head()

(275824, 22)
(275824, 22)


Unnamed: 0,Start_Lat,Start_Lng,Temperature(F),Humidity(%),Pressure(in),Wind_Speed(mph),Precipitation(in),Amenity,Bump,Crossing,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Target,Hour,Day,Num_week
0,34.09256,-118.20622,64.0,24.0,30.0,9.2,,False,False,False,...,False,False,True,False,False,False,1,19,1,12
1,33.94819,-118.27973,64.0,23.0,30.02,11.5,,False,False,False,...,False,False,False,False,False,False,1,20,1,12
2,34.0233,-118.17288,55.9,53.0,30.11,,,False,False,False,...,False,False,False,False,False,False,1,7,2,12
3,34.1447,-118.27865,73.9,14.0,30.15,,,False,False,False,...,False,False,False,False,False,False,1,11,2,12
4,34.09914,-118.251853,73.9,14.0,30.15,,,False,False,False,...,False,False,False,False,False,False,1,12,2,12


In [8]:
# Impute missing values: every null is replaced, in place, by the mean of its own column.
data.fillna(value=data.mean(), inplace=True)

In [9]:
# Cast the calendar features to object dtype in one pass so they are handled as
# categorical labels rather than as numbers.
data = data.astype({'Hour': object, 'Day': object, 'Num_week': object})

print('Number of columns of each data type: \n', data.dtypes.value_counts())
print('\n')

# Re-check for remaining nulls: count per column once and keep only non-zero counts.
isnull_series = data.isnull().sum().loc[lambda null_counts: null_counts > 0]
print('Columns with null values: \n', isnull_series.sort_values(ascending=False))

Number of columns of each data type: 
 bool       13
float64     7
object      3
int64       1
dtype: int64


Columns with null values: 
 Series([], dtype: int64)


In [10]:
# Re-inspect column dtypes and non-null counts after the imputation and dtype casts above.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275824 entries, 0 to 275823
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Start_Lat          275824 non-null  float64
 1   Start_Lng          275824 non-null  float64
 2   Temperature(F)     275824 non-null  float64
 3   Humidity(%)        275824 non-null  float64
 4   Pressure(in)       275824 non-null  float64
 5   Wind_Speed(mph)    275824 non-null  float64
 6   Precipitation(in)  275824 non-null  float64
 7   Amenity            275824 non-null  bool   
 8   Bump               275824 non-null  bool   
 9   Crossing           275824 non-null  bool   
 10  Give_Way           275824 non-null  bool   
 11  Junction           275824 non-null  bool   
 12  No_Exit            275824 non-null  bool   
 13  Railway            275824 non-null  bool   
 14  Roundabout         275824 non-null  bool   
 15  Station            275824 non-null  bool   
 16  St

In [13]:
# Separate the classification label from the predictor columns.
y_target = data['Target']
X_features = data.drop(columns='Target')
print("y_target shape: ", y_target.shape)
print("X_features shape: ", X_features.shape)

# One-hot encode the predictors; the encoded frame is what gets reported below.
X_features_ohe = pd.get_dummies(data=X_features)
print("X_features_ohe shape: ", X_features_ohe.shape)

y_target shape:  (275824,)
X_features shape:  (275824, 23)
X_features_ohe shape:  (275824, 104)
