In [None]:
# Prints a fixed marker message (presumably a smoke test for the first commit — confirm before removing).
print("This is the test for initial commit!")

In [None]:
import pandas as pd
print("Pandas imported successfully, version: "+pd.__version__)
import statsmodels as sm
print("Statsmodels imported successfully, version: "+sm.__version__)

#Plotly packages
import plotly.graph_objects as go
print("Plotly function imported succesfully")
import plotly.express as px
print("Plotly express imported succesfully")
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.figure_factory as ff

#Matplotlib packages
import matplotlib.pyplot as plt

#Numpy + Statistics
import numpy as np
print("Numpy imported successfully, version: "+np.__version__)

from statsmodels.graphics.gofplots import qqplot

#Define needed function in the code
def namestr(obj, namespace):
    """Return every name in ``namespace`` that is bound to ``obj`` itself.

    Values are compared by identity (``is``), not by equality, so two
    distinct but equal objects are not treated as the same object.

    Args:
        obj: The object to look up.
        namespace: A mapping from names to objects (for example ``globals()``).

    Returns:
        list: The matching names, in the mapping's iteration order. Empty
        when no name is bound to ``obj``.
    """
    matching_names = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matching_names.append(candidate)
    return matching_names

In [None]:
#Import dataset
# The CSV location can be overridden with the US_ACCIDENTS_CSV environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original hard-coded path is used.
import os

DATA_PATH = os.environ.get(
    'US_ACCIDENTS_CSV',
    '/Users/mazutislab/Desktop/SynBio/US_Accidents_May19.csv',
)

data_all = pd.read_csv(DATA_PATH)

In [None]:
#Selecting only needed data in the given dataset
# .copy() makes data_cleaned an independent frame instead of a selection of
# data_all. Columns are assigned to and dropped from this frame afterwards;
# doing that on a selection raises SettingWithCopyWarning and depends on
# pandas' view-versus-copy rules.
data_cleaned = data_all[["Severity", "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)", "Wind_Speed(mph)",
                        "Precipitation(in)"]].copy()
# Row labels are left exactly as they are in data_all: calling reset_index()
# without storing its result would not change the frame.

#Convert Temperature in Fahrenheit to Celsius
def fahr_to_celsius(temp_fahr):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Only arithmetic operators are used, so any operand that supports them
    (a plain number or a pandas Series, as used in this notebook) works.

    Args:
        temp_fahr: Temperature(s) in degrees Fahrenheit.

    Returns:
        The corresponding temperature(s) in degrees Celsius, unrounded.
    """
    degrees_above_freezing = temp_fahr - 32
    return degrees_above_freezing * 5 / 9

# All unit conversions are computed from the imperial columns in one step and
# produce a NEW frame. Assigning columns into, and dropping columns in place
# from, a frame that is itself a selection of another frame triggers
# SettingWithCopyWarning; assign()/drop() avoid that and keep the cell safe to
# re-run. The resulting column order is: Severity, Humidity(%), Temperature(C),
# Pressure(mbar), Wind_Speed(kmh), Precipitation(mm), Visibility(km).
data_cleaned = data_cleaned.assign(**{
    #Convert Temperature in Fahrenheit to Celsius (rounded to 2 decimals)
    "Temperature(C)": fahr_to_celsius(data_cleaned["Temperature(F)"]).round(2),
    #Convert Pressure in inches of mercury to mbar
    "Pressure(mbar)": data_cleaned["Pressure(in)"]*0.033863886666667*1000,
    #Convert Wind Speed in mph to kmh
    "Wind_Speed(kmh)": data_cleaned["Wind_Speed(mph)"]*1.609344,
    #Convert Precipitation in inches to mm
    "Precipitation(mm)": data_cleaned["Precipitation(in)"]*25.4,
    #Convert Visibility in miles to km
    "Visibility(km)": data_cleaned["Visibility(mi)"]*1.609344,
}).drop(columns=[
    "Temperature(F)",
    "Pressure(in)",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Visibility(mi)",
])

# Keep rows whose Severity is at least 1 (the comparison is False for missing
# values, so those rows are dropped too) and order the rows by Severity.
# Row labels are deliberately not reset: a bare reset_index() call whose result
# is not stored has no effect, so the labels keep pointing at the source rows.
data_cleaned = (
    data_cleaned[data_cleaned['Severity'] >= 1]
    .sort_values(by=['Severity'], ascending=True)
)

t_col = 'Temperature(C)'
h_col = 'Humidity(%)'

# Last expression of the cell, so the preview is actually displayed.
data_cleaned.head()

In [None]:
#Cleaning data_set for PCA analysis

# Accepted closed interval [low, high] for each measurement column. Rows whose
# value falls outside the interval, or is missing, are removed.
VALID_RANGES = {
    'Temperature(C)': (-55, 50),
    'Pressure(mbar)': (950, 1055),
    'Wind_Speed(kmh)': (0, 150),
    'Visibility(km)': (0, 80),
}

data_cleaned_final = data_cleaned
for column, (low, high) in VALID_RANGES.items():
    # between() includes both endpoints by default. The boolean form
    # `inclusive=True` is deprecated since pandas 1.3 and rejected by
    # pandas 2.x, so the argument is omitted to work on every version.
    data_cleaned_final = data_cleaned_final[data_cleaned_final[column].between(low, high)]

print(data_cleaned_final)

In [None]:
from sklearn.preprocessing import StandardScaler
print('Succesful')

# Remove the precipitation column first, then discard every row that still
# contains a missing value in any remaining column.
data = data_cleaned_final.drop(columns=['Precipitation(mm)']).dropna()

print(data)

features = ['Humidity(%)', 'Temperature(C)', 'Pressure(mbar)', 'Wind_Speed(kmh)', 'Visibility(km)']
# Row labels of the rows that survived, saved under their own name.
index = data.index
print(index)

# Feature matrix: one column per entry of `features`, in that order
x = data[features].values
print(x)
# Target kept as a 2-D, single-column array
y = data[['Severity']].values
print(y)
# Replace the raw feature matrix with its standardized version
x = StandardScaler().fit_transform(x)
print(x)

In [None]:
# Show the Severity column as a single-column DataFrame.
print(data[['Severity']])

In [None]:
#PCA

from sklearn.decomposition import PCA

pca = PCA(n_components=4)
principalComponents = pca.fit_transform(x)

# Wrap the component scores in a DataFrame that carries the row labels stored
# in `index`, so the scores can be matched with rows of `data` by label.
component_columns = [
    'principal component 1',
    'principal component 2',
    'principal component 3',
    'principal component 4',
]
principalDf = pd.DataFrame(principalComponents, columns=component_columns, index=index)

print(principalDf)
print(pca.explained_variance_ratio_)


In [None]:
#Graph - Variance explained by principal components

pca_x = [
    'Principal Component 1',
    'Principal Component 2',
    'Principal Component 3',
    'Principal Component 4',
]

# One bar per component; bar height is that component's explained-variance ratio.
variance_bars = go.Bar(x=pca_x, y=pca.explained_variance_ratio_)
fig = go.Figure(data=[variance_bars])

fig.update_layout(
    title="Variance explained by principal components",
    xaxis_title="Principal Components",
    yaxis_title="Variance explained",
)

fig.show()

In [None]:
# Put the Severity column next to the component scores; concat lines the two
# frames up by row label.
severity_column = data[['Severity']]
finalDf = pd.concat([principalDf, severity_column], axis='columns')

In [None]:
#PCA with 3 principal components

from mpl_toolkits import mplot3d


fig = plt.figure(figsize=(8, 8))
ax = plt.axes(projection='3d')
ax.set_title('3 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)

targets = [1, 2, 3, 4]
colors = ['r', 'g', 'b', 'y']
# One scatter call per severity level, each drawn in its own colour.
for severity_level, marker_color in zip(targets, colors):
    severity_rows = finalDf[finalDf['Severity'] == severity_level]
    ax.scatter3D(
        severity_rows['principal component 1'],
        severity_rows['principal component 2'],
        severity_rows['principal component 3'],
        c=marker_color,
        s=50,
        alpha=0.3,
    )
# Legend entries are matched to the scatter calls in the order they were made.
ax.legend(targets)
ax.grid()

In [None]:
#PCA with 2 principal components

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = [1, 2, 3, 4]
colors = ['r', 'g', 'b', 'y']
# One scatter call per severity level, each drawn in its own colour.
for severity_level, marker_color in zip(targets, colors):
    severity_rows = finalDf[finalDf['Severity'] == severity_level]
    ax.scatter(
        severity_rows['principal component 1'],
        severity_rows['principal component 2'],
        c=marker_color,
        s=50,
        alpha=0.3,
    )
# Legend entries are matched to the scatter calls in the order they were made.
ax.legend(targets)
ax.grid()