In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import string
import random

### Creating the Dataframe

In [2]:
col_names = ['Accident ID','State','Severity','Weather Cond.',"Distraction to Driver",'Sunrise Sunset']
df = pd.DataFrame(index=np.arange(1,400) ,columns = col_names)

### Creating values for every attribute of the Dataframe

In [3]:
# Creating a function that produces unique accident ID's
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return "".join(random.choice(chars) for _ in range(size))

# Creating a function that produces random integers values about distraction of the driver to fill the corresponding column
def distraction_generator(x,y):
    return random.randint(x,y)

# Creating a function that collects random values from "sunrise_values" list and fill the column when some conditions occur
def sunrise_generator(x,y,z):
    return np.random.choice(x,y,z)

state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut",
               "District ", "of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois",
               "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
               "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey",
               "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", 
               "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont",
               "Washington", "Wisconsin", "West Virginia", "Wyoming"]

states_with_big_population = ["California", "Texas", "Florida", "New York"]

# The severity of accidents' values go from 3 to 1 with (3=serious accidents, 2=medium significance accidents, 1=smash)
severity_values = [1,2,3]
weather_values = ["Dry","Rainy","Gale","Foggy"]
distraction_values = [0, 1, 2]
sunrise_values = ["Day","Evening","Night"]

### Filling the Dataframe with random and conditional values

In [4]:
rows = df.shape[0]
df["Accident ID"] = [id_generator(10) for _ in range(rows)]
df["State"] = np.random.choice(state_names, size=rows)
df["Severity"] = np.random.choice(severity_values, size=rows, p=[0.3, 0.25, 0.45])
df["Sunrise Sunset"] = np.random.choice(sunrise_values, size=rows, p=[0.4, 0.3, 0.3])
df["Weather Cond."] = np.random.choice(weather_values, size=rows, p=[0.2, 0.3, 0.3, 0.2])

# Top 10 dangerous States to drive in  
dangerous_states = ["Wyoming","Mississippi","Montana","New Mexico", "North Dakota", "Oklahoma", "South Carolina",
                     "Alabama", "South Dakota", "Louisiana"]

# Distraction column takes values from 0 to 2 where 0 means no distraction, 1 means distarction
# and 2 means high level of distraction based on the above, dangerous states to drive in, list 
df["Distraction to Driver"] = np.random.choice(distraction_values, size=rows, p=[0.4, 0.3, 0.3])
a = df.loc[((df.Severity == 3) | (df.Severity == 2)) & (df.State.isin(dangerous_states))].shape[0]
df.loc[((df.Severity == 3) | (df.Severity == 2)) & (df.State.isin(dangerous_states)), "Distraction to Driver"] = [distraction_generator(1,2) for _ in range(a)]

# In States with big population we observe large amount of accidents through mornings due to traffic jam
condition = df.loc[((df.Severity == 1) | (df.Severity == 2) & (df.State.isin(states_with_big_population)))].shape[0]
df.loc[((df.Severity == 1) | (df.Severity == 2) & (df.State.isin(states_with_big_population))), "Sunrise Sunset"] = np.random.choice(sunrise_values, size=condition, p=[0.65, 0.15, 0.2])

# Based on "https://www.rospa.com/rospaweb/docs/advice-services/road-safety/drivers/driving-at-night.pdf" we have a large amount of
# accidents and more severe accidents at night despite the smaller amount of vehicles due to several conditions, so we give a boost
# to value "night" when we face accidents of severity{2,3} in the data set 
b = df.loc[((df.Severity == 3) | (df.Severity == 2))].shape[0]
p_list = [0.2, 0.2, 0.6]
df.loc[((df.Severity == 3) | (df.Severity == 2)), "Sunrise Sunset"] = [sunrise_generator(sunrise_values, 1, p_list) for _ in range(b)]

df.loc[((df.Severity == 3) | (df.Severity == 2))].sample(n=20)

Unnamed: 0,Accident ID,State,Severity,Weather Cond.,Distraction to Driver,Sunrise Sunset
247,5OH4P0S1IO,Indiana,3,Foggy,2,Evening
252,M7PPITXNLD,Colorado,3,Rainy,1,Evening
1,YHV4UEGCD7,South Dakota,3,Rainy,2,Evening
222,BX2I4ABO3X,Rhode Island,2,Dry,2,Evening
343,YYTM8CF9Z9,Massachusetts,2,Rainy,2,Night
374,GRPUOXPUM3,Nevada,3,Gale,0,Evening
90,QDJIQGEAQM,Maine,3,Foggy,2,Night
112,6J1HZCSCJO,Nevada,2,Dry,2,Night
77,ED75RG1RN4,Oregon,3,Dry,2,Night
312,R99V1H9KIS,Utah,3,Rainy,1,Day


In [5]:
pd.set_option("display.max_rows", 450)
print(df.groupby("Severity")["Sunrise Sunset"].value_counts())
print(df["Sunrise Sunset"].value_counts())

Severity  Sunrise Sunset
1         Day               95
          Night             18
          Evening           15
2         Night             40
          Day               37
          Evening           32
3         Night             63
          Evening           60
          Day               39
Name: Sunrise Sunset, dtype: int64
Day        171
Night      121
Evening    107
Name: Sunrise Sunset, dtype: int64


### Storing the dataframe in order to load it to the next notebook

In [6]:
%store df

Stored 'df' (DataFrame)
