In [120]:
import pandas as pd
import numpy as np
import pymysql as mysql
from sqlalchemy import create_engine

--------------------- LOADING DATA FROM DATASET ---------------------

In [121]:
requiredFields = ['CRASH_DATE', 'CRASH_TIME', 'PERSON_INJURY', 'BODILY_INJURY', 'PERSON_AGE', 'PERSON_SEX', 'PERSON_TYPE', 'PED_LOCATION']

In [122]:
# Reading data set to start with data wrangling
nyc_start_dataset = pd.read_csv('./NYC_Motor_Vehicle_Collisions_to_Person.csv', usecols=requiredFields)

In [123]:
# Showing starting dataset
nyc_start_dataset

Unnamed: 0,CRASH_DATE,CRASH_TIME,PERSON_INJURY,PERSON_AGE,BODILY_INJURY,PERSON_SEX,PERSON_TYPE,PED_LOCATION
0,2021-05-02,21:00,Killed,62.0,Head,F,Pedestrian,Pedestrian/Bicyclist/Other Pedestrian at Inter...
1,2021-05-21,0:00,Killed,24.0,Entire Body,M,Occupant,
2,2021-10-15,2:00,Killed,30.0,Head,M,Occupant,
3,2021-04-17,13:00,Killed,71.0,Head,M,Pedestrian,Pedestrian/Bicyclist/Other Pedestrian Not at I...
4,2021-05-25,22:00,Killed,69.0,Entire Body,F,Pedestrian,Pedestrian/Bicyclist/Other Pedestrian Not at I...
...,...,...,...,...,...,...,...,...
45664,2021-05-07,0:00,Injured,24.0,Back,M,Occupant,
45665,2021-04-27,13:00,Injured,29.0,Knee-Lower Leg Foot,M,Occupant,
45666,2021-05-07,14:00,Injured,7.0,Head,F,Occupant,
45667,2021-05-10,16:00,Injured,13.0,Elbow-Lower-Arm-Hand,F,Occupant,


In [124]:
nyc_start_dataset.dtypes

CRASH_DATE        object
CRASH_TIME        object
PERSON_INJURY     object
PERSON_AGE       float64
BODILY_INJURY     object
PERSON_SEX        object
PERSON_TYPE       object
PED_LOCATION      object
dtype: object

--------------------- END LOADING DATA FROM DATASET --------------------- 

--------------------- DATAWRANGLING ---------------------

In [125]:
# Replace all Nan values with stablished data
nyc_start_dataset['CRASH_DATE'] = nyc_start_dataset['CRASH_DATE'].fillna('2021-01-01');
nyc_start_dataset['CRASH_TIME'] = nyc_start_dataset['CRASH_TIME'].fillna('00:00');
nyc_start_dataset['PERSON_INJURY'] = nyc_start_dataset['PERSON_INJURY'].fillna('None');
nyc_start_dataset['PERSON_AGE'] = nyc_start_dataset['PERSON_AGE'].fillna(0);
nyc_start_dataset['PERSON_SEX'] = nyc_start_dataset['PERSON_SEX'].fillna('U');
nyc_start_dataset['PERSON_TYPE'] = nyc_start_dataset['PERSON_TYPE'].fillna('Unknown');
nyc_start_dataset['BODILY_INJURY'] = nyc_start_dataset['BODILY_INJURY'].fillna('Does Not Apply');
nyc_start_dataset['PED_LOCATION'] = nyc_start_dataset['PED_LOCATION'].fillna('Not Pedestrian');

In [126]:
# Clean ages on negative ages
nyc_start_dataset['PERSON_AGE'][nyc_start_dataset['PERSON_AGE'] < 0] = 0;
# Clean the last logical age being 121 
nyc_start_dataset['PERSON_AGE'][nyc_start_dataset['PERSON_AGE'] > 121] = 121;


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyc_start_dataset['PERSON_AGE'][nyc_start_dataset['PERSON_AGE'] < 0] = 0;
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyc_start_dataset['PERSON_AGE'][nyc_start_dataset['PERSON_AGE'] > 121] = 121;


In [127]:
# Clean all sex
nyc_start_dataset[(nyc_start_dataset['PERSON_SEX'] != 'M') & (nyc_start_dataset['PERSON_SEX'] != 'F') & (nyc_start_dataset['PERSON_SEX'] != 'U')]

Unnamed: 0,CRASH_DATE,CRASH_TIME,PERSON_INJURY,PERSON_AGE,BODILY_INJURY,PERSON_SEX,PERSON_TYPE,PED_LOCATION


In [128]:
dim_sex = pd.DataFrame({
  'sex': nyc_start_dataset['PERSON_SEX'].drop_duplicates().reset_index(drop=True),
  'binary': (np.where(nyc_start_dataset['PERSON_SEX'].drop_duplicates() == 'U', 0, 1))})
dim_sex.insert(0, 'id_sex', range(1, len(dim_sex) + 1))

In [129]:
dim_sex

Unnamed: 0,id_sex,sex,binary
0,1,F,1
1,2,M,1
2,3,U,0


In [130]:
dim_ages = pd.DataFrame({
  'age': nyc_start_dataset['PERSON_AGE'].drop_duplicates().reset_index(drop=True)
})
dim_ages.insert(0, 'id_age', range(1, len(dim_ages) + 1))

In [131]:
dim_ages

Unnamed: 0,id_age,age
0,1,62.0
1,2,24.0
2,3,30.0
3,4,71.0
4,5,69.0
...,...,...
97,98,87.0
98,99,93.0
99,100,94.0
100,101,100.0


In [132]:
dim_date = nyc_start_dataset[['CRASH_DATE', 'CRASH_TIME']].groupby(['CRASH_DATE', 'CRASH_TIME']).size().reset_index()[['CRASH_DATE', 'CRASH_TIME']]
dim_date.columns = ['date', 'hour']
dim_date['date_hour'] = dim_date['date'] + ' ' + dim_date['hour']
dim_date.insert(0, 'id_date', range(1, len(dim_date) + 1))

In [133]:
dim_date

Unnamed: 0,id_date,date,hour,date_hour
0,1,2021-01-01,0:00,2021-01-01 0:00
1,2,2021-01-01,10:00,2021-01-01 10:00
2,3,2021-01-01,11:00,2021-01-01 11:00
3,4,2021-01-01,12:00,2021-01-01 12:00
4,5,2021-01-01,14:00,2021-01-01 14:00
...,...,...,...,...
7094,7095,2021-11-16,5:00,2021-11-16 5:00
7095,7096,2021-11-16,6:00,2021-11-16 6:00
7096,7097,2021-11-16,7:00,2021-11-16 7:00
7097,7098,2021-11-16,8:00,2021-11-16 8:00


In [134]:
dim_injuries = nyc_start_dataset[['PERSON_INJURY', 'BODILY_INJURY']].groupby(['PERSON_INJURY', 'BODILY_INJURY']).size().reset_index()[['PERSON_INJURY', 'BODILY_INJURY']]
dim_injuries.columns = ['person_injury', 'bodily_injury']
dim_injuries.insert(0, 'id_injury', range(1, len(dim_injuries) + 1))

In [135]:
dim_injuries

Unnamed: 0,id_injury,person_injury,bodily_injury
0,1,Injured,Abdomen - Pelvis
1,2,Injured,Back
2,3,Injured,Chest
3,4,Injured,Does Not Apply
4,5,Injured,Elbow-Lower-Arm-Hand
5,6,Injured,Entire Body
6,7,Injured,Eye
7,8,Injured,Face
8,9,Injured,Head
9,10,Injured,Hip-Upper Leg


In [136]:
dim_person_type = nyc_start_dataset[['PERSON_TYPE', 'PED_LOCATION']].groupby(['PERSON_TYPE', 'PED_LOCATION']).size().reset_index()[['PERSON_TYPE', 'PED_LOCATION']]
dim_person_type.columns = ['type', 'location']
dim_person_type.insert(0, 'id_person_type', range(1, len(dim_person_type) + 1))

In [137]:
dim_person_type

Unnamed: 0,id_person_type,type,location
0,1,Bicyclist,Not Pedestrian
1,2,Occupant,Does Not Apply
2,3,Occupant,Not Pedestrian
3,4,Occupant,Pedestrian/Bicyclist/Other Pedestrian Not at I...
4,5,Occupant,Pedestrian/Bicyclist/Other Pedestrian at Inter...
5,6,Occupant,Unknown
6,7,Other Motorized,Not Pedestrian
7,8,Pedestrian,Does Not Apply
8,9,Pedestrian,Pedestrian/Bicyclist/Other Pedestrian Not at I...
9,10,Pedestrian,Pedestrian/Bicyclist/Other Pedestrian at Inter...


In [147]:
fact_accidents = pd.DataFrame({
  'id_accident': [],
  'id_ages': [],
  'id_date': [],
  'id_injuries': [],
  'id_person_type': [],
  'id_sex': [],
})

--------------------- END DATAWRANGLING ---------------------

--------------------- DATAMART ---------------------

In [138]:
try: connection = mysql.connect(host='localhost', user='root', password='nintendo123'); print ('Succesfully connected to MySQL Server')
except: print ('Cannot connect to MySQL Server')

Succesfully connected to MySQL Server


In [139]:
# Creates new cursor from MySQL connection
cur = connection.cursor()
# Drops database to fill new data
try:
  cur.execute("DROP DATABASE IF EXISTS dm_datamart")
  cur.execute("CREATE DATABASE dm_datamart")
  connection.commit()
except:
  print ("Query Error")
cur.close() 

In [140]:
# Creates engine for database "dm_datamart"
engine = create_engine('mysql+pymysql://root:nintendo123@localhost/dm_datamart')

In [141]:
dim_ages.to_sql('dim_ages', engine, if_exists='replace', index=False)

In [142]:
dim_sex.to_sql('dim_sex', engine, if_exists='replace', index=False)

In [143]:
dim_date.to_sql('dim_date', engine, if_exists='replace', index=False)

In [144]:
dim_injuries.to_sql('dim_injuries', engine, if_exists='replace', index=False)

In [145]:
dim_person_type.to_sql('dim_person_type', engine, if_exists='replace', index=False)

In [None]:
fact_accidents.to_sql('fact_accidents', engine, if_exists='replace', index=False)

--------------------- DATAMART ---------------------

In [146]:
nyc_start_dataset.to_sql('fact_temp_table', engine, if_exists='replace', index=False)