# `Part 2 - Translation & data mapping`

The purpose of this notebook is to translate the column names to English and to replace the integer codes used in the file with their actual meaning, so that the EDA can be performed on human-readable labels.

In [1]:
import pandas as pd
import numpy as np #numpy to manipulate numbers in this notebook
import os


# to access AWS
import aws_creds
import os
from smart_open import smart_open
import boto3 
from io import StringIO # python3; python2: BytesIO 

Helper function to check dataframe quality

In [2]:
# Helper function - summarises dataframe quality column by column
def df_quality(df, top=40):
    """Summarise dtype, missing values and cardinality for each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    top : int, default 40
        Maximum number of rows (one row per column of ``df``) to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with four columns:
        ``dtypes`` (column dtype), ``isna%`` (percentage of missing values,
        rounded to 1 decimal), ``isna`` (count of missing values) and
        ``unique`` (number of distinct non-null values). Rows are sorted by
        ``isna%`` in descending order and truncated to ``top`` rows.
    """
    na_counts = df.isna().sum()  # computed once, reused for both the count and the percentage
    # to_frame() instead of DataFrame(..., columns={'dtypes'}): a set is unordered
    # and is rejected as `columns` by recent pandas versions.
    df_qual = df.dtypes.to_frame(name='dtypes')
    df_qual['isna%'] = np.round((na_counts / len(df)) * 100, 1)  # share of missing values in %
    df_qual['isna'] = na_counts  # absolute number of missing values
    df_qual['unique'] = df.nunique()  # number of distinct non-null values
    return df_qual.sort_values(by='isna%', ascending=False)[:top]  # worst columns first, shortlisted

In [3]:
# Importing our main file from AWS S3
aws_key = aws_creds.aws_key
aws_secret = aws_creds.aws_secret

bucket_name = aws_creds.bucket
object_key = 'df_from_sql.csv'

# NOTE(review): the key and secret are embedded in the URL itself. A secret containing
# URL-special characters (e.g. '/') may not parse correctly, and the URL must never be
# printed or logged - confirm against the smart_open documentation whether passing the
# credentials through transport parameters is preferable for the installed version.
path = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key)

# The context manager closes the S3 stream as soon as pandas has consumed it
# (the stream was previously left open for the lifetime of the kernel).
with smart_open(path) as s3_file:
    df = pd.read_csv(s3_file, index_col=[0])  # index_col=[0] so the saved index is not read as an 'Unnamed: 0' column

  df = pd.read_csv(smart_open(path), index_col=[0]) # index_col=[0] to avoid unammed:0


In [4]:
ph = pd.read_csv(filepath_or_buffer='./extra_data/public_holidays.csv')  # local public-holidays file, merged into df further down

In [5]:
df.shape # (rows, columns) of the main dataframe loaded from S3

(367261, 53)

#### Translating features to english

In [6]:
# Translating the column headers to lowercase English names.
# The mapping is grouped by the tags used in the original cell
# (ppl / loc / specs / vehicule - presumably the source table of each field; TODO confirm).
column_translation = {
    'Num_Acc': 'acc_num',

    # --- ppl ---
    'id_vehicule': 'id_vehicule',
    'num_veh': 'num_vehicule',
    'place': 'seat',
    'catu': 'user_type',
    'grav': 'injury_level',
    'sexe': 'gender',
    'an_nais': 'dob',
    'trajet': 'journey_reason',
    'secu1': 'safety_feat_1',
    'secu2': 'safety_feat_2',
    'secu3': 'safety_feat_3',
    'locp': 'pedestrian_loc',
    'actp': 'pedestrian_action',
    'etatp': 'pedestrian_group',

    # --- loc ---
    'catr': 'road_type',
    'voie': 'lane_num',
    'v1': 'road_num',
    'v2': 'road_alphanum',
    'circ': 'travelling_direction',
    'nbv': 'num_operating_lanes',
    'vosp': 'reserved_path',
    'prof': 'road_incline',
    'pr': 'milestone_num',
    'pr1': 'nearest_milestone_distance',
    'plan': 'road_curve',
    'lartpc': 'road_separator_width',
    'larrout': 'width_road_only',
    'surf': 'road_surface',
    'infra': 'infra_type',
    'situ': 'road_location_accident',
    'vma': 'max_speed_allowed',

    # --- specs ---
    'jour': 'day',
    'mois': 'month',
    'an': 'year',
    'hrmn': 'hhmm',
    'lum': 'light_conditions',
    'dep': 'department_fr',
    'com': 'city',
    'agg': 'in_out_city',
    'int': 'crossing',
    'atm': 'weather',
    'col': 'collision_type',
    'adr': 'postal_address',
    'lat': 'lat',
    'long': 'long',

    # --- vehicule ---
    'senc': 'navigation_direction',
    'catv': 'vehicule_type',
    'obs': 'fixed_obstacle_hurt',
    'obsm': 'mobile_obstacle_hurt',
    'choc': 'impact_location',
    'manv': 'was_doing',
    'motor': 'engine_type',
    'occutc': 'passenger_on_public_transport',
}

# rename() ignores mapping keys that are not present in df, so re-running this cell is harmless
df = df.rename(columns=column_translation)

In [7]:
df.head() # check: the column headers should now show the English names

Unnamed: 0,acc_num,id_vehicule,seat,user_type,injury_level,gender,dob,journey_reason,safety_feat_1,safety_feat_2,...,light_conditions,department_fr,city,in_out_city,crossing,weather,collision_type,postal_address,lat,long
0,201900000001,138 306 524,1,1,4,2,1993.0,5,1,0,...,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
1,201900000001,138 306 524,2,2,4,2,2002.0,0,1,0,...,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
2,201900000001,138 306 525,1,1,1,1,1959.0,0,1,0,...,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
3,201900000002,138 306 523,1,1,4,2,1994.0,0,1,0,...,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000
4,201900000003,138 306 520,1,1,1,1,1996.0,0,1,0,...,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744


#### Transforming classes into english to perform a meaningful EDA

In [8]:
# The column headers (aka features) are now translated, but the classes stored in each
# feature are still numeric codes. We need to map them to their meaning
# (ex: gender 1 = m, 2 = f) to visualize the data with readable labels.

# All class translations are stored locally in a whitespace-separated .txt file.
# Rows are collected in a list and the dataframe is built once: calling pd.concat
# inside the loop copies the whole frame on every line (quadratic) and leaves every
# row with the same index label 0.
dictionary_rows = []
with open("./data_dictionary/dictionary.txt") as f:
    for line in f:
        tokens = line.split()  # tokens stay strings: [masterkey, key, value, ...]
        if tokens:  # skip blank lines
            dictionary_rows.append(tokens)

df_master_key = pd.DataFrame(dictionary_rows)

# renaming columns headers (the first three tokens of each line)
df_master_key = df_master_key.rename(columns={0:'masterkey',1:'key',2:'value'})

# creating empty python dictionary, filled feature by feature in the next cell
master_key_dictionary = {}

# storing parent keys (the feature names each mapping belongs to), in order of first appearance
categories = df_master_key['masterkey'].unique()

In [9]:
# Fill master_key_dictionary with one {key: value} mapping per feature name found in the mapping dataframe
for category in categories:
    rows_for_category = df_master_key.loc[df_master_key['masterkey'] == category]  # only the rows of this feature
    key_to_value = rows_for_category.set_index('key')['value'].to_dict()  # if a key is repeated, the last row wins
    master_key_dictionary[category] = key_to_value

In [10]:
# Sanity check only - the dictionary keys are NOT converted here.
# The previous loop assigned np.int64(key) to its own loop variable, which never modified
# master_key_dictionary: the keys stay as the strings parsed from the dictionary file, and the
# dataframe columns are cast to str before the mapping is applied further down.
# What this cell does do is verify that every key parses as an integer code
# (np.int64 raises ValueError otherwise).
for category, code_to_label in master_key_dictionary.items():
    for code in code_to_label:
        np.int64(code)

In [11]:
# Columns whose integer codes get replaced by labels; the loops below iterate over this list.
# Group comments reuse the tags from the rename cell (specs / loc / vehicule / ppl).
non_ordinal_feat = [
    # specs
    'light_conditions',
    'crossing',
    'weather',
    'collision_type',
    # loc
    'road_type',
    'travelling_direction',
    'reserved_path',
    'road_incline',
    'road_curve',
    'road_surface',
    'infra_type',
    'road_location_accident',
    # vehicule
    'navigation_direction',
    'vehicule_type',
    'fixed_obstacle_hurt',
    'mobile_obstacle_hurt',
    'impact_location',
    'was_doing',
    'engine_type',
    # ppl
    'user_type',
    'injury_level',
    'gender',
    'journey_reason',
    'safety_feat_1',
    'safety_feat_2',
    'safety_feat_3',
    'pedestrian_loc',
    'pedestrian_action',
    'pedestrian_group',
    # specs
    'in_out_city',
    # ppl
    'seat',
]

In [12]:
# Cast every listed feature to its string form, element by element:
# the mapping dictionary is keyed by strings, so the lookup needs string values
for feature in non_ordinal_feat:
    df[feature] = df[feature].map(str)

In [13]:
df.head() # check: the values look unchanged, but the listed features now hold strings

Unnamed: 0,acc_num,id_vehicule,seat,user_type,injury_level,gender,dob,journey_reason,safety_feat_1,safety_feat_2,...,light_conditions,department_fr,city,in_out_city,crossing,weather,collision_type,postal_address,lat,long
0,201900000001,138 306 524,1,1,4,2,1993.0,5,1,0,...,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
1,201900000001,138 306 524,2,2,4,2,2002.0,0,1,0,...,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
2,201900000001,138 306 525,1,1,1,1,1959.0,0,1,0,...,4,93,93053,1,1,1,2,AUTOROUTE A3,488962100,24701200
3,201900000002,138 306 523,1,1,4,2,1994.0,0,1,0,...,3,93,93066,1,1,1,6,AUTOROUTE A1,489307000,23688000
4,201900000003,138 306 520,1,1,1,1,1996.0,0,1,0,...,1,92,92036,1,1,1,4,AUTOROUTE A86,489358718,23191744


Applying the dictionary of English labels to the integer codes. This is only done so that the EDA shows readable labels.<br>
The transformation makes the file roughly 3x larger, because text labels take more storage space than integers.

In [14]:
# Replace the (string) codes of every listed feature by their English label.
# Codes that have no entry in the feature's mapping become NaN.
for feature in non_ordinal_feat:
    code_to_label = master_key_dictionary[feature]
    df[feature] = df[feature].map(code_to_label)

In [15]:
df.head() # quick check: the coded columns should now display English labels

Unnamed: 0,acc_num,id_vehicule,seat,user_type,injury_level,gender,dob,journey_reason,safety_feat_1,safety_feat_2,...,light_conditions,department_fr,city,in_out_city,crossing,weather,collision_type,postal_address,lat,long
0,201900000001,138 306 524,driver,driver,OK,f,1993.0,leasure,safety_belt,no_equip,...,night,93,93053,out_of_city,crossing,normal,two_vehicules_back,AUTOROUTE A3,488962100,24701200
1,201900000001,138 306 524,main_pass_front,passenger,OK,f,2002.0,,safety_belt,no_equip,...,night,93,93053,out_of_city,crossing,normal,two_vehicules_back,AUTOROUTE A3,488962100,24701200
2,201900000001,138 306 525,driver,driver,OK,m,1959.0,,safety_belt,no_equip,...,night,93,93053,out_of_city,crossing,normal,two_vehicules_back,AUTOROUTE A3,488962100,24701200
3,201900000002,138 306 523,driver,driver,OK,f,1994.0,,safety_belt,no_equip,...,night,93,93066,out_of_city,crossing,normal,other_collision,AUTOROUTE A1,489307000,23688000
4,201900000003,138 306 520,driver,driver,OK,m,1996.0,,safety_belt,no_equip,...,bright_light,92,92036,out_of_city,crossing,normal,three_or_more_vehicules_chain,AUTOROUTE A86,489358718,23191744


Adding datetime formatting (needed to merge with the public holidays dataframe)

In [16]:
# Creating a proper datetime column from the separate year / month / day columns
# (the former placeholder assignment df['date']="" was overwritten immediately, so it is dropped)
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

# Adding the weekday name
df['day_of_week'] = df['date'].dt.day_name()

# Adding the hour of the day, parsed from the 'HH:MM' strings
df['hour'] = pd.to_datetime(df['hhmm'], format='%H:%M').dt.hour

# Year-month label. Built from the parsed date so the month is zero-padded ('2019-01'):
# concatenating the raw year and month columns gave '2019-1', which contradicts the
# column name and sorts '2019-10' before '2019-2'.
df['YYYY-MM'] = df['date'].dt.strftime('%Y-%m')

#### Merging with public holidays df

In [17]:
# Add the public holidays data to df with a left join on the calendar date
# NOTE(review): the cell output echoes the to_datetime line several times, which looks like
# date-parsing warnings. If the CSV dates are day-first (DD/MM/YYYY), ambiguous dates may be
# parsed month-first - confirm against the file and pass dayfirst=True or an explicit format=.
ph = (
    ph
    .rename(columns={'Date':'date', 'Event':'event'})
    .assign(date=lambda holidays: pd.to_datetime(holidays['date']))
)

# suffixes=(False, False): presumably intended to make the merge fail rather than silently
# suffix column names if both frames share a non-key column - TODO confirm.
# 'Unnamed: 2' is presumably a stray empty column coming from the holidays CSV - TODO confirm.
df = (
    pd.merge(df, ph, how="left", on='date', suffixes=(False,False))
    .drop(columns = 'Unnamed: 2')
)

  ph['date'] = pd.to_datetime(ph['date'])
  ph['date'] = pd.to_datetime(ph['date'])
  ph['date'] = pd.to_datetime(ph['date'])


#### Quick checks post-transformation

In [18]:
df['injury_level'].describe() # count / unique / top / freq summary of the target column after mapping

count     367201
unique         2
top           OK
freq      300996
Name: injury_level, dtype: object

In [19]:
df['injury_level'].value_counts() # class distribution of our target variable 'injury_level' (missing values are not counted)

OK        300996
not_ok     66205
Name: injury_level, dtype: int64

In [20]:
df['injury_level'].isna().sum() # number of rows with a missing injury_level - a reasonable amount of NaN

60

In [21]:
df_quality(df) # quick quality check: dtype, missing-value share/count and unique count per column, worst columns first

Unnamed: 0,dtypes,isna%,isna,unique
road_separator_width,object,99.7,366242,59
safety_feat_3,object,98.9,363199,9
passenger_on_public_transport,float64,98.8,362920,29
event,object,97.2,356991,11
pedestrian_loc,object,92.7,340305,7
pedestrian_action,object,92.6,340168,7
road_alphanum,object,92.5,339643,44
pedestrian_group,object,92.3,338892,3
reserved_path,object,90.6,332675,2
infra_type,object,84.1,308930,9


In [22]:
# Exporting the translated dataframe to AWS S3 as 'df_english.csv'
# NOTE(review): boto3.resource('s3') is created without explicit credentials here, whereas the
# load cell uses the key/secret from aws_creds - confirm that both are meant to use the same account.
bucket = aws_creds.bucket # bucket already created on S3
s3_resource = boto3.resource('s3')
export_target = s3_resource.Object(bucket, 'df_english.csv')

# serialise the dataframe to CSV text in memory, then upload that text as the object body
csv_buffer = StringIO()
df.to_csv(csv_buffer)
export_target.put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'HBQF1J0EBS8N4EY4',
  'HostId': 'wrBQbLRaPMnCYfPe87+Ox6jiniNoZPOOlRKM7pHVYu28FVJvC2AM4F1QvQ4fVVkYHzrH8GMO2/o=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'wrBQbLRaPMnCYfPe87+Ox6jiniNoZPOOlRKM7pHVYu28FVJvC2AM4F1QvQ4fVVkYHzrH8GMO2/o=',
   'x-amz-request-id': 'HBQF1J0EBS8N4EY4',
   'date': 'Tue, 06 Dec 2022 11:31:57 GMT',
   'etag': '"b7375d3558d305156af360bd9e89ff14"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 1},
 'ETag': '"b7375d3558d305156af360bd9e89ff14"'}

We have converted the data to English and are now almost ready to perform the EDA. In the next notebook, we will look at the remaining data transformations, then move on to the EDA in Power BI and perform the necessary feature engineering before modeling.