In [84]:
import pandas as pd
import numpy as np

In [None]:
# Accident Characteristics
# 'Num_Acc': Accident ID
# 'jour': Day of the accident
# 'mois': Month of the accident
# 'an': Year of the accident
# 'hrmn': Time of the accident in hour and minutes (hhmm)
# 'lum': Lighting conditions in which the accident occurred
#     1 - Full day
#     2 - Twilight or dawn
#     3 - Night without public lighting
#     4 - Night with public lighting not lit
#     5 - Night with public lighting on
# 'dep': Department: INSEE Code of the department
# 'com': Municipality: The commune number by INSEE
# 'agg': Agglomeration
#     1 - Out of agglomeration
#     2 - In built-up areas
# 'int': Type of Intersection
#     1 - Out of intersection
#     2 - Intersection in X
#     3 - Intersection in T
#     4 - Intersection in Y
#     5 - Intersection with more than 4 branches
#     6 - Giratory
#     7 - Place
#     8 - Level crossing
#     9 - Other intersection
# 'atm': Atmospheric conditions
#     1 - Normal
#     2 - Light rain
#     3 - Heavy rain
#     4 - Snow - hail
#     5 - Fog - smoke
#     6 - Strong wind - storm
#     7 - Dazzling weather
#     8 - Cloudy weather
#     9 - Other
# 'col': Type of collision
#     1 - Two vehicles - frontal
#     2 - Two vehicles - from the rear
#     3 - Two vehicles - by the side
#     4 - Three vehicles and more - in chain
#     5 - Three or more vehicles - multiple collisions
#     6 - Other collision
#     7 - Without collision
# 'adr': Postal address for accidents in built-up areas
# 'gps': GPS coding
#     M - Métropole
#     A - Antilles (Martinique or Guadeloupe)
#     G - Guyane
#     R - Réunion
#     Y - Mayotte
# 'lat': Latitude
# 'long': Longitude


In [85]:
try:
    df_characteristics = pd.read_csv("caracteristics.csv", encoding='latin1')
    display(df_characteristics.head())
except UnicodeDecodeError:
    print("latin1 encoding didn't work. Trying another encoding...")

  df_characteristics = pd.read_csv("caracteristics.csv", encoding='latin1')


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,gps,lat,long,dep
0,201600000001,16,2,1,1445,1,2,1,8.0,3.0,5.0,"46, rue Sonneville",M,0.0,0.0,590
1,201600000002,16,3,16,1800,1,2,6,1.0,6.0,5.0,1a rue du cimetière,M,0.0,0.0,590
2,201600000003,16,7,13,1900,1,1,1,1.0,6.0,11.0,,M,0.0,0.0,590
3,201600000004,16,8,15,1930,2,2,1,7.0,3.0,477.0,52 rue victor hugo,M,0.0,0.0,590
4,201600000005,16,12,23,1100,1,2,3,1.0,3.0,11.0,rue Joliot curie,M,0.0,0.0,590


In [86]:
df_characteristics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 839985 entries, 0 to 839984
Data columns (total 16 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   Num_Acc  839985 non-null  int64  
 1   an       839985 non-null  int64  
 2   mois     839985 non-null  int64  
 3   jour     839985 non-null  int64  
 4   hrmn     839985 non-null  int64  
 5   lum      839985 non-null  int64  
 6   agg      839985 non-null  int64  
 7   int      839985 non-null  int64  
 8   atm      839930 non-null  float64
 9   col      839974 non-null  float64
 10  com      839983 non-null  float64
 11  adr      699443 non-null  object 
 12  gps      366226 non-null  object 
 13  lat      362471 non-null  float64
 14  long     362467 non-null  object 
 15  dep      839985 non-null  int64  
dtypes: float64(4), int64(9), object(3)
memory usage: 102.5+ MB


In [87]:
df_holidays = pd.read_csv("holidays.csv", encoding='latin1')

display(df_holidays.head())

df_holidays.info()

Unnamed: 0,ds,holiday
0,2005-01-01,New year
1,2005-03-28,Easter Monday
2,2005-05-01,Labour Day
3,2005-05-05,Ascension Thursday
4,2005-05-08,Victory in Europe Day


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ds       132 non-null    object
 1   holiday  132 non-null    object
dtypes: object(2)
memory usage: 2.2+ KB


In [88]:
# Place Characteristics
# 'catr': Category of road
#     1 - Highway
#     2 - National Road
#     3 - Departmental Road
#     4 - Communal Way
#     5 - Off public network
#     6 - Parking lot open to public traffic
#     9 - Other
# 'voie': Road Number
# 'v1': Numeric index of the route number
# 'v2': Letter alphanumeric index of the road
# 'circ': Traffic regime
#     1 - One way
#     2 - Bidirectional
#     3 - Separated carriageways
#     4 - With variable assignment channels
# 'nbv': Total number of traffic lanes
# 'vosp': Indicates the existence of a reserved lane
#     1 - Bike path
#     2 - Cycle Bank
#     3 - Reserved channel
# 'prof': Longitudinal profile (gradient of the road)
#     1 - Dish
#     2 - Slope
#     3 - Hilltop
#     4 - Hill bottom
# 'pr': Home PR number (upstream terminal number)
# 'pr1': Distance in meters to the PR (relative to the upstream terminal)
# 'plan': Drawing in plan
#     1 - Straight part
#     2 - Curved on the left
#     3 - Curved right
#     4 - In "S"
# 'lartpc': Central solid land width (TPC) if there is
# 'larrout': Width of the roadway assigned to vehicle traffic
# 'surf': Surface condition
#     1 - Normal
#     2 - Wet
#     3 - Puddles
#     4 - Flooded
#     5 - Snow
#     6 - Mud
#     7 - Icy
#     8 - Fat - oil
#     9 - Other
# 'infra': Development - Infrastructure
#     1 - Underground - tunnel
#     2 - Bridge - autopont
#     3 - Exchanger or connection brace
#     4 - Railway
#     5 - Carrefour arranged
#     6 - Pedestrian area
#     7 - Toll zone
# 'situ': Situation of the accident
#     1 - On the road
#     2 - On emergency stop band
#     3 - On the verge
#     4 - On the sidewalk
#     5 - On bike path
# 'env1': School point: near a school

In [89]:
df_places = pd.read_csv("places.csv")

display(df_places.head())

df_places.info()

  df_places = pd.read_csv("places.csv")


Unnamed: 0,Num_Acc,catr,voie,v1,v2,circ,nbv,pr,pr1,vosp,prof,plan,lartpc,larrout,surf,infra,situ,env1
0,201600000001,3.0,39,,,2.0,0.0,,,0.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0
1,201600000002,3.0,39,,,1.0,0.0,,,0.0,1.0,2.0,0.0,58.0,1.0,0.0,1.0,0.0
2,201600000003,3.0,1,,,2.0,2.0,,,0.0,1.0,3.0,0.0,68.0,2.0,0.0,3.0,99.0
3,201600000004,4.0,0,,,2.0,0.0,,,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,99.0
4,201600000005,4.0,0,,,0.0,0.0,,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,3.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 839985 entries, 0 to 839984
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   Num_Acc  839985 non-null  int64  
 1   catr     839984 non-null  float64
 2   voie     780914 non-null  object 
 3   v1       332816 non-null  float64
 4   v2       33953 non-null   object 
 5   circ     839187 non-null  float64
 6   nbv      838195 non-null  float64
 7   pr       414770 non-null  float64
 8   pr1      413463 non-null  float64
 9   vosp     838345 non-null  float64
 10  prof     838924 non-null  float64
 11  plan     838909 non-null  float64
 12  lartpc   830440 non-null  float64
 13  larrout  831706 non-null  float64
 14  surf     838968 non-null  float64
 15  infra    838707 non-null  float64
 16  situ     838983 non-null  float64
 17  env1     838709 non-null  float64
dtypes: float64(15), int64(1), object(2)
memory usage: 115.4+ MB


In [90]:
# User Characteristics
# 'catu': User category
#     1 - Driver
#     2 - Passenger
#     3 - Pedestrian
#     4 - Pedestrian in rollerblade or scooter
# 'grav': Severity of the accident
#     1 - Unscathed
#     2 - Killed
#     3 - Hospitalized wounded
#     4 - Light injury
# 'sex': Sex of the user
#     1 - Male
#     2 - Female
# 'Year_on': Year of birth of the user
# 'trip': Reason for traveling
#     1 - Home - work
#     2 - Home - school
#     3 - Shopping
#     4 - Professional use
#     5 - Promenade - leisure
#     9 - Other
# 'secu': Safety equipment (two characters: existence and use)
#     Existence
#         1 - Belt
#         2 - Helmet
#         3 - Children's device
#         4 - Reflective equipment
#         9 - Other
#     Use
#         1 - Yes
#         2 - No
#         3 - Not determinable
# 'locp': Location of the pedestrian
#     1 - A + 50 m from the pedestrian crossing
#     2 - A - 50 m from the pedestrian crossing
#     3 - On pedestrian crossing without light signaling
#     4 - On pedestrian crossing with light signaling
#     5 - On the sidewalk
#     6 - On the verge
#     7 - On refuge or BAU
#     8 - On against aisle
# 'actp': Action of the pedestrian
#     0 - Not specified or not applicable
#     1 - Moving in the direction of bumping vehicle
#     2 - Opposite direction of the vehicle
#     3 - Crossing
#     4 - Masked
#     5 - Playing - running
#     6 - With animal
#     9 - Other
# 'etatp': Injured pedestrian status
#     1 - Only
#     2 - Accompanied
#     3 - In a group

In [91]:
df_users = pd.read_csv("users.csv")

display(df_users.head())

df_users.info()

Unnamed: 0,Num_Acc,place,catu,grav,sexe,trajet,secu,locp,actp,etatp,an_nais,num_veh
0,201600000001,1.0,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,B02
1,201600000001,1.0,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0,A01
2,201600000002,1.0,1,3,1,5.0,11.0,0.0,0.0,0.0,1960.0,A01
3,201600000002,2.0,2,3,1,0.0,11.0,0.0,0.0,0.0,2000.0,A01
4,201600000002,3.0,2,3,2,0.0,11.0,0.0,0.0,0.0,1962.0,A01


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1876005 entries, 0 to 1876004
Data columns (total 12 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Num_Acc  int64  
 1   place    float64
 2   catu     int64  
 3   grav     int64  
 4   sexe     int64  
 5   trajet   float64
 6   secu     float64
 7   locp     float64
 8   actp     float64
 9   etatp    float64
 10  an_nais  float64
 11  num_veh  object 
dtypes: float64(7), int64(4), object(1)
memory usage: 171.8+ MB


In [92]:
# Vehicle Characteristics
# 'Num_Veh': Vehicle identifier
# 'GP': Flow direction
#     1 - PK or PR or increasing postal address number
#     2 - PK or PR or descending postal address number
# 'CATV': Category of vehicle
#     01 - Bicycle
#     02 - Moped <50cm3
#     03 - Cart (Quadricycle with bodied motor)
#     04 - Not used since 2006 (registered scooter)
#     05 - Not used since 2006 (motorcycle)
#     06 - Not used since 2006 (side-car)
#     07 - VL only
#     08 - Not used category (VL + caravan)
#     09 - Not used category (VL + trailer)
#     10 - VU only 1,5T <= GVW <= 3,5T with or without trailer
#     11 - Most used since 2006 (VU (10) + caravan)
#     12 - Most used since 2006 (VU (10) + trailer)
#     13 - PL only 3,5T <= GVW <= 7,5T

In [93]:
df_vehicles = pd.read_csv("vehicles.csv")

display(df_vehicles.head())

df_vehicles.info()

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv,num_veh
0,201600000001,0.0,7,0,0.0,0.0,1.0,1.0,B02
1,201600000001,0.0,2,0,0.0,0.0,7.0,15.0,A01
2,201600000002,0.0,7,0,6.0,0.0,1.0,1.0,A01
3,201600000003,0.0,7,0,0.0,1.0,6.0,1.0,A01
4,201600000004,0.0,32,0,0.0,0.0,1.0,1.0,B02


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433389 entries, 0 to 1433388
Data columns (total 9 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   Num_Acc  1433389 non-null  int64  
 1   senc     1433317 non-null  float64
 2   catv     1433389 non-null  int64  
 3   occutc   1433389 non-null  int64  
 4   obs      1432627 non-null  float64
 5   obsm     1432788 non-null  float64
 6   choc     1433160 non-null  float64
 7   manv     1433083 non-null  float64
 8   num_veh  1433389 non-null  object 
dtypes: float64(5), int64(3), object(1)
memory usage: 98.4+ MB


In [94]:
# Merge dataframes on 'Num_Acc'
df = df_characteristics.merge(df_places, on='Num_Acc', how='left')
df = df.merge(df_vehicles, on='Num_Acc', how='left')
df = df.merge(df_users, on='Num_Acc', how='left')

#for mearging df_holidays we need to splt the Date given in the Dataframe trough pd.datetime before merging
# Convert the 'ds' column in df_holidays to datetime format
df_holidays['ds'] = pd.to_datetime(df_holidays['ds'])

# Extract the day of the month and add as a new column 'day'
df_holidays['jour'] = df_holidays['ds'].dt.day
df_holidays['mois'] = df_holidays['ds'].dt.month
df_holidays['an'] = df_holidays['ds'].dt.year


In [95]:
display(df_holidays.head())

display(df.head())

Unnamed: 0,ds,holiday,jour,mois,an
0,2005-01-01,New year,1,1,2005
1,2005-03-28,Easter Monday,28,3,2005
2,2005-05-01,Labour Day,1,5,2005
3,2005-05-05,Ascension Thursday,5,5,2005
4,2005-05-08,Victory in Europe Day,8,5,2005


Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catu,grav,sexe,trajet,secu,locp,actp,etatp,an_nais,num_veh_y
0,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,B02
1,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0,A01
2,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,B02
3,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0,A01
4,201600000002,16,3,16,1800,1,2,6,1.0,6.0,...,1,3,1,5.0,11.0,0.0,0.0,0.0,1960.0,A01


In [96]:
# Merge holidays dataframe
df = df.merge(df_holidays, on=['jour', 'mois', 'an'], how= 'outer')
display(df.head())

print(df.info())

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,sexe,trajet,secu,locp,actp,etatp,an_nais,num_veh_y,ds,holiday
0,201600000000.0,16,2,1,1445.0,1.0,2.0,1.0,8.0,3.0,...,2.0,0.0,11.0,0.0,0.0,0.0,1983.0,B02,NaT,
1,201600000000.0,16,2,1,1445.0,1.0,2.0,1.0,8.0,3.0,...,1.0,9.0,21.0,0.0,0.0,0.0,2001.0,A01,NaT,
2,201600000000.0,16,2,1,1445.0,1.0,2.0,1.0,8.0,3.0,...,2.0,0.0,11.0,0.0,0.0,0.0,1983.0,B02,NaT,
3,201600000000.0,16,2,1,1445.0,1.0,2.0,1.0,8.0,3.0,...,1.0,9.0,21.0,0.0,0.0,0.0,2001.0,A01,NaT,
4,201600000000.0,16,2,1,1430.0,1.0,1.0,1.0,6.0,1.0,...,1.0,5.0,11.0,0.0,0.0,0.0,1956.0,A01,NaT,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554108 entries, 0 to 3554107
Data columns (total 54 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Num_Acc    float64       
 1   an         int64         
 2   mois       int64         
 3   jour       int64         
 4   hrmn       float64       
 5   lum        float64       
 6   agg        float64       
 7   int        float64       
 8   atm        float64       
 9   col        float64       
 10  com        float64       
 11  adr        object        
 12  gps        object        
 13  lat        float64       
 14  long       object        
 15  dep        float64       
 16  catr       float64       
 17  voie       object        
 18  v1         float64       
 19  v2         object        
 20  circ       float64       
 21  nbv        float64       
 22  pr         float64       
 23  pr1        float64       
 24  vosp       float64       
 25  prof       float64       
 26  plan       flo

In [68]:
'''
The features we use:

Categorical Features:

    lum: Lighting conditions in which the accident occurred
    agg: Indicates whether the accident occurred in an agglomeration or not. (Categorical: Yes/No)
    int: Type of intersection where the accident happened. (Various intersection types: ..)
    atm: Atmospheric conditions at the time of the accident
    col: Type of collision that occurred. (Types of collision such as frontal, rear, side, etc.)
    catr: Category of the road where the accident occurred
    circ: Type of traffic circulation at the accident site
    prof: Road profile. (Types of road profiles: flat, slope, etc.)
    plan: Road layout. (Types of road layouts: straight, curved, etc.)
    surf: Road surface condition. (Surface conditions: dry, wet, etc.)
    infra: Road infrastructure. (Types of infrastructure: crosswalk, pedestrian lane, etc.)
    situ: Location of the accident site. (Locations: roadside, intersection, etc.)
    catu: Category of road users involved
    sexe: Gender of road users involved
    trajet: Purpose of the trip
    secu: Safety equipment used by road users
    actp: Action performed by the road user during the accident
    etatp: Condition of the road user during the accident
    senc: Direction of the vehicle involved in the accident
    catv: Category of the vehicle involved
    obs: Obstacle struck by the vehicle during the accident
    obsm: Obstacle marking
    choc: Impact configuration during the accident
    manv: Maneuver performed by the vehicle during the accident
    holiday: 

Numerical Features:

    nbv: Number of lanes
    occutc: Number of casualties in the vehicle
'''

'\nThe features we use:\nlum: Lighting conditions in which the accident occurred. It has different values indicating the level of illumination.\nagg: Indicates whether the accident occurred in an agglomeration or not.\nint: Type of intersection where the accident happened. It includes various intersection types.\natm: Atmospheric conditions at the time of the accident. It includes different weather conditions.\ncol: Type of collision that occurred, such as frontal, rear, side, etc.\ncatr: Category of the road where the accident occurred.\ncirc: Type of traffic circulation at the accident site.\nnbv: Number of lanes.\nprof: Road profile (flat, slope, etc.).\nplan: Road layout (straight, curved, etc.).\nsurf: Road surface condition (dry, wet, etc.).\ninfra: Road infrastructure (crosswalk, pedestrian lane, etc.).\nsitu: Location of the accident site (roadside, intersection, etc.).\ncatu: Category of road users involved in the accident.\nsexe: Gender of road users involved.\ntrajet: Purpos

In [99]:
# Define features and target
features = ['lum', 'agg', 'int', 'atm', 'col', 'catr', 'circ', 'nbv', 'prof', 'plan', 'surf', 'infra', 'situ', 
            'catu', 'sexe', 'trajet', 'secu', 'actp', 'etatp', 'senc', 'catv', 'occutc', 'obs', 'obsm', 'choc', 'manv']
target = 'grav'

In [100]:
df['holiday'].fillna(value="No holiday", inplace=True)

In [107]:
print(df[['jour', 'mois', 'an', 'hrmn']])

        jour mois    an  hrmn
0          1    2    16  1445
1          1    2    16  1445
2          1    2    16  1445
3          1    2    16  1445
4          1    2    16  1430
...      ...  ...   ...   ...
3554103   14    7  2016  0000
3554104   15    8  2016  0000
3554105    1   11  2016  0000
3554106   11   11  2016  0000
3554107   25   12  2016  0000

[3554108 rows x 4 columns]


In [106]:
#--Creating a column datetime
df['datetime_str'] = df['jour'].astype(str) + '/' + df['mois'].astype(str) + '/' + df['an'].astype(str) + ' ' + df['hrmn']
df['datetime'] = pd.to_datetime(df['datetime_str'], format='%d/%m/%Y %H:%M')

ValueError: time data "1/2/16 1445" doesn't match format "%d/%m/%Y %H:%M", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
df.drop(columns=['datetime_str'], inplace=True)
df["datetime"].sort_values()

In [67]:
df.isna().sum()

Num_Acc          132
an                 0
mois               0
jour               0
hrmn             132
lum              132
agg              132
int              132
atm              340
col              182
com              148
adr           663738
gps          1954045
lat          1970459
long         1970479
dep              132
catr             134
voie          220599
v1           2129796
v2           3394458
circ            3375
nbv             7918
pr           1641425
pr1          1646755
vosp            7216
prof            4505
plan            4606
lartpc         40277
larrout        34825
surf            4344
infra           5593
situ            4329
env1            5620
senc             301
catv             132
occutc           132
obs             3776
obsm            1830
choc            1246
manv            2408
num_veh_x        132
place         104222
catu             132
grav             132
sexe             132
trajet           960
secu           46570
locp         

In [70]:
# Separate features and target
X = df[features]
y = df[target]

display(X.head())

Unnamed: 0,lum,agg,int,atm,col,catr,circ,nbv,prof,plan,...,secu,actp,etatp,senc,catv,occutc,obs,obsm,choc,manv
0,1.0,2.0,1.0,8.0,3.0,3.0,2.0,0.0,1.0,3.0,...,11.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,1.0,1.0
1,1.0,2.0,1.0,8.0,3.0,3.0,2.0,0.0,1.0,3.0,...,21.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,1.0,1.0
2,1.0,2.0,1.0,8.0,3.0,3.0,2.0,0.0,1.0,3.0,...,11.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,7.0,15.0
3,1.0,2.0,1.0,8.0,3.0,3.0,2.0,0.0,1.0,3.0,...,21.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,7.0,15.0
4,1.0,1.0,1.0,6.0,1.0,3.0,2.0,2.0,1.0,1.0,...,11.0,0.0,0.0,1.0,7.0,0.0,0.0,2.0,1.0,15.0


In [None]:
categorical_features = df[features].drop(['nbv', 'occutc'])
numerical_features = df[['nbv', 'occutc']]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Create  the model
model = RandomForestClassifier()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))