In [1]:
import pandas as P

In [2]:
D = P.read_csv("Nigeria Crash Data.csv")

In [3]:
print(D.columns)

Index(['Quarter', 'State', 'Total_Crashes', 'Num_Injured', 'Num_Killed',
       'Total_Vehicles_Involved', 'SPV', 'DAD', 'PWR', 'FTQ', 'Other_Factors'],
      dtype='object')


In [4]:
Missing_Values = D.isnull().sum()
print(Missing_Values)

Quarter                    0
State                      0
Total_Crashes              0
Num_Injured                0
Num_Killed                 0
Total_Vehicles_Involved    0
SPV                        0
DAD                        0
PWR                        0
FTQ                        0
Other_Factors              0
dtype: int64


In [5]:
print(D.dtypes)

Quarter                    object
State                      object
Total_Crashes               int64
Num_Injured                 int64
Num_Killed                  int64
Total_Vehicles_Involved     int64
SPV                         int64
DAD                         int64
PWR                         int64
FTQ                         int64
Other_Factors               int64
dtype: object


In [6]:
D["Quarter"] = P.to_numeric(D["Quarter"] , errors = "coerce")

In [7]:
D["State"] = P.to_numeric(D["State"] , errors = "coerce")

In [8]:
print(D.dtypes)

Quarter                    float64
State                      float64
Total_Crashes                int64
Num_Injured                  int64
Num_Killed                   int64
Total_Vehicles_Involved      int64
SPV                          int64
DAD                          int64
PWR                          int64
FTQ                          int64
Other_Factors                int64
dtype: object


In [9]:
Drop_Duplicates = D.drop_duplicates()
print(Drop_Duplicates)

     Quarter  State  Total_Crashes  Num_Injured  Num_Killed  \
0        NaN    NaN             30          146          31   
1        NaN    NaN             77          234          36   
2        NaN    NaN             22           28           7   
3        NaN    NaN             72          152          20   
4        NaN    NaN            154          685          90   
..       ...    ...            ...          ...         ...   
513      NaN    NaN             15           32           4   
514      NaN    NaN             24          122          41   
515      NaN    NaN             38           98          17   
516      NaN    NaN             39          234          13   
517      NaN    NaN             13           61          14   

     Total_Vehicles_Involved  SPV  DAD  PWR  FTQ  Other_Factors  
0                         37   19    0    0    0             18  
1                         94   57    0    0    0             37  
2                         24   15    0    0  

In [10]:
Check_Negetive = (D < 0).any ()

In [11]:
print(Check_Negetive)

Quarter                    False
State                      False
Total_Crashes              False
Num_Injured                False
Num_Killed                 False
Total_Vehicles_Involved    False
SPV                        False
DAD                        False
PWR                        False
FTQ                        False
Other_Factors               True
dtype: bool


In [12]:
print(D.columns)

Index(['Quarter', 'State', 'Total_Crashes', 'Num_Injured', 'Num_Killed',
       'Total_Vehicles_Involved', 'SPV', 'DAD', 'PWR', 'FTQ', 'Other_Factors'],
      dtype='object')


In [13]:
D["Other_Factors"] = D["Other_Factors"].abs()

In [14]:
Q1 = D.quantile(0.25)
Q3 = D.quantile(0.75)
IQR = Q3 - Q1


In [15]:
Outlier = (D < (Q1 - 1.5 * IQR)) | (D > (Q3 + 1.5 * IQR))
print(Outlier)
print(Outlier.sum())

     Quarter  State  Total_Crashes  Num_Injured  Num_Killed  \
0      False  False          False        False       False   
1      False  False          False        False       False   
2      False  False          False        False       False   
3      False  False          False        False       False   
4      False  False          False         True       False   
..       ...    ...            ...          ...         ...   
513    False  False          False        False       False   
514    False  False          False        False       False   
515    False  False          False        False       False   
516    False  False          False        False       False   
517    False  False          False        False       False   

     Total_Vehicles_Involved    SPV    DAD    PWR    FTQ  Other_Factors  
0                      False  False  False  False  False          False  
1                      False  False  False  False  False          False  
2                    

In [16]:
L = Q1 - 1.5 * IQR
U = Q3 + 1.5 * IQR
D = D.clip(lower = L, upper = U, axis=1)


In [17]:
print(D.columns)

Index(['Quarter', 'State', 'Total_Crashes', 'Num_Injured', 'Num_Killed',
       'Total_Vehicles_Involved', 'SPV', 'DAD', 'PWR', 'FTQ', 'Other_Factors'],
      dtype='object')


In [18]:
Columns_Drop = D["Other_Factors"]


In [19]:
D.drop(columns = ["Other_Factors"] , inplace = True)

In [20]:
Category = D.select_dtypes(include = "object").columns

In [21]:
print(Category)

Index([], dtype='object')


In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
L = LabelEncoder ()

In [24]:
D[Category] = D[Category].apply(L.fit_transform)

In [25]:
C = D.select_dtypes(include='number').columns


In [26]:
print(C)

Index(['Quarter', 'State', 'Total_Crashes', 'Num_Injured', 'Num_Killed',
       'Total_Vehicles_Involved', 'SPV', 'DAD', 'PWR', 'FTQ'],
      dtype='object')


In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [28]:
D.isnull().sum()


D = D.fillna(0)

In [29]:
D_scaled = D.copy()
D_scaled[C] = scaler.fit_transform(D_scaled[C])


In [30]:
print(D_scaled)

     Quarter  State  Total_Crashes  Num_Injured  Num_Killed  \
0        0.0    0.0      -0.789933    -0.508843   -0.251305   
1        0.0    0.0      -0.023955    -0.033263   -0.091148   
2        0.0    0.0      -0.920312    -1.146552   -1.020061   
3        0.0    0.0      -0.105442    -0.476417   -0.603651   
4        0.0    0.0       1.230946     2.400706    1.638553   
..       ...    ...            ...          ...         ...   
513      0.0    0.0      -1.034394    -1.124935   -1.116155   
514      0.0    0.0      -0.887717    -0.638547    0.069010   
515      0.0    0.0      -0.659553    -0.768250   -0.699746   
516      0.0    0.0      -0.643256    -0.033263   -0.827872   
517      0.0    0.0      -1.066989    -0.968210   -0.795840   

     Total_Vehicles_Involved       SPV  DAD  PWR       FTQ  
0                  -0.692582 -0.346619  0.0  0.0 -0.671464  
1                   0.019787  0.573584  0.0  0.0 -0.671464  
2                  -0.855053 -0.443482  0.0  0.0 -0.621067  

In [33]:
print(D.columns)

Index(['Quarter', 'State', 'Total_Crashes', 'Num_Injured', 'Num_Killed',
       'Total_Vehicles_Involved', 'SPV', 'DAD', 'PWR', 'FTQ'],
      dtype='object')


In [38]:
X = D.drop(columns = ["Num_Killed"] , axis = 1)
Y = D["Num_Killed"]

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_Train , X_Test , Y_Train , Y_Test = train_test_split(X , Y , test_size = 0.2 , random_state = 42)

In [41]:
C_Final_Null = D.isnull().sum()
print(C_Final_Null)

Quarter                    0
State                      0
Total_Crashes              0
Num_Injured                0
Num_Killed                 0
Total_Vehicles_Involved    0
SPV                        0
DAD                        0
PWR                        0
FTQ                        0
dtype: int64


In [43]:
C_Final_Text = D.select_dtypes(include = "object").columns

In [44]:
print(C_Final_Text)

Index([], dtype='object')


In [45]:
Save_File = D.to_csv("Nigeria Crash Data (New).csv")