In [25]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix , auc, f1_score, roc_curve


In [2]:
df = pd.read_csv('./car_sensors.csv')
df.head(3)

Unnamed: 0,safe,S1,S2,S3,S4,S5,S6,S7,S8,S9,...,S13,S14,S15,S16,S17,S18,S19,S20,S21,S22
0,1,36.2247,10.7733,0.243897,596,100.671,0.0,0.0,1,28,...,1,57,0.0,0.28,240,5.99375,0,0.0,4,14.9382
1,1,35.7343,17.4551,0.243897,600,100.0,0.0,0.0,1,14,...,1,57,0.0,0.175,240,5.99375,0,0.0,4,14.8827
2,1,31.6561,7.61366,0.308763,604,99.3377,0.0,0.0,1,4,...,1,58,0.0,0.28,240,5.99375,0,0.0,4,14.6005


In [3]:
df.shape

(33239, 23)

In [4]:
print(df.columns)

Index(['safe', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10',
       'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20',
       'S21', 'S22'],
      dtype='object')


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33239 entries, 0 to 33238
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   safe    33239 non-null  int64  
 1   S1      33239 non-null  float64
 2   S2      33239 non-null  float64
 3   S3      33239 non-null  float64
 4   S4      33239 non-null  int64  
 5   S5      33239 non-null  float64
 6   S6      33239 non-null  float64
 7   S7      33239 non-null  float64
 8   S8      33239 non-null  int64  
 9   S9      33239 non-null  int64  
 10  S10     33239 non-null  float64
 11  S11     33239 non-null  int64  
 12  S12     33239 non-null  int64  
 13  S13     33239 non-null  int64  
 14  S14     33239 non-null  int64  
 15  S15     33239 non-null  float64
 16  S16     33239 non-null  float64
 17  S17     33239 non-null  int64  
 18  S18     33239 non-null  float64
 19  S19     33239 non-null  int64  
 20  S20     33239 non-null  float64
 21  S21     33239 non-null  int64  
 22

In [7]:
print(df.isnull().sum())

safe    0
S1      0
S2      0
S3      0
S4      0
S5      0
S6      0
S7      0
S8      0
S9      0
S10     0
S11     0
S12     0
S13     0
S14     0
S15     0
S16     0
S17     0
S18     0
S19     0
S20     0
S21     0
S22     0
dtype: int64


In [9]:
all_cols = df.describe()
print(all_cols)

               safe            S1            S2            S3             S4  \
count  33239.000000  33239.000000  33239.000000  33239.000000   33239.000000   
mean       0.575800     35.460193     12.036078      0.175710     837.136617   
std        0.494228      7.265388      3.749871      0.330817    2187.282185   
min        0.000000    -22.159000    -45.614600      0.038920     136.000000   
25%        0.000000     31.793350      9.915270      0.092110     668.000000   
50%        1.000000     34.156300     11.434900      0.105083     800.000000   
75%        1.000000     37.366450     13.721450      0.137516     900.000000   
max        1.000000    101.341000     71.154000     11.720000  228812.000000   

                 S5            S6            S7            S8            S9  \
count  33239.000000  33239.000000  33239.000000  33239.000000  33239.000000   
mean      77.981648     10.437655    103.317494      0.279190     -4.048918   
std       18.947660     13.960038    127.5

In [16]:
val_count = df['safe'].value_counts()
print(val_count)
#percentage of safe and unsafe
safe_pct = (val_count[0] / df.shape[0]) * 100
unsafe_pct = (val_count[1] / df.shape[0]) * 100
print(f"Safe Percentage: {safe_pct:.2f}%")
print(f"Unsafe Percentage: {unsafe_pct:.2f}%")

safe
1    19139
0    14100
Name: count, dtype: int64
Safe Percentage: 42.42%
Unsafe Percentage: 57.58%


In [21]:
X = df.drop('safe', axis=1)
y = df['safe']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

print("X_train Shape ",X_train.shape)
print("y_train Shape ", y_train.shape)
print("X_test Shape ",X_test.shape)
print("y_test Shape ", y_test.shape)


X_train Shape  (26591, 22)
y_train Shape  (26591,)
X_test Shape  (6648, 22)
y_test Shape  (6648,)


In [29]:
# first building a decision tree classifier
dt_model = DecisionTreeClassifier(max_depth = 7)
dt_model.fit(X_train, y_train)

## accuracy on training data
tree_predict1 = dt_model.predict(X_train)
cm1 = confusion_matrix(y_train,tree_predict1)
accuracy_train=(cm1[0,0]+cm1[1,1])/sum(sum(cm1))
print("Decison Tree Accuracy on Train data = ", round(accuracy_train,2) )

## accuracy on test data
tree_predict2 = dt_model.predict(X_test)
cm2 = confusion_matrix(y_test,tree_predict2)
accuracy_test=(cm2[0,0]+cm2[1,1])/sum(sum(cm2))
print("Decison Tree Accuracy on Test data = ", round(accuracy_test,2) )

Decison Tree Accuracy on Train data =  0.88
Decison Tree Accuracy on Test data =  0.88


In [30]:
##AUC on Train data
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, tree_predict1)
auc_train = auc(false_positive_rate, true_positive_rate)
print("Decison Tree AUC on Train data = ", round(auc_train,2) )

##AUC on Test data
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, tree_predict2)
auc_test = auc(false_positive_rate, true_positive_rate)
print("Decison Tree AUC on Test data = ", round(auc_test,2) )

Decison Tree AUC on Train data =  0.87
Decison Tree AUC on Test data =  0.87


In [35]:
# now building a random forest classifier
rf_model = RandomForestClassifier(n_estimators=300, max_features=4, max_depth=10)
rf_model.fit(X_train, y_train)


In [36]:
# accuracy on training data
rf_predict1 = rf_model.predict(X_train)
cm1 = confusion_matrix(y_train,rf_predict1)
accuracy_train=(cm1[0,0]+cm1[1,1])/sum(sum(cm1))
print("Random Forest Accuracy on Train data = ", round(accuracy_train,2) )

# accuracy on test data
rf_predict2 = rf_model.predict(X_test)
cm2 = confusion_matrix(y_test,rf_predict2)
accuracy_test=(cm2[0,0]+cm2[1,1])/sum(sum(cm2))
print("Random Forest Accuracy on Test data = ", round(accuracy_test,2) )

Random Forest Accuracy on Train data =  0.92
Random Forest Accuracy on Test data =  0.91


In [37]:
##AUC on Train data
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, rf_predict1)
auc_train = auc(false_positive_rate, true_positive_rate)
print("Random Forest AUC on Train data =  ", round(auc_train,2) )

##AUC on Test data
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, rf_predict2)
auc_test= auc(false_positive_rate, true_positive_rate)
print("Random Forest AUC on Test data =  ", round(auc_test,2) )


Random Forest AUC on Train data =   0.91
Random Forest AUC on Test data =   0.9
