In [34]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
df = pd.read_csv('dataset.csv')

In [35]:
df.head()

Unnamed: 0,GridID,date,Shift,Accident,Longitude.grid,Latitude.grid
0,1,2010-10-08,Night,0,2.08,41.41
1,1,2011-02-16,Morning,0,2.08,41.41
2,1,2014-05-31,Night,0,2.08,41.41
3,1,2011-04-03,Afternoon,0,2.08,41.41
4,1,2013-02-20,Morning,0,2.08,41.41


In [36]:
df = df[['GridID', 'Longitude.grid', 'Latitude.grid','date','Shift','Accident']]
df['date'] = pd.to_datetime(df['date'])
q = {'Morning':0,'Afternoon':1,'Night':2}
df['Shift']= df['Shift'].map(q)
df.head()

Unnamed: 0,GridID,Longitude.grid,Latitude.grid,date,Shift,Accident
0,1,2.08,41.41,2010-10-08,2,0
1,1,2.08,41.41,2011-02-16,0,0
2,1,2.08,41.41,2014-05-31,2,0
3,1,2.08,41.41,2011-04-03,1,0
4,1,2.08,41.41,2013-02-20,0,0


In [37]:
df = df.drop("date",1)

In [38]:
X = np.array(df.drop(['Accident'],1))
y = np.array(df['Accident'])
X.shape

(824587, 4)

In [39]:
model = linear_model.LogisticRegression()
model.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
validation_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=validation_size, random_state=seed)

In [41]:
name = 'Logistic Regression'
kfold = model_selection.KFold(n_splits=10,random_state=seed)
cv_results = model_selection.cross_val_score(model,X_train,y_train,cv=kfold,scoring='accuracy')
msg="%s: %f (%f)" %(name,cv_results.mean(),cv_results.std())
print(msg)

Logistic Regression: 0.955605 (0.000985)


In [42]:
predictions = model.predict(X)
print(predictions[0:5])

[0 0 0 0 0]


In [43]:
model.score(X,y)

0.9557148002575835

In [44]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [46]:
from sklearn.utils import resample
df_majority = df[df.Accident == 0]
df_minority = df[df.Accident == 1]

df_majority_downsampled = resample(df_majority,
                                replace = False,
                                n_samples= 36517,
                                random_state = 123)

df_downsampled = pd.concat([df_majority_downsampled,df_minority])

df_downsampled.Accident.value_counts()

1    36517
0    36517
Name: Accident, dtype: int64

In [47]:
X = df_downsampled.drop('Accident', axis=1)
y = df_downsampled.Accident

clf_4 = RandomForestClassifier()
clf_4.fit(X,y)

pred_y_4 = clf_4.predict(X)

print(np.unique(pred_y_4))

[0 1]


In [48]:
print(accuracy_score(y,pred_y_4))

0.7465016293780978


In [50]:
from sklearn.metrics import roc_auc_score
prob_y_4 = clf_4.predict_proba(X)
prob_y_4 = [p[1] for p in prob_y_4]
print(roc_auc_score(y,prob_y_4))

0.8242496359494405
