In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.externals import joblib



In [3]:
model = joblib.load('model.joblib')

In [4]:
df = pd.read_csv('../data/df_final.csv')

In [5]:
df = df.drop(columns='Unnamed: 0')

In [9]:
from sklearn.model_selection import train_test_split
trainval, test = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
target = 'ACCIDENT'
train_features = trainval.drop(columns=[target, 'STATE', 'COUNTY'])
numeric_features = train_features.select_dtypes(include='number').columns.tolist()
categorical_features = train_features.select_dtypes(exclude='number').columns.tolist()
cardinality = train_features.select_dtypes(exclude='number').nunique()
high_cardinality_features = cardinality[cardinality > 100].index.tolist()
low_cardinality_features = cardinality[cardinality <= 100].index.tolist()
features = numeric_features + categorical_features

X_trainval = trainval[features]
y_trainval = trainval[target]
X_test = test[features]
y_test = test[target]

In [11]:
y_proba = model.predict_proba(X_test)[:, 1]

In [12]:
from sklearn.metrics import roc_auc_score, roc_curve

roc_auc_score(y_test, y_proba)

0.8550888319588322

In [19]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92     80235
           1       0.94      0.29      0.44     20064

    accuracy                           0.85    100299
   macro avg       0.90      0.64      0.68    100299
weighted avg       0.87      0.85      0.82    100299



In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred):
    labels = unique_labels(y_true)
    columns = [f'Predicted {label}' for label in labels]
    index = [f'Actual {label}' for label in labels]
    table = pd.DataFrame(confusion_matrix(y_true, y_pred), 
                         columns=columns, index=index)
    return sns.heatmap(table, annot=True, fmt='d', cmap='viridis')

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, fixed
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils.multiclass import unique_labels

def set_threshold(y_true, y_pred_proba, threshold=0.1):
    """
    For binary classification problems. 
    y_pred_proba : predicted probability of class 1
    """
    
    # Apply threshold to predicted probabilities
    # to get discrete predictions
    class_0, class_1 = unique_labels(y_true)
    y_pred = np.full_like(y_true, fill_value=class_0)
    y_pred[y_pred_proba > threshold] = class_1
    
    # Plot distribution of predicted probabilities
    ax = sns.distplot(y_pred_proba)
    ax.axvline(threshold, color='red')
    plt.title('Distribution of predicted probabilities')
    plt.show()

    # Calculate true positive rate and false positive rate
    true_positives = (y_pred==y_true) & (y_pred==class_1)
    false_positives = (y_pred!=y_true) & (y_pred==class_1)
    actual_positives = (y_true==class_1)
    actual_negatives = (y_true==class_0)
    true_positive_rate = true_positives.sum() / actual_positives.sum()
    false_positive_rate = false_positives.sum() / actual_negatives.sum()
    print('False Positive Rate', false_positive_rate)
    print('True Positive Rate', true_positive_rate)
    
    # Plot ROC curve
    fpr, tpr, thresholds = roc_curve(y_true==class_1, y_pred_proba)
    plt.plot(fpr, tpr)
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
    # Plot point on ROC curve for the current threshold
    plt.scatter(false_positive_rate, true_positive_rate)
    plt.show()
    
    # Show ROC AUC score
    print('Area under the Receiver Operating Characteristic curve:', 
          roc_auc_score(y_true, y_pred_proba))
    
    # Show confusion matrix & classification report
    plot_confusion_matrix(y_true, y_pred)
    print(classification_report(y_true, y_pred))

interact(set_threshold, 
         y_true=fixed(y_test), 
         y_pred_proba=fixed(y_proba), 
         threshold=(0,1,0.05));

interactive(children=(FloatSlider(value=0.1, description='threshold', max=1.0, step=0.05), Output()), _dom_cla…

## Flask API

In [34]:
def make_dataframe(input):
    column_headers = ['DAY', 'MONTH', 'YEAR', 'DAY_WEEK', 'LATITUDE', 'LONGITUD', 'HOUR',
                       'WEATHER', 'ROUTE', 'TYP_INT', 'TWAY_ID']
    X = pd.DataFrame(input, columns=column_headers)
    y = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1].tolist()
    return y_pred_proba

In [35]:
import json
import requestscolumn_headers = ['DAY', 'MONTH', 'YEAR', 'DAY_WEEK', 'LATITUDE', 'LONGITUD', 'HOUR',
                       'WEATHER', 'ROUTE', 'TYP_INT', 'TWAY_ID']

In [429]:
input = json.dumps(
   [
   {
      "DAY": 25,
      "MONTH": 5,
      "YEAR": 2016,
      "DAY_WEEK": 2,
      "LATITUDE": 30.748005970539843,
      "LONGITUD": -60.11728659260366,
      "HOUR": 3,
      "WEATHER": 2,
      "ROUTE": 6,
      "TYP_INT": 1,
      "TWAY_ID": "WOLF RIDGE RD"
   },
   {
      "DAY": 6,
      "MONTH": 8,
      "YEAR": 2017,
      "DAY_WEEK": 2,
      "LATITUDE": 32.54497821745906,
      "LONGITUD": -53.63684929388019,
      "HOUR": 6,
      "WEATHER": 1,
      "ROUTE": 2,
      "TYP_INT": 1,
      "TWAY_ID": "US-80 E TEXAS STREET"
   }
])

In [405]:
input = json.dumps(
    [{"DAY": 25, 
     "MONTH": 5, 
     "YEAR": 2016, 
     "DAY_WEEK": 2, 
     "LATITUDE": 30.748005970539843, 
     "LONGITUD": -88.11728659260366, 
     "HOUR": 3, 
     "WEATHER": 2,
     "ROUTE": 6,
     "TYP_INT": 1,
     "TWAY_ID": "WOLF RIDGE RD"}])

In [297]:
print(input)

[{"DAY": "25", "MONTH": "5", "YEAR": "2016", "DAY_WEEK": "2", "LATITUDE": "30.748005970539843", "LONGITUD": "-88.11728659260366", "HOUR": "3", "WEATHER": "2", "ROUTE": "6", "TYP_INT": "1", "TWAY_ID": "WOLF RIDGE RD"}, {"DAY": "6", "MONTH": "9", "YEAR": "2017", "DAY_WEEK": "2", "LATITUDE": "32.54497821745906", "LONGITUD": "-93.63684929388019", "HOUR": "6", "WEATHER": "1", "ROUTE": "2", "TYP_INT": "1", "TWAY_ID": "US-80 E TEXAS STREET"}]


In [399]:
input = json.loads(input)

In [314]:
type(input)

str

In [400]:
print(input)

[{'DAY': 25, 'MONTH': 5, 'YEAR': 2016, 'DAY_WEEK': 2, 'LATITUDE': 30.748005970539843, 'LONGITUD': -88.11728659260366, 'HOUR': 3, 'WEATHER': 2, 'ROUTE': 6, 'TYP_INT': 1, 'TWAY_ID': 'WOLF RIDGE RD'}]


In [401]:
data = []
for i in range(len(input)):
    for keys, values in input[i].items():
        observation = [input[i]['DAY'], input[i]['MONTH'], input[i]['YEAR'], input[i]['DAY_WEEK'], input[i]['LATITUDE'], 
                    input[i]['LONGITUD'], input[i]['HOUR'], input[i]['WEATHER'], input[i]['ROUTE'], input[i]['TYP_INT'], 
                    input[i]['TWAY_ID']]
    data.append(observation)

In [402]:
data

[[25,
  5,
  2016,
  2,
  30.748005970539843,
  -88.11728659260366,
  3,
  2,
  6,
  1,
  'WOLF RIDGE RD']]

In [403]:
X = pd.DataFrame(data, columns=column_headers)

In [404]:
make_dataframe(data)

[0.20049859583377838]

In [168]:
from datetime import datetime
today = datetime.today()

In [178]:
today.weekday()

1

In [197]:
observation = [[today.day, today.month, today.year, today.weekday(), float(input['LATITUDE']), 
                float(input['LONGITUD']), int(input['HOUR']), int(input['WEATHER']), int(input['ROUTE']), 
                int(input['TYP_INT']), input['TWAY_ID']]]

In [198]:
observation

[[24,
  9,
  2019,
  1,
  30.748005970539843,
  -88.11728659260366,
  3,
  2,
  6,
  1,
  'WOLF RIDGE RD']]

In [303]:
result = make_dataframe(data)

In [305]:
result[0]

0.20049859583377838

In [None]:
prediction_[i] = {
    "LATITUDE": input['LONGITUD'],
    "LONGITUDE": input['LATITUDE'],
    "PROBABILITY OF ACCIDENT": result[i]
}

In [327]:
def prediction(input):
    data = []
    pred = {}
    for i in range(len(input)):
        for keys, values in input[i].items():
            observation = [today.day, today.month, today.year, today.weekday(), float(input[i]['LATITUDE']), 
                        float(input[i]['LONGITUD']), int(input[i]['HOUR']), int(input[i]['WEATHER']), 
                        int(input[i]['ROUTE']), int(input[i]['TYP_INT']), input[i]['TWAY_ID']]
        data.append(observation)
        pred[i] = {
        "LATITUDE": input[i]['LONGITUD'],
        "LONGITUDE": input[i]['LATITUDE'],
        "PROBABILITY OF ACCIDENT": result[i]
        }
    return pred

In [347]:
prediction(input)

AttributeError: 'str' object has no attribute 'items'

In [423]:
url = "http://localhost:5000/api"

In [430]:
send = requests.post(url, input)

In [431]:
print(send.json())

{'0': {'LATITUDE': 30.748005970539843, 'LONGITUD': -60.11728659260366, 'PROBABILITY OF ACCIDENT': '24.93%', 'RISK LEVEL': 'Moderate to High'}, '1': {'LATITUDE': 32.54497821745906, 'LONGITUD': -53.63684929388019, 'PROBABILITY OF ACCIDENT': '20.06%', 'RISK LEVEL': 'Moderate to High'}}


In [None]:
No Additional Atmospheric Conditions 0
Clear 1
Cloudy 10
Rain 2
Sleet or Hail 3
Freezing Rain or Drizzle 12
Snow 4
Blowing Snow 11
Fog, Smog, Smoke 5
Severe Crosswinds 6
Blowing Sand, Soil, Dirt 7
Other 8
Not Reported 98
Unknown 99

In [None]:
if main =
thunderstorm, then 2
if main = Drizzle, then 12
if main = Rain, then 2
if main = Snow, then 4
if main = Dust, then 7
if main = Sand, then 7
if main = Smoke, then 5
if main = Clear, then 1
if main = Clouds, then 10
if main = Fog, then 5


In [433]:
import requests

In [465]:
URL = "http://api.openweathermap.org/data/2.5/weather?lat={lat}&lon={lon}&APPID=ae2fff8ef560467a665dab34e9784d4c"

In [474]:
PARAMS = {'lat': 32, 'lon': 53}

In [475]:
r = requests.get(url = URL, params = PARAMS)

In [476]:
data = r.json() 

In [477]:
data

{'cod': '400', 'message': '{lat} is not a float'}