# Model taken

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import snowflake.connector
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
import sqlalchemy.dialects.sqlite
import matplotlib.pyplot as plt
from datetime import date,timedelta
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import seaborn as sns
from random import seed
from random import random
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline as Pipeline_imb
from imblearn.under_sampling import RandomUnderSampler
import pickle
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

## Dataframe

In [2]:
# Raw orders export, one row per order.
# NOTE(review): this is an absolute path on one machine — consider a configurable data directory.
orders_csv_path = 'C:/Users/Usuario/Downloads/orders.csv'
df = pd.read_csv(orders_csv_path, sep=",")
df.head()

Unnamed: 0,order_id,store_id,to_user_distance,to_user_elevation,total_earning,created_at,taken
0,14364873,30000009,2.478101,-72.71936,4200,2017-09-07T20:02:17Z,0
1,14370123,30000058,0.451711,37.754761,4200,2017-09-07T20:13:16Z,0
2,14368534,900003684,2.026072,207.191162,4000,2017-09-07T20:07:23Z,0
3,14370258,900014452,2.671432,1.722656,4400,2017-09-07T20:15:19Z,1
4,14369923,900014085,0.965496,117.429199,3450,2017-09-07T20:12:14Z,1


In [3]:
# Missing-value count per column of the raw frame.
df.isna().sum()

order_id             0
store_id             0
to_user_distance     0
to_user_elevation    0
total_earning        0
created_at           0
taken                0
dtype: int64

In [4]:
# Parse the `created_at` strings into pandas datetimes (new `dates` column on `df`).
# NOTE(review): the sample values shown above end in 'Z', so these are presumably
# UTC timestamps — anything derived from them (e.g. weekday) would then be in UTC,
# not local time; confirm that is intended.
df['dates'] = pd.to_datetime(df['created_at'])

In [5]:
# English day-of-week name for each order ('Monday', 'Tuesday', ...).
# `Series.dt.weekday_name` no longer exists in current pandas; `day_name()` yields the
# same strings, so downstream encoders see the same categories.
df['weekday'] = df['dates'].dt.day_name()

In [6]:
# Re-check for missing values after adding `dates` and `weekday`.
df.isna().sum()

order_id             0
store_id             0
to_user_distance     0
to_user_elevation    0
total_earning        0
created_at           0
taken                0
dates                0
weekday              0
dtype: int64

## Análisis descriptivo

In [None]:
# Numeric columns used for the correlation plot below.
numerical_features = df[['to_user_distance', 'to_user_elevation', 'total_earning']]

In [None]:
# Correlation matrix of the numeric features, drawn as a heat map with labelled axes.
corr_matrix = numerical_features.corr()
column_names = numerical_features.columns
tick_positions = range(len(column_names))

plt.matshow(corr_matrix)
plt.xticks(tick_positions, column_names, rotation=90)
plt.yticks(tick_positions, column_names)
plt.colorbar()
plt.show()

In [None]:
# Box plot of `to_user_distance`, drawing only the suspected-outlier points.
box_rgb = 'rgb(8,81,156)'
outlier_rgba = 'rgba(219, 64, 82, 0.6)'

distance_box = go.Box(
    y=df['to_user_distance'],
    name="Suspected Outliers",
    boxpoints='suspectedoutliers',
    marker=dict(
        color=box_rgb,
        outliercolor=outlier_rgba,
        line=dict(outliercolor=outlier_rgba, outlierwidth=2),
    ),
    line_color=box_rgb,
)

fig = go.Figure(data=[distance_box])
fig.update_layout(title_text="Box Plot Styling Outliers")
fig.show()

In [None]:
# Histogram of `to_user_distance` in 8 bins.
# The x label now names the plotted variable; it previously read
# "Duración en minutos" (duration in minutes), which this column is not.
df['to_user_distance'].hist(bins=8)
plt.xlabel("Distancia al usuario (to_user_distance)")
plt.ylabel("Frecuencia")
plt.show()

In [None]:
# Box plot of `to_user_elevation`, drawing only the suspected-outlier points.
box_rgb = 'rgb(8,81,156)'
outlier_rgba = 'rgba(219, 64, 82, 0.6)'

elevation_box = go.Box(
    y=df['to_user_elevation'],
    name="Suspected Outliers",
    boxpoints='suspectedoutliers',
    marker=dict(
        color=box_rgb,
        outliercolor=outlier_rgba,
        line=dict(outliercolor=outlier_rgba, outlierwidth=2),
    ),
    line_color=box_rgb,
)

fig = go.Figure(data=[elevation_box])
fig.update_layout(title_text="Box Plot Styling Outliers")
fig.show()

In [None]:
# Histogram of `to_user_elevation` in 8 bins.
# The x label now names the plotted variable; it previously read
# "Duración en minutos" (duration in minutes), which this column is not.
df['to_user_elevation'].hist(bins=8)
plt.xlabel("Elevación hacia el usuario (to_user_elevation)")
plt.ylabel("Frecuencia")
plt.show()

In [None]:
# Box plot of `total_earning`, drawing only the suspected-outlier points.
box_rgb = 'rgb(8,81,156)'
outlier_rgba = 'rgba(219, 64, 82, 0.6)'

earning_box = go.Box(
    y=df['total_earning'],
    name="Suspected Outliers",
    boxpoints='suspectedoutliers',
    marker=dict(
        color=box_rgb,
        outliercolor=outlier_rgba,
        line=dict(outliercolor=outlier_rgba, outlierwidth=2),
    ),
    line_color=box_rgb,
)

fig = go.Figure(data=[earning_box])
fig.update_layout(title_text="Box Plot Styling Outliers")
fig.show()

In [None]:
# Histogram of `total_earning` in 8 bins.
# The x label now names the plotted variable; it previously read
# "Duración en minutos" (duration in minutes), which this column is not.
df['total_earning'].hist(bins=8)
plt.xlabel("Ganancia total (total_earning)")
plt.ylabel("Frecuencia")
plt.show()

In [None]:
# Number of distinct stores (a missing value, if any, counts as its own entry).
df['store_id'].nunique(dropna=False)

In [None]:
# Total number of rows, for comparison with the distinct-store count.
df['store_id'].shape[0]

## Quitar outliers

In [7]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps below.
data = df.copy()

In [8]:
# Missing-value count per column before outlier removal.
data.isna().sum()

order_id             0
store_id             0
to_user_distance     0
to_user_elevation    0
total_earning        0
created_at           0
taken                0
dates                0
weekday              0
dtype: int64

In [9]:
# Trim the upper tail: discard orders whose earning exceeds the 97th percentile.
pearn = data['total_earning'].quantile(.97)
rows_above_cap = data.index[data['total_earning'] > pearn]
data = data.drop(index=rows_above_cap)

In [10]:
# Missing-value count per column after outlier removal.
data.isna().sum()

order_id             0
store_id             0
to_user_distance     0
to_user_elevation    0
total_earning        0
created_at           0
taken                0
dates                0
weekday              0
dtype: int64

## Preprocessing

In [None]:
# One-hot encode `weekday`, dropping the first category as the reference level.
onehotencoder = OneHotEncoder(drop='first')
x = onehotencoder.fit_transform(data[['weekday']]).toarray()

# Build the 'x0_<category>' column names by hand. `get_feature_names()` no longer
# exists in current scikit-learn, and its replacement prefixes with the column name
# instead of 'x0', which would not match the names used elsewhere in this notebook.
# With drop='first' the kept categories are everything after the first one.
clases = ['x0_' + str(category) for category in onehotencoder.categories_[0][1:]]
x = pd.DataFrame(x, columns=clases)

In [None]:
# Attach the weekday dummies column-wise. The index is reset first so that rows line
# up positionally with `x` (rows were dropped earlier, leaving gaps in the index).
data = pd.concat([data.reset_index(drop=True), x], axis=1, sort=False)
data.isnull().sum()

In [11]:
def woe_values(data, feature, y):
    """Compute Weight of Evidence (WoE) and Information Value terms per category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the categorical feature and the target.
    feature : str
        Column whose distinct values are scored.
    y : str
        Target column; rows with ``y == 1`` count as 'Good', ``y == 0`` as 'Bad'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``feature`` with columns ``Value``, ``All``,
        ``Good``, ``Bad``, ``Distr_Good``, ``Distr_Bad``, ``WoE`` and ``IV``,
        sorted ascending by ``WoE``. An infinite WoE (a value with no Good or no
        Bad rows) is replaced by 0; a 0/0 ratio is left as NaN.
    """
    # Distinct values in order of first appearance.
    values = pd.Series(list(data[feature].unique()))

    # Count each group once instead of re-scanning the whole frame three times
    # for every distinct value.
    all_counts = data[feature].value_counts()
    good_counts = data.loc[data[y] == 1, feature].value_counts()
    bad_counts = data.loc[data[y] == 0, feature].value_counts()

    def _aligned(counts):
        # Values absent from a group (or NaN values) get a count of 0.
        return values.map(counts).fillna(0).astype('int64')

    woe = pd.DataFrame({
        'Value': values,
        'All': _aligned(all_counts),
        'Good': _aligned(good_counts),
        'Bad': _aligned(bad_counts),
    })

    woe['Distr_Good'] = woe['Good'] / woe['Good'].sum()
    woe['Distr_Bad'] = woe['Bad'] / woe['Bad'].sum()

    # log(0) and x/0 are expected here and handled right below, so silence the
    # RuntimeWarning they would otherwise emit.
    with np.errstate(divide='ignore', invalid='ignore'):
        woe['WoE'] = np.log(woe['Distr_Good'] / woe['Distr_Bad'])
    woe['WoE'] = woe['WoE'].replace([np.inf, -np.inf], 0)

    woe['IV'] = (woe['Distr_Good'] - woe['Distr_Bad']) * woe['WoE']

    return woe.sort_values(by='WoE')

In [12]:
# Per-store WoE table, with columns renamed for the store context and stores whose
# WoE could not be computed (NaN) removed.
# NOTE(review): WoE is derived from `taken` over all of `data`, before the train/test
# split further down, so test-set labels feed into the `store` feature — confirm
# this leakage is acceptable.
stores = (
    woe_values(data, 'store_id', 'taken')
    .rename(columns={"Value": "store", "IV": "iv_store"})
    .dropna()
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of k-means on the per-store WoE for k = 2..kmax, used to pick the
# number of store clusters. Dissimilarity is undefined for a single cluster, so the
# scan starts at k = 2.
kmax = 10
cluster_counts = list(range(2, kmax + 1))
woe_matrix = np.array(stores['WoE']).reshape(-1, 1)  # hoisted: same input for every k

sil = []
for k in cluster_counts:
    # Fixed seed so the curve is reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(woe_matrix)
    labels = kmeans.labels_
    sil.append(silhouette_score(woe_matrix, labels, metric='euclidean'))

# Plot against k itself; plotting `sil` alone puts list positions 0..8 on the x axis.
plt.plot(cluster_counts, sil)
plt.xlabel("Número de clusters (k)")
plt.ylabel("Silhouette score")
plt.show()

In [13]:
# Group stores into 6 clusters by their WoE (single feature, hence the column reshape).
kmeans = KMeans(n_clusters=6, random_state=0)
woe_column = stores['WoE'].values.reshape(-1, 1)
stores['clusters'] = kmeans.fit_predict(woe_column)

In [14]:
# Lookup dict store id -> cluster label, used to map orders to their store cluster.
# set_index returns a new frame here; calling it with inplace=True on a column slice
# of `stores` risks pandas' chained-assignment warning.
cl = stores[['store', 'clusters']].set_index('store', drop=True)
dic = cl.to_dict(orient="dict")
final = dic['clusters']

In [15]:
# Replace each order's store id by its cluster label; stores missing from `final`
# would come out as NaN, which the null count below makes visible.
data['store'] = data['store_id'].map(final)
data.isna().sum()

order_id             0
store_id             0
to_user_distance     0
to_user_elevation    0
total_earning        0
created_at           0
taken                0
dates                0
weekday              0
store                0
dtype: int64

In [None]:
# Make sure every label in `todos` appears at least once in `data['store']` so the
# one-hot encoder below produces a column for each of them. Labels that are absent
# get one placeholder row each (all other columns NaN); `borrar` records how many
# placeholder rows were added so they can be removed again afterwards.
# NOTE(review): `todos` lists 7 labels while the k-means above uses 6 clusters
# (labels 0-5), so label 6 is presumably always padded — confirm that is intended.
todos = [0, 1, 2, 3, 4, 5, 6]
presentes = data['store'].unique()
faltantes = [etiqueta for etiqueta in todos if etiqueta not in presentes]
borrar = len(faltantes)

if faltantes:
    # A single pd.concat replaces the per-label DataFrame.append calls: `append` no
    # longer exists in current pandas, and growing a frame row by row is quadratic.
    data = pd.concat(
        [data, pd.DataFrame({'store': faltantes})],
        ignore_index=True,
        sort=True,
    )

In [None]:
# One-hot encode the store cluster, dropping the first category as the reference level.
onehotencoder = OneHotEncoder(drop='first')
x = onehotencoder.fit_transform(data[['store']]).toarray()

# Build the 'x0_<category>' names by hand: `get_feature_names()` no longer exists in
# current scikit-learn. With drop='first' the kept categories are everything after
# the first one.
clases = ['x0_' + str(category) for category in onehotencoder.categories_[0][1:]]

# Column name = the character right after the 'x0_' prefix.
# NOTE(review): this keeps a single character, so it only yields distinct names
# while cluster labels are single-digit.
nombres = [str(n)[3] for n in clases]

x = pd.DataFrame(x, columns=nombres)
nombres

In [None]:
# Attach the store-cluster dummies column-wise; rows are matched on index labels.
# NOTE(review): this relies on `data` having a 0..n-1 index at this point — confirm
# the preceding cells always leave it that way.
data = pd.concat([data, x], axis=1, sort=False)

In [None]:
# Remove the `borrar` placeholder rows that were appended at the end of the frame.
placeholder_index = data.tail(borrar).index
data = data.drop(index=placeholder_index)

In [None]:
# Missing-value count per column after encoding and placeholder removal.
data.isna().sum()

In [None]:
# Inspect the resulting column names.
data.columns

In [16]:
# Modelling frame without identifiers and raw date columns; `data` itself is not modified.
columns_to_discard = ['created_at', 'dates', 'weekday', 'order_id', 'store_id']
dff = data.drop(columns=columns_to_discard)

In [17]:
# Class balance of the target: counts per class and the share (in %) of class 0.
# Uses the `Counter` already imported at the top of the notebook instead of a
# second, mid-notebook `import collections`.
balan = Counter(dff['taken'])
porc = (balan[0] / (balan[0] + balan[1])) * 100
print(balan)
print(porc)

Counter({1: 131459, 0: 11167})
7.829568241414608


In [18]:
# Columns kept for modelling: identifier, store cluster, numeric features, weekday, target.
model_columns = ['order_id', 'store', 'to_user_distance', 'to_user_elevation',
                 'total_earning', 'weekday', 'taken']
dn = data.loc[:, model_columns]

In [None]:
# Feature list using the manually one-hot-encoded column names (weekday dummies
# 'x0_*' and store-cluster dummies '1'..'6').
# NOTE(review): no later cell in this notebook references `features_n` — possibly a leftover.
features_n = ['store', 'to_user_distance', 'to_user_elevation', 'total_earning', 'x0_Monday',
       'x0_Saturday', 'x0_Sunday', 'x0_Thursday', 'x0_Tuesday', 'x0_Wednesday',
       '1', '2', '3', '4', '5', '6']

In [19]:
# Raw (un-encoded) input columns; `store` and `weekday` are categorical.
features = ['store', 'to_user_distance', 'to_user_elevation', 'total_earning', 'weekday']

In [20]:
# Name of the binary target column.
label ='taken'

In [21]:
# 75/25 split, stratified on the target so both parts keep the same class ratio;
# the seed is fixed for reproducibility.
train, test = train_test_split(
    dn,
    test_size=0.25,
    random_state=29,
    stratify=dn[label],
)

In [22]:
# Class counts in the training split.
Counter(train[label])

Counter({1: 98594, 0: 8375})

In [23]:

# Randomly duplicate minority-class rows until minority/majority = 0.5.
# A fixed random_state makes the resampled set reproducible between runs.
over = RandomOverSampler(sampling_strategy=0.5, random_state=29)
X_over, y_over = over.fit_resample(train[features], train[label])
print(Counter(y_over))

Counter({1: 98594, 0: 49297})


In [24]:
# Randomly discard majority-class rows until minority/majority = 0.8.
# A fixed random_state makes the resampled set reproducible between runs.
# NOTE(review): no later cell in this notebook uses `X_combined` / `y_combined`;
# the models below are fitted on the original, imbalanced training split.
under = RandomUnderSampler(sampling_strategy=0.8, random_state=29)
X_combined, y_combined = under.fit_resample(X_over, y_over)
print(Counter(y_combined))

Counter({1: 61621, 0: 49297})


In [None]:
# Random-forest pipeline with SMOTE oversampling applied inside the pipeline.
# NOTE(review): the next cell rebinds `model_pipeline` without the SMOTE step, so
# this definition is overwritten when the notebook runs top to bottom.
model_pipeline = Pipeline_imb([
    ('class_balancer', SMOTE(random_state=1)),
    ('classifier', RandomForestClassifier(random_state=1, n_estimators=600))
])

In [None]:
# Random-forest pipeline without any class balancing (depth-limited, 500 trees).
# This rebinds `model_pipeline`, replacing the SMOTE variant defined in the previous cell.
model_pipeline = Pipeline_imb([
    ('classifier', RandomForestClassifier(random_state=1, max_depth=11, n_estimators=500))
])

In [None]:
# Fit the random-forest pipeline on the training split.
# NOTE(review): `features` includes the string-valued `weekday` column and this
# pipeline has no encoding step — presumably the fit fails on raw strings; confirm,
# or feed it encoded features instead.
model_pipeline.fit(train[features], train[label])

In [None]:
# Class probabilities on the held-out split (column 1 is read as P(taken = 1) by tiny_score).
rf_predict_val = model_pipeline.predict_proba(test[features])

In [25]:
def specificity(y_real, y_predicted):
    """Return the true-negative rate TN / (TN + FP), treating 0 as the negative class.

    Parameters
    ----------
    y_real : array-like
        True labels.
    y_predicted : array-like
        Predicted labels (0/1 or booleans), in the same order as ``y_real``.

    Returns
    -------
    float
        Share of predicted-or-actual negatives that are correct; NaN when there
        are neither true negatives nor false positives.
    """
    # Compare positionally: pandas objects with different indexes would otherwise
    # be aligned by label or raise.
    y_real = np.asarray(y_real)
    y_predicted = np.asarray(y_predicted)

    tn = np.sum((y_real == y_predicted) & (y_real == 0))
    # Parenthesised explicitly: in `a & b * 1` the multiplication binds first.
    fp = np.sum((y_real != y_predicted) & (y_predicted == 1))

    if tn + fp == 0:
        return float('nan')
    return tn / (tn + fp)

In [26]:
def tiny_score(test_predictions, test, label, threshold):
    """Summarise binary-classification quality at a given probability threshold.

    Parameters
    ----------
    test_predictions : array of shape (n_samples, 2)
        Output of ``predict_proba``; column 1 is used as the positive-class score.
    test : pandas.DataFrame
        Evaluation frame containing the true labels.
    label : str
        Name of the target column in ``test``.
    threshold : float
        Scores greater than or equal to this value are predicted as class 1.

    Returns
    -------
    dict
        specificity, accuracy, precision, recall, roc_auc (threshold-independent)
        and the four confusion-matrix counts.
    """
    y_true = test[label]
    positive_scores = test_predictions[:, 1]
    # Integer 0/1 predictions so they match the explicit labels passed below.
    predictions = (positive_scores >= threshold).astype(int)

    # Computed once; labels=[0, 1] keeps the matrix 2x2 even when one class is
    # missing from both y_true and the predictions.
    tn, fp, fn, tp = confusion_matrix(y_true, predictions, labels=[0, 1]).ravel()

    return {
        'specificity': specificity(y_true, predictions),
        'accuracy': accuracy_score(y_true, predictions),
        'precision': precision_score(y_true, predictions),
        'recall': recall_score(y_true, predictions),
        'roc_auc': roc_auc_score(y_true, positive_scores),
        'false_positives': fp,
        'true_positives': tp,
        'false_negatives': fn,
        'true_negatives': tn
    }

In [None]:
# Metrics of the random-forest predictions at the default 0.5 threshold.
tiny_score(rf_predict_val, test, label, 0.5)

In [None]:
# Grid search over the number of trees of the random-forest pipeline (3-fold CV).
# NOTE(review): `clf` is not used by the search — the estimator searched is `model_pipeline`.
# NOTE(review): the fit uses raw `train[features]`, which includes the string-valued
# `weekday` column, and the pipeline has no encoder — presumably this fails; confirm.
clf = RandomForestClassifier(random_state=1)
param_grid = {'classifier__n_estimators':np.arange(200, 600, 100)}
grid_search = GridSearchCV(model_pipeline,
                           param_grid=param_grid,
                           n_jobs=5,
                           cv = 3,
                           verbose=2)
grid_search.fit(train[features], train[label])

In [None]:
# Best parameter combination found by the grid search above.
grid_search.best_params_

In [27]:
# XGBoost classifier wrapped in a single-step pipeline; hyper-parameters are fixed here.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=5,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=0.6,
)
model_pipeline_xg = Pipeline_imb([
    ('classifier', XGBClassifier(**xgb_params))
])

In [28]:
# Column groups routed to each transformer.
categorical_columns = ['store', 'weekday']
numerical_columns = ['to_user_distance', 'to_user_elevation', 'total_earning']

# Numeric columns: median imputation. Categorical columns: one-hot encoding that
# ignores categories not seen during fit instead of raising.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

column_transformer = ColumnTransformer(transformers=[
    ('numerical_transformer', numerical_transformer, numerical_columns),
    ('categorical_transformer', categorical_transformer, categorical_columns),
])
preprocessor = Pipeline(steps=[('preprocessor', column_transformer)])

In [29]:
# Fit the preprocessing on the training split only and encode it.
prueba=preprocessor.fit_transform(train[features], train[label])

In [30]:
# Encode the held-out split with the preprocessing fitted on the training data.
teste = preprocessor.transform(test[features])

In [31]:
# Train XGBoost on the encoded training matrix and score the encoded held-out split.
# `rf_predict_val` is kept as the variable name for the scoring cell that follows,
# although these are XGBoost probabilities.
fitted_xgb = model_pipeline_xg.fit(prueba, train[label])
rf_predict_val = fitted_xgb.predict_proba(teste)

In [32]:
# Metrics of the XGBoost predictions at the default 0.5 threshold.
tiny_score(rf_predict_val, test, label, 0.5)

{'specificity': 0.02363896848137536,
 'accuracy': 0.9215862243037832,
 'precision': 0.9232566650713663,
 'recall': 0.9978700745473909,
 'roc_auc': 0.7606980856826375,
 'false_positives': 2726,
 'true_positives': 32795,
 'false_negatives': 70,
 'true_negatives': 66}

In [35]:
# Everything needed at inference time: the store -> cluster lookup, the fitted
# preprocessing and the fitted classifier pipeline.
model = dict(
    stores=final,
    preprocessing=preprocessor,
    model=model_pipeline_xg,
)

In [36]:
# Persist the bundle to disk in the current working directory.
with open('modelTest.pickle', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

In [None]:
# Hyper-parameter grid search for the XGBoost pipeline (3-fold CV).
# NOTE(review): np.arange defaults to step=1, so the learning_rate, gamma and
# colsample_bytree ranges below each contain a single value (0.1, 0.5 and 0.5).
# Confirm whether a finer step was intended before widening them — the grid grows
# multiplicatively.
param_grid = {'classifier__n_estimators':np.arange(200, 600, 100),
             'classifier__subsample':np.arange(0.5, 1, 0.2),
             'classifier__min_child_weight':np.arange(5,10),
             'classifier__max_depth':np.arange(5,10),
             'classifier__learning_rate':np.arange(0.1,0.5),
             'classifier__gamma':np.arange(0.5,1),
             'classifier__colsample_bytree':np.arange(0.5,1)}
grid_search = GridSearchCV(model_pipeline_xg,
                           param_grid=param_grid,
                           n_jobs=5,
                           cv = 3,
                           verbose=2)
# Fit on the encoded training matrix: the searched pipeline contains only the
# classifier, while `features` includes the raw string `weekday` column. This matches
# how `model_pipeline_xg` itself is fitted earlier in the notebook.
grid_search.fit(prueba, train[label])
grid_search.best_params_