# Supervised

In [24]:
import pandas as pd

# Load the prepared orders table from the local temp/ folder (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='temp/olist_orders_dataset_df.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle

# One fixed seed for the split, the undersampling and the shuffle, so that
# Restart Kernel -> Run All reproduces the same train/test rows and scores.
RANDOM_STATE = 42

# Features are every column except the label; the label is the 'anomaly' column.
X = df.drop(['anomaly'], axis=1)
y = df['anomaly']

# Hold out 30 % of the rows for testing. Stratifying on y keeps the anomaly rate
# of the test set equal to that of the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

# Balance the TRAINING rows only; the test set keeps its natural class ratio.
df_train = pd.concat([X_train, y_train], axis=1)
# Explicit comparisons on both sides (instead of using the column itself as a
# mask) so the selection also works when 'anomaly' is stored as 0/1.
df_train_True = df_train[df_train.anomaly == True]
df_train_False = df_train[df_train.anomaly == False]

# Undersample the non-anomalous rows down to the number of anomalous rows.
# Sample WITHOUT replacement whenever there are enough rows, so no normal row is
# duplicated; fall back to sampling with replacement only if the "False" class
# happens to be the smaller one (otherwise resample would raise).
n_anomalies = df_train_True.shape[0]
df_train_False = resample(
    df_train_False,
    replace=n_anomalies > df_train_False.shape[0],
    n_samples=n_anomalies,
    random_state=RANDOM_STATE,
)

# Interleave the two classes and split back into features / label.
df_train_balanced = shuffle(
    pd.concat([df_train_True, df_train_False]), random_state=RANDOM_STATE
)
X_train = df_train_balanced.drop(['anomaly'], axis=1)
y_train = df_train_balanced['anomaly']

### Create a Table of performance

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Scoreboard that collects one row of metrics per evaluated classifier.
table_of_performance = pd.DataFrame(
    columns=['algorithm', 'confusion matrix', 'accuracy', 'precision', 'recall']
)


def add_performance(algorithm, y_test, y_predict):
    """Score a set of predictions and append the result to ``table_of_performance``.

    Parameters
    ----------
    algorithm : label stored in the 'algorithm' column of the new row.
    y_test : ground-truth labels.
    y_predict : predicted labels, compared against ``y_test``.

    Returns
    -------
    None. The module-level ``table_of_performance`` gains one row holding the
    confusion matrix, accuracy, precision and recall of the predictions.
    """
    scores = {
        'algorithm': algorithm,
        'confusion matrix': confusion_matrix(y_test, y_predict),
        'accuracy': accuracy_score(y_test, y_predict),
        'precision': precision_score(y_test, y_predict),
        'recall': recall_score(y_test, y_predict),
    }

    # Append at the next integer position; .loc mutates the shared frame in place.
    next_position = len(table_of_performance)
    table_of_performance.loc[next_position] = scores

### XGBoost

In [None]:
from xgboost import XGBClassifier
# Train a default-configured XGBoost classifier on the training rows,
# predict the held-out rows, and log the scores in the performance table.
XGB = XGBClassifier().fit(X_train, y_train)
y_predict = XGB.predict(X_test)

add_performance(algorithm='XGBoost', y_test=y_test, y_predict=y_predict)

In [None]:
import matplotlib.pyplot as plt

# Pair each XGBoost importance score with its column name and draw a bar chart.
feature_importances = pd.Series(data=XGB.feature_importances_, index=X.columns)

feature_importances_plot = feature_importances.plot.bar(title='feature importances')
fig = feature_importances_plot.get_figure()
plt.show()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a default-configured logistic regression on the training rows,
# predict the held-out rows, and log the scores in the performance table.
LR = LogisticRegression().fit(X_train, y_train)
y_predict = LR.predict(X_test)

add_performance(algorithm='logistic regression', y_test=y_test, y_predict=y_predict)

In [None]:
# Bar chart of the fitted logistic-regression coefficients (first row of coef_),
# one bar per feature column.
# NOTE(review): coefficients are signed and depend on each feature's scale, so bar
# heights are only comparable if the features share a scale; confirm upstream.
feature_importances = pd.Series(data=LR.coef_[0], index=X.columns)

feature_importances_plot = feature_importances.plot.bar(title='feature importances')
fig = feature_importances_plot.get_figure()
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so an
# unseeded forest gives different scores and importances on every run. Fix the
# seed so the row added to the performance table is reproducible.
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)
y_predict = RF.predict(X_test)

# Record the forest's scores on the held-out rows.
add_performance('random forest', y_test, y_predict)

In [None]:
# Pair each random-forest importance score with its column name and draw a bar chart.
feature_importances = pd.Series(data=RF.feature_importances_, index=X.columns)

feature_importances_plot = feature_importances.plot.bar(title='feature importances')
fig = feature_importances_plot.get_figure()
plt.show()

### Testing combinations of features

In [None]:
# Candidate feature subset used by the cells below (order is preserved because it
# also labels the importance plots).
columns1 = [
    # delivery estimate and order date parts
    'time_estimate_delivery',
    'year',
    'month',
    'day',
    # shipping cost
    'freight_value',
    # seller location
    'seller_zip_code_prefix',
    'seller_geolocation_lat',
    'seller_geolocation_lng',
    # customer location
    'customer_zip_code_prefix',
    'customer_geolocation_lat',
    'customer_geolocation_lng',
    # seller-to-customer distance
    'distance',
]

In [None]:
# Restrict both the training and the test features to the selected columns.
X1_train, X1_test = (features[columns1] for features in (X_train, X_test))

In [None]:
# Refit XGBoost using only the selected feature columns and predict the test rows.
XGB = XGBClassifier().fit(X1_train, y_train)
y_predict = XGB.predict(X1_test)

# Print each metric for the held-out rows, in a fixed order.
for label, metric in (
    ('confusion matrix:', confusion_matrix),
    ('accuracy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
):
    print(label, metric(y_test, y_predict))

# Bar chart of the importance score of every selected feature.
feature_importances = pd.Series(data=XGB.feature_importances_, index=columns1)

feature_importances_plot = feature_importances.plot.bar(title='feature importances')
fig = feature_importances_plot.get_figure()
plt.show()

In [None]:
# Refit logistic regression using only the selected feature columns and predict the test rows.
LR = LogisticRegression().fit(X1_train, y_train)
y_predict = LR.predict(X1_test)

# Print each metric for the held-out rows, in a fixed order.
metrics_by_label = {
    'confusion matrix:': confusion_matrix,
    'accuracy:': accuracy_score,
    'precision:': precision_score,
    'recall:': recall_score,
}
for label, metric in metrics_by_label.items():
    print(label, metric(y_test, y_predict))

# Bar chart of the fitted coefficient (first row of coef_) of every selected feature.
feature_importances = pd.Series(data=LR.coef_[0], index=columns1)

feature_importances_plot = feature_importances.plot.bar(title='feature importances')
fig = feature_importances_plot.get_figure()
plt.show()

In [None]:
# Refit the random forest using only the selected feature columns.
# The seed is fixed because the forest's bootstrap sampling and feature
# sub-sampling are random: without it the printed scores and the importance plot
# change on every run, which makes feature combinations impossible to compare.
RF = RandomForestClassifier(random_state=42)
RF.fit(X1_train, y_train)
y_predict = RF.predict(X1_test)

# Scores on the held-out rows.
print('confusion matrix:', confusion_matrix(y_test, y_predict))
print('accuracy:', accuracy_score(y_test, y_predict))
print('precision:', precision_score(y_test, y_predict))
print('recall:', recall_score(y_test, y_predict))

# Bar chart of the importance score of every selected feature.
feature_importances = pd.Series(RF.feature_importances_, index=columns1)

feature_importances_plot = feature_importances.plot(kind='bar', title='feature importances')
fig = feature_importances_plot.get_figure()
plt.show()

By analyzing the feature importances and experimenting with different feature combinations, I found that the following features are important:

• time_estimate_delivery
• distance
• freight_value

• year
• month
• day

• seller_zip_code_prefix
• seller_geolocation_lat
• seller_geolocation_lng
• customer_zip_code_prefix
• customer_geolocation_lat
• customer_geolocation_lng

The precision is low because the test set is imbalanced, while the training set was balanced by undersampling. However, if the training set is not balanced, the recall is low. I think recall is more important than precision here.