In [None]:
# Importing dependencies:
import numpy as np
import pandas as pd
# from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
import os
import warnings
warnings.filterwarnings('ignore')

# Configuring Notebook environment:
sns.set()
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

plt.rcParams['figure.figsize'] = (15, 7.5)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
from sklearn.metrics import accuracy_score, auc, classification_report, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# 6.0 Machine Learning Processing:

## 6.0.1 Pre-processing data

In [None]:
# Read df_clean.csv (path is relative to the notebook's working directory)
# and preview the first rows.
df_clean = pd.read_csv('df_clean.csv')
df_clean.head()

In [None]:
# Summary of the loaded frame: column names, dtypes and non-null counts.
df_clean.info()

In [None]:
# Restrict the frame to the modelling columns (keeps the run time down),
# then one-hot encode the result with pandas.
feature_columns = [
    'product_id', 'store_id', 'display', 'mailer', 'sales_value',
    'description', 'manufacturer', 'department', 'brand',
    'income_desc', 'homeowner_desc',
]
df_training = pd.get_dummies(df_clean[feature_columns])
df_training.head()

In [None]:
# Summary of the encoded training frame: column names, dtypes and non-null counts.
df_training.info()

## 6.0.2 Model training and evaluation helper:

In [None]:
def train_test_model(X, y, model, params, test_size=.2, random_state=42):
    """Tune a classifier with a grid search and report its hold-out performance.

    The data is split into train and test partitions, ``model`` is tuned on
    the training partition with a 5-fold ``GridSearchCV`` over ``params``,
    and the resulting search object is evaluated on the test partition:
    a ROC curve is plotted, then the best parameters, the best
    cross-validation score and a classification report are printed.

    ``GridSearchCV``, ``roc_curve``, ``auc`` and ``classification_report``
    must be imported from scikit-learn before this function is called.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels. The ROC curve is built from the second column of
        ``predict_proba``, so the target is expected to be binary.
    model : estimator
        Estimator handed to ``GridSearchCV``; it must expose ``predict_proba``.
    params : dict or list of dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are plotted and printed only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model_cv = GridSearchCV(model, param_grid=params, cv=5)
    model_cv.fit(X_train, y_train)

    y_pred = model_cv.predict(X_test)
    # Probability assigned to the second class, used as the ROC score.
    y_pred_prob = model_cv.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve against the diagonal reference line.
    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], 'k--')
    ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_title('ROC Curve')
    ax.legend(loc="lower right")
    plt.show()

    # Print the optimal parameters and best score
    print("Tuned Hyperparameter(s): {}".format(model_cv.best_params_))
    print("Tuned Accuracy Score: {}".format(model_cv.best_score_))
    print(classification_report(y_test, y_pred))

## 6.0.3 Creating/Splitting the data:

In [None]:
# Cast the target column to integers.
# NOTE(review): if sales_value holds fractional amounts, astype(int) discards
# the fractional part — confirm that this loss of precision is intended.
df_training['sales_value'] = df_training['sales_value'].astype(int)
# Display the resulting dtype as the cell output.
df_training['sales_value'].dtypes

In [None]:
# Separate the encoded frame into the feature matrix X (every column except
# sales_value) and the target vector y (sales_value), both as NumPy arrays.
target_column = 'sales_value'
X = df_training.drop(columns=[target_column]).to_numpy()
y = df_training[target_column].to_numpy()
print(X.shape, y.shape)

## 6.1.0 Linear Models:

### 6.1.1 Linear Regression - To predict product prices

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
# Total number of rows, followed by the fraction that landed in the train and test partitions.
n_samples = X.shape[0]
print(n_samples)
for partition in (X_train, X_test):
    print(len(partition) / n_samples)

In [None]:
# Fit an ordinary least-squares model on the training partition.
model = LinearRegression().fit(X_train, y_train)

# Cell output: the regressor's score (R^2) on the held-out partition.
model.score(X_test, y_test)

In [None]:
# Compare predictions with the actual values for the first 50 test rows.
# The whole slice is predicted in one call, and the loop variables are named
# so that they do not overwrite the notebook-level X / y arrays.
sample_preds = model.predict(X_test[:50])
for pred, actual in zip(sample_preds, y_test[:50]):
    print(f"Model: {pred}, Actual: {actual}, Percentile: {round((pred/actual)*100.00, 2)}%")

## 6.2 Classification

### 6.2.1 Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Shallow random-forest baseline: 100 trees, each limited to depth 2.
rfc_b = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100, verbose=True)
rfc_b.fit(X_train, y_train)

# Accuracy on the training rows versus the held-out rows.
# accuracy_score is provided by sklearn.metrics (imported in the import cell at the top).
y_pred = rfc_b.predict(X_train)
y_pred_test = rfc_b.predict(X_test)
print('Train accuracy score:', accuracy_score(y_train, y_pred))
print('Test accuracy score:', accuracy_score(y_test, y_pred_test))

In [None]:
# Compare predictions with the actual values for the first 100 test rows.
# The whole slice is predicted in one call, and the loop variables are named
# so that they do not overwrite the notebook-level X / y arrays.
sample_preds = rfc_b.predict(X_test[:100])
for pred, actual in zip(sample_preds, y_test[:100]):
    print(f"Model: {pred}, Actual: {actual}, Percentile: {round((pred/actual)*100.00, 2)}%")

### 6.2.2 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Shallow random-forest *regressor*: 100 trees, each limited to depth 2.
# This cell previously re-trained the classifier from the section above.
# The variable keeps the name rfc_b because the following cell refers to the
# fitted model by that name.
rfc_b = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100, verbose=True)
rfc_b.fit(X_train, y_train)
y_pred = rfc_b.predict(X_train)

# A regressor's .score() is the coefficient of determination (R^2); accuracy
# is a classification metric and does not apply to continuous predictions.
print('Train R^2 score:', rfc_b.score(X_train, y_train))
print('Test R^2 score:', rfc_b.score(X_test, y_test))

In [None]:
# Compare predictions with the actual values for the first 100 test rows.
# The whole slice is predicted in one call, and the loop variables are named
# so that they do not overwrite the notebook-level X / y arrays.
sample_preds = rfc_b.predict(X_test[:100])
for pred, actual in zip(sample_preds, y_test[:100]):
    print(f"Model: {pred}, Actual: {actual}, Percentile: {round((pred/actual)*100.00, 2)}%")

### 6.2.3 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a Gaussian naive Bayes classifier on the training partition.
gnb = GaussianNB().fit(X_train, y_train)

# Cell output: mean accuracy on the held-out partition.
gnb.score(X_test, y_test)

In [None]:
# Compare predictions with the actual values for the first 100 test rows.
# The whole slice is predicted in one call, and the loop variables are named
# so that they do not overwrite the notebook-level X / y arrays.
sample_preds = gnb.predict(X_test[:100])
for pred, actual in zip(sample_preds, y_test[:100]):
    print(f"Model: {pred}, Actual: {actual}, Percentile: {round((pred/actual)*100.00, 2)}%")

## 6.3 Clustering

### 6.3.1 KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the training rows into 4 groups. KMeans is unsupervised: it accepts a
# y argument only for API consistency and ignores it, so none is passed here.
kmeans = KMeans(n_clusters=4, random_state=0, verbose=0).fit(X_train)

# Cell output: the opposite of the K-means objective evaluated on the test rows.
kmeans.score(X_test)

In [None]:
# Show the cluster assigned to each of the first 100 test rows next to the
# row's actual target value. The whole slice is assigned in one call, and the
# loop variables are named so that they do not overwrite the notebook-level
# X / y arrays.
# NOTE(review): cluster ids are arbitrary labels, so the ratio printed as
# "Percentile" has no quantitative meaning — confirm whether it is wanted here.
sample_clusters = kmeans.predict(X_test[:100])
for cluster, actual in zip(sample_clusters, y_test[:100]):
    print(f"Model: {cluster}, Actual: {actual}, Percentile: {round((cluster/actual)*100.00, 2)}%")

### 6.3.2 Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# Spectral clustering of the training rows into 2 groups.
# NOTE(review): spectral clustering builds a pairwise affinity matrix over all
# fitted rows; on a large X_train this may not fit in memory — consider
# fitting on a sample.
clustering = SpectralClustering(n_clusters=2, assign_labels="discretize", random_state=0)
clustering.fit(X_train)

# SpectralClustering exposes neither score() nor predict(); the assignments of
# the fitted rows are available as labels_. Cell output: rows per cluster.
np.bincount(clustering.labels_)

In [None]:
# SpectralClustering cannot assign unseen rows (it has no predict method), so
# the preview uses the labels of the rows it was fitted on (X_train) next to
# the matching training targets. The loop variables are named so that they do
# not overwrite the notebook-level X / y arrays.
for cluster, actual in zip(clustering.labels_[:100], y_train[:100]):
    print(f"Model: {cluster}, Actual: {actual}, Percentile: {round((cluster/actual)*100.00, 2)}%")

### 6.3.3 GMM

In [None]:
# sklearn.mixture.GMM was removed in scikit-learn 0.20; GaussianMixture is its
# replacement. The alias keeps the name GMM available to later cells.
# NOTE(review): GaussianMixture renamed some constructor arguments (for example
# n_iter became max_iter) — check any later GMM(...) call against the current API.
from sklearn.mixture import GaussianMixture as GMM