# ML Product Recommendation
Note to self, using venv in examples folder on laptop
## 1. Imports and Loading the Data

In [112]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

data = pd.read_csv('user_personalized_features.csv')

# Separate dataset into feature matrix and target vector
X = data.drop(['Product_Category_Preference', 'Unnamed: 0', 'User_ID'], axis=1)    # Axis 0 to drop rows, 1 for cols
y = data['Product_Category_Preference']
print(f'Feature Matrix:\n{X.head()}\n')
print(f'Target Vector:\n{y.head()}')

Feature Matrix:
   Age  Gender  Location  Income   Interests  Last_Login_Days_Ago  \
0   56    Male  Suburban   38037      Sports                    5   
1   46  Female     Rural  103986  Technology                   15   
2   32  Female  Suburban  101942      Sports                   28   
3   60  Female  Suburban   71612     Fashion                   18   
4   25    Male  Suburban   49725      Travel                    2   

   Purchase_Frequency  Average_Order_Value  Total_Spending  \
0                   7                   18            2546   
1                   7                  118             320   
2                   1                  146            3766   
3                   3                  163            4377   
4                   5                  141            4502   

   Time_Spent_on_Site_Minutes  Pages_Viewed  Newsletter_Subscription  
0                         584            38                     True  
1                         432            40           

## 2. Preprocessing

In [113]:
# Check missing values
print(X.isna().sum())
print()
print(y.isna().sum())

# Print data types to see if we have to apply encoding
print(X.dtypes)
print(y.dtype)

# Print data scales to see if we have to apply Scaler
print(X.describe())

Age                           0
Gender                        0
Location                      0
Income                        0
Interests                     0
Last_Login_Days_Ago           0
Purchase_Frequency            0
Average_Order_Value           0
Total_Spending                0
Time_Spent_on_Site_Minutes    0
Pages_Viewed                  0
Newsletter_Subscription       0
dtype: int64

0
Age                            int64
Gender                        object
Location                      object
Income                         int64
Interests                     object
Last_Login_Days_Ago            int64
Purchase_Frequency             int64
Average_Order_Value            int64
Total_Spending                 int64
Time_Spent_on_Site_Minutes     int64
Pages_Viewed                   int64
Newsletter_Subscription         bool
dtype: object
object
               Age         Income  Last_Login_Days_Ago  Purchase_Frequency  \
count  1000.000000    1000.000000          1000.000000   

In [114]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}\n')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')


X_train shape: (900, 12)
y_train shape: (900,)

X_test shape: (100, 12)
y_test shape: (100,)


## 3. Model Training

In [115]:
# Make pipeline to apply scaling and encoding
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Scaling and encoding
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['bool', 'object']).columns.tolist()

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

pca = PCA(n_components=0.90)

# Pipelines implementing the preprocessing and models
logistic_regression_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', pca),
    ('model', LogisticRegression(max_iter=1000)),

])

random_forest_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', pca),
    ('model', RandomForestClassifier(n_estimators=100))
])

gradient_boosting_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', pca),
    ('model', GradientBoostingClassifier(n_estimators=100))
])

svc_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', pca),
    ('model', SVC())
])

knn_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', pca),
    ('model', KNeighborsClassifier())
])

mlp_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pca', pca),
    ('model', MLPClassifier(max_iter=700))
])

In [116]:
# Training the models
logistic_regression_pipeline.fit(X_train, y_train)
random_forest_pipeline.fit(X_train, y_train)
gradient_boosting_pipeline.fit(X_train, y_train)
svc_pipeline.fit(X_train, y_train)
knn_pipeline.fit(X_train, y_train)
mlp_pipeline.fit(X_train, y_train)

# Evaluating models
logistic_score = logistic_regression_pipeline.score(X_test, y_test)
random_forest_score = random_forest_pipeline.score(X_test, y_test)
gradient_boosting_score = gradient_boosting_pipeline.score(X_test, y_test)
svc_score = svc_pipeline.score(X_test, y_test)
knn_score = knn_pipeline.score(X_test, y_test)
mlp_score = mlp_pipeline.score(X_test, y_test)

print(f'logistic regression accuracy score (pre-GridSearch): {logistic_score:.4f}')
print(f'random forest accuracy score (pre-GridSearch): {random_forest_score:.4f}')
print(f'gradient boosting accuracy score (pre-GridSearch): {gradient_boosting_score:.4f}')
print(f'SVC accuracy score (pre-GridSearch): {svc_score:.4f}')
print(f'KNN accuracy score (pre-GridSearch): {knn_score:.4f}')
print(f'MLP accuracy score (pre-GridSearch): {mlp_score:.4f}')

logistic regression accuracy score (pre-GridSearch): 0.2700
random forest accuracy score (pre-GridSearch): 0.2900
gradient boosting accuracy score (pre-GridSearch): 0.2600
SVC accuracy score (pre-GridSearch): 0.1700
KNN accuracy score (pre-GridSearch): 0.2600
MLP accuracy score (pre-GridSearch): 0.2200




In [117]:
# Optimizing the models to improve performance (Takes about 2 mins)
from sklearn.model_selection import GridSearchCV

# Make grids with hyperparameters to change for each model
logistic_regression_grid = {
    'model__C': [0.1, 1, 10, 100],  # changing regularization strength (< is stronger)
    'model__solver': ['liblinear', 'lbfgs']
}

random_forest_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10], 
    'model__min_samples_leaf': [1, 2, 4]
}

gradient_boosting_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.001, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

svc_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'poly', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

knn_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan']
}

mlp_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'model__activation': ['relu', 'tanh'],
    'model__learning_rate': ['constant', 'adaptive'],
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__max_iter': [200, 500, 1000]
}

# Create the GridSearchCV for each model
logistic_search = GridSearchCV(
    logistic_regression_pipeline,
    logistic_regression_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

random_forest_search = GridSearchCV(
    random_forest_pipeline,
    random_forest_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

gradient_boosting_search = GridSearchCV(
    gradient_boosting_pipeline,
    gradient_boosting_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

svc_search = GridSearchCV(
    svc_pipeline,
    svc_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

knn_search = GridSearchCV(
    knn_pipeline,
    knn_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

mlp_search = GridSearchCV(
    mlp_pipeline,
    mlp_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)


# Fitting the models using the GridSearchCV
logistic_search.fit(X_train, y_train)
random_forest_search.fit(X_train, y_train)
gradient_boosting_search.fit(X_train, y_train)
svc_search.fit(X_train, y_train)
knn_search.fit(X_train, y_train)
mlp_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits




In [118]:
# Display best hyperparameters and scores from the GridSearchCV models
print('Logistic Regression best score:', logistic_search.best_score_)
print('Logistic Regression best params:', logistic_search.best_params_)

print('\nRandom Forest best score:', random_forest_search.best_score_)
print('Random Forest best params:', random_forest_search.best_params_)

print('\nGradient Boosting best score:', gradient_boosting_search.best_score_)
print('Gradient Boosting best params:', gradient_boosting_search.best_params_)

print('\nSVC best score:', svc_search.best_score_)
print('SVC best params:', svc_search.best_params_)

print('\nKNN best score:', knn_search.best_score_)
print('KNN best params:', knn_search.best_params_)

print('\nMLP best score:', mlp_search.best_score_)
print('MLP best params:', mlp_search.best_params_)

Logistic Regression best score: 0.2111111111111111
Logistic Regression best params: {'model__C': 10, 'model__solver': 'lbfgs'}

Random Forest best score: 0.22666666666666666
Random Forest best params: {'model__max_depth': 10, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 50}

Gradient Boosting best score: 0.22111111111111112
Gradient Boosting best params: {'model__learning_rate': 0.001, 'model__max_depth': 5, 'model__n_estimators': 50}

SVC best score: 0.2177777777777778
SVC best params: {'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}

KNN best score: 0.21222222222222223
KNN best params: {'model__metric': 'manhattan', 'model__n_neighbors': 7, 'model__weights': 'uniform'}

MLP best score: 0.2177777777777778
MLP best params: {'model__activation': 'relu', 'model__alpha': 0.01, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 1000}


## 4. Model Validation

In [119]:
from sklearn.metrics import accuracy_score

# Predicting with best estimators found using GridSearch
y_pred_lr = logistic_search.best_estimator_.predict(X_test)
y_pred_rf = random_forest_search.best_estimator_.predict(X_test)
y_pred_gb = gradient_boosting_search.best_estimator_.predict(X_test)
y_pred_svc = svc_search.best_estimator_.predict(X_test)
y_pred_knn = knn_search.best_estimator_.predict(X_test)
y_pred_mlp = mlp_search.best_estimator_.predict(X_test)

# Evaluating accuracy of best models found from the GridSearch
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Gradient Boosting Test Accuracy:", accuracy_score(y_test, y_pred_gb))
print("SVC Test Accuracy:", accuracy_score(y_test, y_pred_svc))
print("KNN Test Accuracy:", accuracy_score(y_test, y_pred_knn))
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))


Logistic Regression Test Accuracy: 0.27
Random Forest Test Accuracy: 0.22
Gradient Boosting Test Accuracy: 0.21
SVC Test Accuracy: 0.22
KNN Test Accuracy: 0.29
MLP Accuracy: 0.18
