In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

# load the exchange-rate table and the two challenge datasets
currency_df = pd.read_csv('currency_conversion_rates.csv')
training_df = pd.read_csv('Prediction Challenge Training Data_Updated.csv')
testing_df = pd.read_csv('Prediction Challenge Testing Data_Updated.csv')

# each raw price column is split into a currency column and a float amount column
price_columns = (
    ('discount_price', 'discount_price_currency', 'discount_price_amount'),
    ('actual_price', 'actual_price_currency', 'actual_price_amount'),
)

# identical parsing for both price columns of both datasets
for price_frame in (training_df, testing_df):
    for raw_column, currency_column, amount_column in price_columns:
        raw_prices = price_frame[raw_column]
        # currency = first run of ASCII letters found in the price string
        price_frame[currency_column] = raw_prices.str.extract(r'([A-Za-z]+)', expand=False)
        # amount = first run of digits, dots and commas found in the price string
        amount_text = raw_prices.str.extract(r'([\d.,]+)', expand=False)
        # commas are removed before the text is converted to float
        price_frame[amount_column] = amount_text.str.replace(',', '').astype(float)

# conversion rate = 100 / '100_USD_worth'
# (presumably the USD value of one unit of each currency - confirm against the rates file)
currency_df['conversion_rate'] = 100 / currency_df['100_USD_worth']

# lookup table: currency code -> conversion rate
currency_rates = dict(zip(currency_df['Currency_Code'], currency_df['conversion_rate']))

# (currency column, amount column, resulting USD column) for each kind of price
usd_columns = (
    ('discount_price_currency', 'discount_price_amount', 'discount_price_usd'),
    ('actual_price_currency', 'actual_price_amount', 'actual_price_usd'),
)

# convert every price in both datasets to USD
for usd_frame in (training_df, testing_df):
    for currency_column, amount_column, usd_column in usd_columns:
        # currencies missing from the lookup table map to NaN, so their USD price is NaN too
        usd_frame['conversion_rate'] = usd_frame[currency_column].map(currency_rates)
        usd_frame[usd_column] = usd_frame[amount_column] * usd_frame['conversion_rate']
        # the per-row rate is only a scratch column; remove it again
        usd_frame.drop(columns='conversion_rate', inplace=True)

# handle missing values (more info in project report)
# Numeric columns: the fill value is the median of the TRAINING data and the
# same value is reused for the testing data, so a missing rating is turned
# into the same number in both datasets (the testing data no longer uses its
# own statistics).
for numeric_column in ('ratings', 'no_of_ratings'):
    training_median = training_df[numeric_column].median()
    training_df[numeric_column] = training_df[numeric_column].fillna(training_median)
    testing_df[numeric_column] = testing_df[numeric_column].fillna(training_median)

# Categorical columns: a missing category becomes its own 'Unknown' label.
for category_column in ('main_category', 'sub_category'):
    training_df[category_column] = training_df[category_column].fillna('Unknown')
    testing_df[category_column] = testing_df[category_column].fillna('Unknown')

# calculate discount ratios (more info in project report)
for discount_frame in (training_df, testing_df):
    list_price_usd = discount_frame['actual_price_usd']
    sale_price_usd = discount_frame['discount_price_usd']

    # discount as a percentage of the actual price; an actual price of 0 makes
    # the division produce +/-inf, which is stored as NaN instead so it is
    # handled like any other missing value rather than reaching the models
    # as infinity
    discount_frame['discount_percentage'] = (
        (list_price_usd - sale_price_usd) / list_price_usd * 100
    ).replace([np.inf, -np.inf], np.nan)

    # absolute discount in USD
    discount_frame['discount_amount'] = list_price_usd - sale_price_usd

# weight of the squared rating count in the weighted rating
a = 0.1

for rated_frame in (training_df, testing_df):
    # weighted rating = ratings + a * (no_of_ratings squared)
    rated_frame['weighted_rating'] = rated_frame['ratings'] + a * rated_frame['no_of_ratings'].pow(2)

    # the price after discount is simply the discounted price in USD
    rated_frame['price_after_discount'] = rated_frame['discount_price_usd']

# simple price range categories for low/medium/high
def categorize_price(row):
    """Return the price band ('Low', 'Medium' or 'High') for one row.

    The band is chosen from ``row['price_after_discount']``: values below 20
    are 'Low', values below 100 are 'Medium', and everything else is 'High'.
    A NaN price fails both comparisons and therefore also yields 'High'.
    """
    price = row['price_after_discount']
    # bands are checked from the cheapest upwards; the first match wins
    for upper_bound, label in ((20, 'Low'), (100, 'Medium')):
        if price < upper_bound:
            return label
    return 'High'
        
# label every row of both datasets with its price band
for banded_frame in (training_df, testing_df):
    banded_frame['price_range'] = banded_frame.apply(categorize_price, axis=1)

# combine the data so each encoder is fitted on the categories of both datasets
combined_df = pd.concat([training_df, testing_df], sort=False)

# create label encoders for each categorical feature
label_encoder_main = LabelEncoder()
label_encoder_sub = LabelEncoder()
label_encoder_price_range = LabelEncoder()

# encode the main and sub categories into numerical labels
combined_df['main_category_encoded'] = label_encoder_main.fit_transform(combined_df['main_category'])
combined_df['sub_category_encoded'] = label_encoder_sub.fit_transform(combined_df['sub_category'])

# encode the price ranges into numerical labels
# NOTE: LabelEncoder numbers its classes in sorted order, so when all three
# labels occur the codes are High = 0, Low = 1, Medium = 2 - they are NOT
# ordered by price
combined_df['price_range_encoded'] = label_encoder_price_range.fit_transform(combined_df['price_range'])

# split the combined data back into training and testing sets based on the 'purchase?' column;
# .copy() makes each split an independent DataFrame, so the 'purchase?' column that is
# assigned on testing_df later on is written to a real frame and not to a slice of combined_df
training_df = combined_df.loc[combined_df['purchase?'].notnull()].copy()  # rows where 'purchase?' is not null
testing_df = combined_df.loc[combined_df['purchase?'].isnull()].copy()    # rows where 'purchase?' is null

# create a final list of features to be used by the models
features = [
    'discount_percentage',      # percentage discount offered
    'discount_amount',          # absolute discount amount in USD
    'weighted_rating',          # product rating weighted by number of ratings
    'main_category_encoded',    # encoded main category of the product
    'sub_category_encoded',     # encoded sub-category of the product
    'price_range_encoded'       # encoded price range after discount
]

# feature matrices for the training (X) and testing data (X_test)
X = training_df[features]
X_test = testing_df[features]

# target vector y where purchase = 1 and no purchase = 0
y = training_df['purchase?'].map({'YES': 1, 'NO': 0})

# split the training data into training and validation sets BEFORE imputing,
# so the fill values below are learned from the training split only
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=100  # 20% of the data will be used for validation
)

# fill missing values with the per-column medians of the training split;
# the same medians are applied to every matrix so that validation and testing
# rows are imputed exactly like training rows and their own statistics never
# influence the fill values
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)

# hyperparameter grid for Random Forest (more explanation in project report)
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced']
}

# 3-fold grid search, scored by F1, to pick the Random Forest hyperparameters
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=100),
    param_grid=rf_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

# hyperparameter grid for Decision Tree
dt_params = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced']
}

# 3-fold grid search, scored by F1, to pick the Decision Tree hyperparameters
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=100),
    param_grid=dt_params,
    cv=3,
    scoring='f1',
    n_jobs=-1,
)
dt_grid_search.fit(X_train, y_train)
best_dt_model = dt_grid_search.best_estimator_

# all the models to be run; estimators that accept a seed use an arbitrary
# fixed random_state, and the two tree-based entries are the tuned
# estimators found by the grid searches above
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Perceptron': Perceptron(max_iter=1000, random_state=100),
    'Ridge Classifier': RidgeClassifier(random_state=100),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Decision Tree': best_dt_model,
    'Random Forest': best_rf_model,
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# predictions and validation metrics of every model, keyed by model name
model_predictions = {}

# fit, score and predict with each model in turn
for model_name, model in models.items():

    # train on the training split, then predict the validation split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # validation metrics (the order of the tuple matches the names on the left)
    accuracy, precision, recall, f1 = (
        score(y_val, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # report the model's performance
    print(
        f'Model: {model_name}',
        f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}',
        '-' * 50,
        sep='\n',
    )

    # predict the testing data and store the result as YES/NO labels;
    # the column is overwritten on every iteration, so after the loop
    # testing_df holds the predictions of the last model only
    y_test_pred = model.predict(X_test)
    testing_df['purchase?'] = y_test_pred
    testing_df['purchase?'] = testing_df['purchase?'].map({1: 'YES', 0: 'NO'})

    # the two columns that make up a prediction file
    prediction_df = testing_df[['item_id', 'purchase?']]

    # writing one CSV per model is currently disabled
    # prediction_filename = f'prediction_{model_name.replace(" ", "_")}.csv'
    # prediction_df.to_csv(prediction_filename, index=False)

    # keep this model's predictions and metrics
    model_predictions[model_name] = dict(
        y_val_pred=y_pred,
        y_test_pred=y_test_pred,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
    )

# define a simple rule-based prediction model (freestyle)
def simple_rule(df):
    """Add a rule-based 'purchase_pred' column to *df* and return *df*.

    A row is predicted as a purchase (1) only when its 'discount_percentage'
    is above 20 and its 'weighted_rating' is above 10; every other row,
    including rows where either value is NaN, is predicted as 0. The column
    is written onto the DataFrame that was passed in (no copy is made).
    """
    big_discount = df['discount_percentage'] > 20
    well_rated = df['weighted_rating'] > 10
    # 1 = YES where both conditions hold, 0 = NO everywhere else
    df['purchase_pred'] = np.where(big_discount & well_rated, 1, 0)
    return df

# run the rule on a copy of the validation features so X_val itself stays unchanged
X_val_simple = simple_rule(X_val.copy())
y_pred_simple = X_val_simple['purchase_pred']

# score the rule with the same four metrics used for the other models
accuracy_simple, precision_simple, recall_simple, f1_simple = (
    score(y_val, y_pred_simple)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# report the simple model's performance
print(
    'Simple Freestyle Prediction Model',
    f'Accuracy: {accuracy_simple:.4f}, Precision: {precision_simple:.4f}, Recall: {recall_simple:.4f}, F1 Score: {f1_simple:.4f}',
    '-' * 50,
    sep='\n',
)


Model: Naive Bayes
Accuracy: 0.8061, Precision: 0.5752, Recall: 0.0042, F1 Score: 0.0083
--------------------------------------------------
Model: Logistic Regression
Accuracy: 0.8058, Precision: 0.4091, Recall: 0.0006, F1 Score: 0.0012
--------------------------------------------------
Model: Perceptron
Accuracy: 0.1930, Precision: 0.1923, Recall: 0.9868, F1 Score: 0.3219
--------------------------------------------------
Model: Ridge Classifier
Accuracy: 0.8059, Precision: 0.5122, Recall: 0.0014, F1 Score: 0.0027
--------------------------------------------------
Model: Linear Discriminant Analysis
Accuracy: 0.8052, Precision: 0.4472, Recall: 0.0161, F1 Score: 0.0311
--------------------------------------------------
Model: Quadratic Discriminant Analysis
Accuracy: 0.8053, Precision: 0.4350, Recall: 0.0106, F1 Score: 0.0206
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8279, Precision: 0.5412, Recall: 0.7427, F1 Score: 0.6262
--------------------