In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn,
# so removed APIs only show up later as hard errors.
warnings.filterwarnings('ignore')

In [None]:
# Execute the companion notebooks inside this kernel's namespace.
# NOTE(review): `preprocess`, `preprocess_text_feature`, `TEXT_FEATURES`, `cross_val_report`
# and `chi2_select_features` are used below but never defined in this notebook --
# presumably they are provided by these two notebooks; confirm.
%run TextPreprocessing.ipynb
%run HelperFunctions.ipynb

In [None]:
# Load datasets
# Both splits are parsed with the same reader options.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=False, delimiter=',', header=0)
    for csv_path in (r"book_rating_train.csv", r"book_rating_test.csv")
)
# Row-wise stack of both splits; used below to fit the categorical encoders
entire_df = pd.concat([train_df, test_df])

In [None]:
DATASET_DIR = "./datasets"  # directory the preprocessed CSV files are written to and read back from
CLASS_LABEL = "rating_label"  # name of the target column

# 1. Data Preprocessing

Note: Here we show the individual preprocessing steps for the training set for clarity. At the end we pull everything together to preprocess the test set.

## Preprocess String features: Authors and Publisher

In [None]:
CATEGORICAL_FEATURES = ['Authors', 'Publisher']  # string columns that are one-hot / ordinal encoded below
MISSING_CAT_VAL = ''  # value used to impute missing entries in the categorical columns

In [None]:
train_df = train_df.drop(columns='Language')  # this feature has too many missing values

# Clean the categorical columns of both the training frame and the combined frame,
# so the encoders fitted on `entire_df` see the same normalised strings.
for frame in (train_df, entire_df):
    for column in CATEGORICAL_FEATURES:
        imputed = frame[column].fillna(MISSING_CAT_VAL)  # impute missing values
        frame[column] = imputed.apply(
            lambda text: preprocess(text, stop_words_removal=False, lemmatize=False, min_word_len=0)
        )

### Version 1: One-Hot Encoding
Some models will use different encodings of the categorical features.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder for the categorical columns, fitted on train + test together.
# Categories occurring fewer than 3 times are pooled into one "infrequent" column,
# which is also where categories unknown at transform time are mapped.
_OHE_OPTIONS = dict(handle_unknown='infrequent_if_exist', min_frequency=3)
try:
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4
    OHE = OneHotEncoder(sparse_output=False, **_OHE_OPTIONS)
except TypeError:
    # older scikit-learn (< 1.2) only accepts the original keyword
    OHE = OneHotEncoder(sparse=False, **_OHE_OPTIONS)
OHE.fit(entire_df[CATEGORICAL_FEATURES])

In [None]:
def ohe_transform(df):
    """
    Transforms the dataset by one-hot-encoding, on the categorical features only.

    The columns listed in CATEGORICAL_FEATURES are encoded with the fitted `OHE`
    encoder; the encoded columns are appended to the right of the remaining
    columns and the original categorical columns are removed.

    - df: DataFrame containing (at least) the CATEGORICAL_FEATURES columns.
    Returns the transformed DataFrame (with a fresh 0..n-1 index); `df` itself is not modified.
    """
    # Build the encoded frame with its column names directly.
    # (set_axis(..., inplace=False) no longer works: the `inplace` argument was removed in pandas 2.0.)
    encoded_df = pd.DataFrame(OHE.transform(df[CATEGORICAL_FEATURES]),
                              columns=OHE.get_feature_names_out())

    # The encoded frame has a default integer index, so reset the input's index to align rows positionally.
    combined_df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)

    return combined_df.drop(columns=CATEGORICAL_FEATURES)  # drop the original attributes

In [None]:
ALPHA = 0.05  # significance threshold applied to the chi-squared p-values

# Select one-hot-encoded features using chi2.
# k='all' keeps every column: the selector is only used to obtain per-feature p-values.
x2 = SelectKBest(chi2, k='all')
x2.fit(ohe_transform(train_df)[OHE.get_feature_names_out()], train_df[CLASS_LABEL])
pvals = pd.DataFrame({'p-value': x2.pvalues_}, index=x2.feature_names_in_)

# Split the encoded columns by significance (a NaN p-value satisfies neither comparison,
# so such a column ends up in neither list).
INSIG_OHE_FEATURES = pvals.index[pvals['p-value'] >= ALPHA].tolist()  # insignificant encoded features
sig_ohe_features = pvals.index[pvals['p-value'] < ALPHA].tolist()
print(len(sig_ohe_features), "features are significant.")
sig_ohe_features

### Version 2: Ordinal Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal encoder for the categorical columns, fitted on train + test together;
# categories not seen during fitting are encoded as -1.
ORD_ENCODER = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
).fit(entire_df[CATEGORICAL_FEATURES])
ORD_ENCODER

In [None]:
def ord_enc_transform(df):
    """
    Transforms the dataset by ordinal-encoding, on the categorical features only.

    Each column in CATEGORICAL_FEATURES is replaced by the integer codes produced by
    the fitted `ORD_ENCODER`. Values the encoder marks as unknown (-1) are re-coded to
    the first unused non-negative integer of that column.

    - df: DataFrame containing (at least) the CATEGORICAL_FEATURES columns.
    Returns the transformed DataFrame; `df` itself is not modified.
    """
    encoded_df = df.copy()
    encoded_df[CATEGORICAL_FEATURES] = ORD_ENCODER.transform(encoded_df[CATEGORICAL_FEATURES])
    for feature, categories in zip(CATEGORICAL_FEATURES, ORD_ENCODER.categories_):
        # some models (e.g. CategoricalNB), cannot handle negative values, so we replace -1
        # with the next unused int (= number of known categories) to denote unseen values
        encoded_df[feature] = encoded_df[feature].replace(-1, categories.size)

    return encoded_df

## Discretize 'Numerical' features

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize(df, feature, discretizer=None, strategy='kmeans', n_bins=5):
    """
    Bins a single column of the DataFrame into ordinal codes.

    - df: the input DataFrame (left untouched; a copy is returned).
    - feature: name of the column to discretize.
    - discretizer: an already-fitted discretizer to reuse;
      if None, a KBinsDiscretizer (ordinal encoding) is created with the given
      strategy / n_bins and fitted on this column first.
    - strategy: binning strategy for a newly created discretizer
      (one of ['kmeans', 'quantile', 'uniform']).
    - n_bins: number of bins for a newly created discretizer.
    Returns a tuple (transformed DataFrame, discretizer used).
    """
    column_frame = df[[feature]]  # keep it 2-D, as the discretizer expects a table

    # Only fit when the caller did not hand in a discretizer
    if discretizer is None:
        discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
        discretizer.fit(column_frame)

    binned_df = df.copy()
    binned_df[feature] = discretizer.transform(column_frame)
    return binned_df, discretizer

In [None]:
# Binning configuration per numeric column: (column, strategy, number of bins)
BINNING_SPECS = [
    ('PublishYear', 'uniform', 15),
    ('PublishMonth', 'quantile', 6),
    ('PublishDay', 'quantile', 11),
    ('pagesNumber', 'kmeans', 7),
]

# Discretize the columns one after another, keeping each fitted discretizer
fitted_discretizers = {}
for column, strategy, n_bins in BINNING_SPECS:
    train_df, fitted_discretizers[column] = discretize(train_df, column, strategy=strategy, n_bins=n_bins)

# Expose the fitted discretizers under the names used when preprocessing the test set
DISCRETIZER_PY = fitted_discretizers['PublishYear']
DISCRETIZER_PM = fitted_discretizers['PublishMonth']
DISCRETIZER_PD = fitted_discretizers['PublishDay']
DISCRETIZER_PN = fitted_discretizers['pagesNumber']

## Text Pre-processing

In [None]:
# Vectorise the two free-text columns. Each call returns a pair: the resulting frame
# (concatenated onto train_df below) and the vectorizer, which is reused for the test set.
# NOTE(review): `preprocess_text_feature` is defined outside this notebook -- the exact
# meaning of `ngram` and `delimiter` is assumed from their names; confirm against the helper.
train_df_name, VECTORIZER_NAME = preprocess_text_feature(train_df, 'Name', ngram=2, delimiter='_')  # can try ngram=1
train_df_desc, VECTORIZER_DESC = preprocess_text_feature(train_df, 'Description', ngram=2)

In [None]:
# Bind them together: the base frame (index reset, presumably so rows align positionally
# with the vectorised frames) plus both text-feature frames, side by side.
# The original text columns are dropped in the same step.
train_df = pd.concat(
    [train_df.reset_index(drop=True), train_df_name, train_df_desc],
    axis='columns',
).drop(columns=TEXT_FEATURES)

In [None]:
# Finally, move rating_label to the last column (converted to a categorical on the way)
labels = train_df[CLASS_LABEL].astype('category')
train_df = train_df.drop(columns=[CLASS_LABEL]).assign(**{CLASS_LABEL: labels})

## Altogether

In [None]:
def preprocess_test_df(df):
    """
    Runs the complete preprocessing pipeline on a dataset, reusing the discretizers
    and vectorizers that were fitted on the training set.

    - df: raw DataFrame with the same feature columns as the raw training set
      (including 'Language', which is discarded here).
    Returns the preprocessed DataFrame; `df` itself is not modified.
    Note: the categorical columns are cleaned but NOT encoded -- apply
    ohe_transform / ord_enc_transform afterwards.
    """
    df = df.copy()

    # preprocess string features: impute, then normalise the text
    for feature in CATEGORICAL_FEATURES:
        imputed = df[feature].fillna(MISSING_CAT_VAL)  # imputation
        df[feature] = imputed.apply(
            lambda x: preprocess(str(x), stop_words_removal=False, lemmatize=False, min_word_len=0)
        )

    # preprocess numerical features with the discretizers fitted on the training set
    fitted_discretizers = (
        ('PublishDay', DISCRETIZER_PD),
        ('PublishMonth', DISCRETIZER_PM),
        ('PublishYear', DISCRETIZER_PY),
        ('pagesNumber', DISCRETIZER_PN),
    )
    for feature, fitted in fitted_discretizers:
        df, _ = discretize(df, feature, discretizer=fitted)

    # preprocess text features with the vectorizers fitted on the training set
    df_name, _ = preprocess_text_feature(df, 'Name', vectorizer=VECTORIZER_NAME, delimiter='_')
    df_desc, _ = preprocess_text_feature(df, 'Description', vectorizer=VECTORIZER_DESC)

    # attach the text features, then discard the obsolete original features and unwanted features
    combined = pd.concat([df.reset_index(drop=True), df_name, df_desc], axis=1)
    return combined.drop(['Name', 'Description', 'Language'], axis=1)

In [None]:
# For simplicity, we save the preprocessed datasets
os.makedirs(DATASET_DIR, exist_ok=True)  # to_csv does not create a missing output directory
train_df.to_csv(DATASET_DIR + "/train_df.csv", index=False)

# Encoded versions
train_df_ohe = ohe_transform(train_df).drop(INSIG_OHE_FEATURES, axis=1)  # drop the 'insignificant' OHE features
# ohe_transform appends the encoded columns to the right of the existing ones, i.e. after the label.
# Put the label back at the end so "last column = target" holds for the one-hot version as well.
train_df_ohe = train_df_ohe[[col for col in train_df_ohe.columns if col != CLASS_LABEL] + [CLASS_LABEL]]
train_df_oe = ord_enc_transform(train_df)
train_df_ohe.to_csv(DATASET_DIR + "/train_df_ohe.csv", index=False)
train_df_oe.to_csv(DATASET_DIR + "/train_df_oe.csv", index=False)

# 2. Building the models

For the first 3 models, we use the one-hot-encoded dataset.

In [None]:
#train_df_ohe = pd.read_csv(DATASET_DIR + "/train_df_ohe.csv", keep_default_na=False)
# Split features / target by column name rather than by position: ohe_transform appends the
# encoded columns after the label, so a positional split would leak the label into the
# features and use an encoded column as the target.
X_train = train_df_ohe.drop(columns=[CLASS_LABEL])
y_train = train_df_ohe[CLASS_LABEL]

## 1) Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Uses the Stochastic Average Gradient descent solver. SAG shuffles the data, so the seed is
# fixed (same value as the other models in this notebook) to make the fit reproducible.
logr = LogisticRegression(solver='sag', max_iter=500, random_state=30027)

In [None]:
# Model-based feature selection: SelectFromModel fits its own clone of `logr` on the
# training data. (prefit=True would require an already-fitted estimator, and the
# previously referenced X / y do not exist in this notebook.)
selector = SelectFromModel(logr).fit(X_train, y_train)
LR_FEATURES = X_train.columns[selector.get_support()].tolist()

In [None]:
# Fit and evaluate on the same, selected feature subset
logr.fit(X_train[LR_FEATURES], y_train)
y_pred_logr, logr_report = cross_val_report(logr, X_train[LR_FEATURES], y_train)

## 2) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# chi2-selected features without 'PublishDay'.
# (list.remove() works in place and returns None, so its result must not be assigned;
# filtering also tolerates 'PublishDay' not being among the selected features.)
RF_FEATURES = [feature for feature in chi2_select_features(X_train, y_train) if feature != 'PublishDay']
rf = RandomForestClassifier(random_state=30027,
                            max_samples=0.3, max_features='log2',
                            criterion='entropy', n_estimators=100,
                            oob_score=True)

In [None]:
rf.fit(X_train[RF_FEATURES], y_train)
# Keep both results of the cross-validated evaluation, then display the report
y_pred_rf, rf_report = cross_val_report(rf, X_train[RF_FEATURES], y_train)
rf_report

In [None]:
# Out-of-bag score of the fitted forest (available because it was built with oob_score=True)
rf.oob_score_

## 3) StackingClassifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import StackingCVClassifier

In [None]:
# Base learners of the stack
# Gaussian NB
gnb = GaussianNB()
# Linear SVM
linearSVM = svm.LinearSVC(random_state=30027, C=1)
# Logistic Regression (SAG is stochastic -> seeded like the other base learners for reproducibility)
logr = LogisticRegression(solver='sag', max_iter=100, random_state=30027)
# Decision tree
dt = DecisionTreeClassifier(max_depth = 400, random_state = 30027)
# 3NN
knn = KNeighborsClassifier(n_neighbors=3)

# Stack: the base learners' cross-validated (2-fold) predictions feed a logistic-regression meta-classifier
sclf = StackingCVClassifier(classifiers=[gnb, linearSVM, logr, dt, knn],
                            meta_classifier=LogisticRegression(),
                            cv=2,
                            random_state=30027)

sclf.fit(X_train, y_train)

In [None]:
# Evaluate the stacking classifier on the one-hot-encoded training data and display the report.
# NOTE(review): `cross_val_report` is defined outside this notebook; it is assumed to return
# (predictions, report) based on how its result is unpacked -- confirm.
y_pred_sclf, sclf_report = cross_val_report(sclf, X_train, y_train)
sclf_report

## 4) Stacking Naive Bayes

In [None]:
# Reload the ordinal-encoded training set written earlier; keep_default_na=False stops
# pandas from turning empty / NA-like strings into NaN while parsing.
train_df_oe = pd.read_csv(DATASET_DIR + "/train_df_oe.csv", keep_default_na=False)
# Positional split: everything but the last column is a feature, the last column is the target
X_train_oe, y_train_oe = train_df_oe.iloc[:, :-1], train_df_oe.iloc[:, -1]

In [None]:
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB

In [None]:
# Feature groups, one per Naive Bayes variant
GAUS_FEATURES = ['pagesNumber', 'PublishYear']
CAT_FEATURES = ['Authors']
# only filtering the text features (columns from position 6 onwards of the ordinal-encoded frame);
# the frame must be referenced by its actual name -- there is no `X` in this notebook
MN_FEATURES = chi2_select_features(X_train_oe[X_train_oe.columns[6:]], y_train_oe)
N_AUTHORS_VALS = len(ORD_ENCODER.categories_[0])  # number of unique Authors categories

X_train_oe = X_train_oe[GAUS_FEATURES + CAT_FEATURES + MN_FEATURES]
X_train_oe

In [None]:
# One Naive Bayes variant per feature group; each pipeline first picks its own columns.
gnb = make_pipeline(ColumnSelector(GAUS_FEATURES), GaussianNB())
cnb = make_pipeline(
    ColumnSelector(CAT_FEATURES),
    CategoricalNB(min_categories=N_AUTHORS_VALS, alpha=0.11),
)
mnb = make_pipeline(ColumnSelector(MN_FEATURES), MultinomialNB(alpha=0.26))

# Stack the three pipelines under a categorical Naive Bayes meta-classifier
cvsnb = StackingCVClassifier(
    classifiers=[gnb, cnb, mnb],
    meta_classifier=CategoricalNB(),
    random_state=30027,
)

cvsnb.fit(X_train_oe, y_train_oe)

In [None]:
# Evaluate the stacked Naive Bayes model on the ordinal-encoded training data and display the report.
# NOTE(review): `cross_val_report` is defined outside this notebook; it is assumed to return
# (predictions, report) based on how its result is unpacked -- confirm.
y_pred_cvsnb, cvsnb_report = cross_val_report(cvsnb, X_train_oe, y_train_oe)
cvsnb_report

# 3. Final predictions on the test set

In [None]:
# Run the test set through the pipeline defined in the "Altogether" section.
# (The function defined in this notebook is `preprocess_test_df`; `preprocess_df` is not defined here.)
test_df = preprocess_test_df(test_df)
test_df_ohe = ohe_transform(test_df)

In [44]:
# One-hot encode the preprocessed test set and discard the encoded columns
# that the chi-squared test found insignificant on the training set
X_test = ohe_transform(test_df).drop(columns=INSIG_OHE_FEATURES)

In [45]:
# Predict the test-set labels with the stacking classifier fitted above
sclf.predict(X_test)

NameError: name 'snb' is not defined