In [1]:
%run Global.ipynb
%run Text_Preprocessing.ipynb
%run Helper_Functions.ipynb

Documentation:
1. Converted class label to categorical
2. Supplied (original) vectorisers + TF-IDF vectorizer
3. Added preprocessing steps (punctuation removal, casefolding, stop-word removal (including words that are too short), lemmatization) to the text features
4. Encoded the simple string features ('Authors', 'Publisher', 'Language')
5. Dimensionality reduction with TruncatedSVD
6. Bigrams instead of words
7. Feature selection with SelectKBest
8. Standardization of 'pagesNumber'
9. Instead of 8: take the log of 'pagesNumber' after scaling it with MinMaxScaler

Yet to try:
* Discretisation of 'pagesNumber'

In [2]:
# Load datasets
original_train_df = pd.read_csv(r"book_rating_train.csv", index_col = False, delimiter = ',', header=0)
original_test_df = pd.read_csv(r"book_rating_test.csv", index_col = False, delimiter = ',', header=0)
print("Training set size:", len(original_train_df))
print("Test set size:", len(original_test_df))

FileNotFoundError: [Errno 2] No such file or directory: 'book_rating_train.csv'

In [None]:
train_df = original_train_df.copy()
test_df = original_test_df.copy()

# 1. Preprocessing

In [None]:
STRING_FEATURES = ['Authors', 'Publisher', 'Language']  # categorical
NUMERICAL_FEATURES = ['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']  # not necessarily continuous, but are 'numbers'
TEXT_FEATURES = ['Name', 'Description']

In [None]:
# Check class distributions
class_labels, counts = np.unique(original_train_df[CLASS_LABEL], return_counts=True)
for label, count in zip(class_labels, counts):
    print(f"Rating {label}: {count} instances")

## Preprocess String features: Language, Authors, and Publisher

In [None]:
# Summarise cardinality and missingness of each raw string feature
for feature in STRING_FEATURES:
    print(f"There are {original_train_df[feature].nunique()} unique '{feature}' values.")
    print(f"There are {original_train_df[feature].isna().sum()} missing '{feature}' values.\n")

# Too many missing values in 'Language'. Let's drop it.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
train_df = train_df.drop(columns='Language', errors='ignore')

In [None]:
# For Authors and Publisher, we just do a general text preprocessing here
# The models will later choose their own version of preprocessed dataset
CATEGORICAL_FEATURES = ['Authors', 'Publisher']
MISSING_CAT_VAL = ''
# Train + test rows stacked together; used further down to fit the categorical encoders.
# ignore_index=True gives the stacked frame unique row labels (train and test are each numbered from 0).
entire_df = pd.concat([train_df, test_df], ignore_index=True)

for df in [train_df, entire_df]:
    for feature in CATEGORICAL_FEATURES:
        df[feature] = df[feature].fillna(MISSING_CAT_VAL)  # imputation
        # str(x) matches the cleaning done in preprocess_test_df, so train and test values are normalised identically
        df[feature] = df[feature].apply(lambda x: preprocess(str(x), stop_words_removal=False, lemmatize=False, min_word_len=0))
        

### Option 1: One-Hot Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder for the categorical features; per feature, categories seen
# fewer than 3 times are grouped together as 'infrequent'.
_OHE_KWARGS = dict(handle_unknown='infrequent_if_exist', min_frequency=3)
try:
    # current spelling of the "return a dense array" switch
    OHE = OneHotEncoder(sparse_output=False, **_OHE_KWARGS)
except TypeError:
    # older scikit-learn releases only know the previous spelling `sparse`
    OHE = OneHotEncoder(sparse=False, **_OHE_KWARGS)
OHE.fit(entire_df[CATEGORICAL_FEATURES])

In [None]:
def ohe_transform(df, has_labels=False):
    """
    Transforms the dataset by one-hot-encoding (on the categorical features).

    `df` must contain the columns listed in CATEGORICAL_FEATURES; it is not modified.
    `has_labels` indicates whether the DataFrame contains the class labels.
        If True, these are moved to the last column.
    Returns the transformed DataFrame, in which the original categorical columns
    are replaced by the encoder's output columns.
    """
    encoded = OHE.transform(df[CATEGORICAL_FEATURES])
    # Name the columns at construction time: `set_axis(..., inplace=...)` is not accepted by pandas 2.x
    encoded_df = pd.DataFrame(encoded, columns=OHE.get_feature_names_out())
    # reset_index so the input rows and the encoded rows are concatenated position by position
    transformed_df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)
    transformed_df = transformed_df.drop(CATEGORICAL_FEATURES, axis=1)  # drop the original attributes

    if has_labels:
        # move rating_label to the last column
        labels = transformed_df[CLASS_LABEL]
        transformed_df = pd.concat([transformed_df.drop([CLASS_LABEL], axis=1), labels], axis=1)

    return transformed_df

In [None]:
# Score every one-hot-encoded column against the rating with a chi2 test
# (k='all' keeps every column; only the p-values are of interest here)
ohe_columns = OHE.get_feature_names_out()
chi2_selector = SelectKBest(chi2, k='all')
chi2_selector.fit(ohe_transform(train_df)[ohe_columns], train_df[CLASS_LABEL])
pvals = pd.DataFrame({'p-value': chi2_selector.pvalues_}, index=chi2_selector.feature_names_in_)
pvals

In [None]:
# Split the one-hot columns by chi2 p-value against ALPHA; both lists are reused later
p_values = pvals['p-value']
sig_cat_features_ohe = p_values[p_values < ALPHA].index.tolist()
insig_cat_features_ohe = p_values[p_values >= ALPHA].index.tolist()
print(len(sig_cat_features_ohe), "features are significant.")
sig_cat_features_ohe

### Option 2: Ordinal Encoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Integer-code each categorical feature; values unknown to the fitted encoder are encoded as -1
ORD_ENCODER = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(entire_df[CATEGORICAL_FEATURES])
ORD_ENCODER

In [None]:
def ord_enc_transform(df):
    """
    Ordinal-encodes the categorical features with the fitted ORD_ENCODER.
    Returns the encoded copy; the DataFrame passed in is left untouched.
    """
    encoded_df = df.copy()
    encoded_df[CATEGORICAL_FEATURES] = ORD_ENCODER.transform(encoded_df[CATEGORICAL_FEATURES])
    # some models (e.g. CategoricalNB) cannot handle negative values, so the -1 that marks
    # unseen values is swapped for the next unused integer code of that feature
    for feature, categories in zip(CATEGORICAL_FEATURES, ORD_ENCODER.categories_):
        encoded_df[feature] = encoded_df[feature].replace(-1, categories.size)

    return encoded_df

In [None]:
# Check chi2 on the ordinal codes of the categorical features
# NOTE(review): chi2 is designed for non-negative count/frequency features, so p-values
# computed on arbitrary integer codes are only a rough indication.
chi2_selector = SelectKBest(chi2, k='all')
chi2_selector.fit(ord_enc_transform(train_df)[CATEGORICAL_FEATURES], train_df[CLASS_LABEL])
pd.DataFrame(chi2_selector.pvalues_, index=chi2_selector.feature_names_in_, columns=['p-value'])
# Looks like we can keep both features using ordinal encoding

In [None]:
# Number of distinct categories the ordinal encoder learned for each categorical feature
[len(categories) for categories in ORD_ENCODER.categories_]

## Preprocess Numerical features: PublishDates and pagesNumber

In [None]:
# Scatterplots for Publish year, month, and day vs. Rating
for feature in ['PublishYear', 'PublishMonth', 'PublishDay']:
    scatter_vs_rating(original_train_df, feature)

    # Overlay the average rating for each distinct value of the feature.
    # A single groupby replaces one full-frame scan per distinct value.
    avg_rating = original_train_df.groupby(feature)[CLASS_LABEL].mean()
    plt.plot(avg_rating.index, avg_rating.to_numpy(), color='red')
    plt.show()

In [None]:
# Histogram for pages number < 2000
hist_plot(original_train_df.loc[original_train_df['pagesNumber'] < 2000], 'pagesNumber')

In [None]:
scatter_vs_rating(original_train_df, 'pagesNumber')
plt.show()

In [None]:
# Look at the correlation matrix of the numerical attributes
cor_matrix = original_train_df[NUMERICAL_FEATURES + [CLASS_LABEL]].corr()
# A bare expression in the middle of a cell is never rendered, so show the rounded table explicitly
display(cor_matrix.round(2))
plt.figure(figsize=(12, 8))
sns.heatmap(cor_matrix, cmap='cividis', annot=True, linewidths=2)
plt.show()

In [None]:
# Mutual information between each raw numerical feature and the rating.
# Read from original_train_df: the same columns of train_df are replaced by bin ids further down,
# so reading train_df would give different numbers when this cell is re-run.
# Only 'pagesNumber' is treated as continuous; the mask is derived from the feature names
# so it stays correct if NUMERICAL_FEATURES is reordered.
is_discrete = [feature != 'pagesNumber' for feature in NUMERICAL_FEATURES]
mi_scores = mutual_info_classif(
    original_train_df[NUMERICAL_FEATURES],
    original_train_df[CLASS_LABEL],
    discrete_features=is_discrete,
    random_state=0,  # the estimator for continuous features is randomised; pin it for reproducible output
)
pd.Series(mi_scores, index=NUMERICAL_FEATURES, name='mutual information')

In [None]:
# Adjusted mutual information (AMI) between each raw numerical feature and the rating.
# Read from original_train_df: the same columns of train_df are replaced by bin ids further down,
# so reading train_df would report post-discretization values when this cell is re-run.
for feature in NUMERICAL_FEATURES:
    print(f"AMI for {feature} =", adjusted_mutual_info_score(original_train_df[feature], original_train_df[CLASS_LABEL]))

In [None]:
# MI: top 2 = PublishYear, pagesNumber
# Correlation matrix: top 2 = PublishYear, pagesNumber
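# Candidates to discard: PublishMonth, PublishDay (the drop below is left disabled; both features are discretized further down instead)
#train_df = train_df.drop(['PublishMonth', 'PublishDay'], axis=1)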

In [None]:
# Try discretizing it
from sklearn.preprocessing import KBinsDiscretizer

def discretize(df, feature, discretizer=None, strategy='kmeans', n_bins=5):
    """
    Replaces the column `feature` of `df` with its discretized (binned) values.

    If `discretizer` is None, a new ordinal KBinsDiscretizer with `n_bins` bins and
    the given `strategy` is fitted on `df[[feature]]`; otherwise the supplied,
    already-fitted discretizer is used and `strategy` / `n_bins` are ignored.

    Returns a tuple (transformed copy of `df`, discretizer used). `df` is not modified.
    """
    binner = discretizer
    if binner is None:
        binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy).fit(df[[feature]])

    binned_df = df.copy()
    binned_df[feature] = binner.transform(df[[feature]])

    return binned_df, binner

In [None]:
DISCRETIZATION_STRATEGIES = ['kmeans', 'quantile', 'uniform']

def tune_discretization(X, y, feature, lower, upper):
    """
    Grid-searches the number of bins and the discretisation strategy for `feature`.

    Every n in [lower, upper] is combined with every strategy in
    DISCRETIZATION_STRATEGIES; `X[feature]` is discretised with that setting and
    scored by its adjusted mutual information (AMI) with `y`.

    Returns (max_ami, best_n, best_strategy) for the highest-scoring setting.
    If the range of n is empty, nothing is evaluated and (0, 0, None) is returned.
    """

    max_ami = 0
    best_n = 0
    best_strategy = None

    for n in range(lower, upper + 1):
        for strategy in DISCRETIZATION_STRATEGIES:
            transformed_X, _ = discretize(X, feature, strategy=strategy, n_bins=n)
            ami = adjusted_mutual_info_score(transformed_X[feature], y)
            # AMI may be zero or negative: always accept the first candidate so that a real
            # (n, strategy) pair is returned rather than the (0, None) placeholder
            if best_strategy is None or ami > max_ami:
                max_ami = ami
                best_n = n
                best_strategy = strategy

    return max_ami, best_n, best_strategy

In [None]:
# Largest number of bins to try for each feature.
# Everything in this cell reads original_train_df: the numerical columns of train_df are
# replaced by bin ids in the cells below, so tuning on train_df would change on a re-run.
nbins_upper = {'PublishYear': original_train_df['PublishYear'].nunique(), 'PublishMonth': 12, 'PublishDay': 31, 'pagesNumber': 30}

for feature in NUMERICAL_FEATURES:
    max_ami, best_n, best_strategy = tune_discretization(original_train_df, original_train_df[CLASS_LABEL], feature, 2, nbins_upper[feature])
    print("Feature:", feature)
    print("The number of bins that gives the highest AMI with rating_label is", best_n)
    print("The highest AMI is:", max_ami)
    print("The best strategy is:", best_strategy)
    print("\n")

# All AMI increased slightly after discretization

In [None]:
# Always bin the raw values (restored from original_train_df, row for row), so that re-running
# this cell never fits the discretizer on its own previous output
raw_publish_year = original_train_df['PublishYear'].to_numpy()
train_df, DISCRETIZER_PY = discretize(train_df.assign(PublishYear=raw_publish_year), 'PublishYear', strategy='uniform', n_bins=15)
np.unique(train_df['PublishYear'], return_counts=True)

In [None]:
scatter_vs_rating(train_df, 'PublishYear')

In [None]:
# Always bin the raw values (restored from original_train_df, row for row), so that re-running
# this cell never fits the discretizer on its own previous output
raw_publish_month = original_train_df['PublishMonth'].to_numpy()
train_df, DISCRETIZER_PM = discretize(train_df.assign(PublishMonth=raw_publish_month), 'PublishMonth', strategy='quantile', n_bins=6)
np.unique(train_df['PublishMonth'], return_counts=True)

In [None]:
scatter_vs_rating(train_df, 'PublishMonth')

In [None]:
# Always bin the raw values (restored from original_train_df, row for row), so that re-running
# this cell never fits the discretizer on its own previous output
raw_publish_day = original_train_df['PublishDay'].to_numpy()
train_df, DISCRETIZER_PD = discretize(train_df.assign(PublishDay=raw_publish_day), 'PublishDay', strategy='quantile', n_bins=11)
np.unique(train_df['PublishDay'], return_counts=True)

In [None]:
scatter_vs_rating(train_df, 'PublishDay')

In [None]:
# Always bin the raw values (restored from original_train_df, row for row), so that re-running
# this cell never fits the discretizer on its own previous output
raw_pages_number = original_train_df['pagesNumber'].to_numpy()
train_df, DISCRETIZER_PN = discretize(train_df.assign(pagesNumber=raw_pages_number), 'pagesNumber', strategy='kmeans', n_bins=7)
np.unique(train_df['pagesNumber'], return_counts=True)

In [None]:
scatter_vs_rating(train_df, 'pagesNumber')

## Text Pre-processing

In [None]:
# Vectorise the two free-text features into bigram columns (at most 300 per feature).
# We use a different delimiter for the name and description bigrams, to prevent duplicate column names
# NOTE(review): preprocess_text_feature is defined in another notebook; from its use here it returns
# (feature frame, fitted vectorizer) — confirm against the helper.
train_df_name, VECTORIZER_NAME = preprocess_text_feature(train_df, 'Name', ngram=2, delimiter='_', max_features=300)  # can try ngram=1
train_df_desc, VECTORIZER_DESC = preprocess_text_feature(train_df, 'Description', ngram=2, max_features=300)
# Debug aid: inspect the fitted vocabularies
# print(sorted(VECTORIZER_NAME.vocabulary_))
# print(sorted(VECTORIZER_DESC.vocabulary_))

In [None]:
# Append the vectorised name/description columns, then remove the raw text columns they replace
train_df = pd.concat(
    [train_df.reset_index(drop=True), train_df_name, train_df_desc],
    axis=1,
).drop(columns=TEXT_FEATURES)

## Convert rating_label to categorical class label

In [None]:
# Finally, cast rating_label to a categorical class label and move it to the last column:
# pop() removes the column, and assigning it back appends it after all remaining columns
train_df[CLASS_LABEL] = train_df.pop(CLASS_LABEL).astype('category')

In [None]:
# Column name -> dtype name, needed for reading the CSV later
DTYPE = {column: dtype.name for column, dtype in train_df.dtypes.items()}
# Authors and Publisher are registered under the 'string' dtype
DTYPE.update({'Authors': 'string', 'Publisher': 'string'})

In [None]:
# Save the general transformed df
train_df.to_csv(DATASET_DIR + "/train_df_50.csv", index=False)

In [None]:
# Preview the first rows with the rich DataFrame display instead of printing the whole frame as text
train_df.head()

## Preprocessing: Altogether

In [None]:
def preprocess_test_df(df):
    """
    Applies the already-fitted preprocessing steps to a raw (test) DataFrame.

    Steps, in order:
      1. impute and clean the categorical features (CATEGORICAL_FEATURES);
      2. discretise the four numerical features with the fitted DISCRETIZER_* objects;
      3. vectorise 'Name' and 'Description' with the fitted VECTORIZER_* objects;
      4. drop the raw text columns (TEXT_FEATURES) and, if present, 'Language'.

    Returns the preprocessed DataFrame; the DataFrame passed in is not modified.
    """
    df = df.copy()

    # preprocess string features
    for feature in CATEGORICAL_FEATURES:
        df[feature] = df[feature].fillna(MISSING_CAT_VAL)  # imputation
        df[feature] = df[feature].apply(lambda x: preprocess(str(x), stop_words_removal=False, lemmatize=False, min_word_len=0))

    # discretise the publish-date features and pagesNumber with the previously fitted discretizers
    fitted_discretizers = {
        'PublishDay': DISCRETIZER_PD,
        'PublishMonth': DISCRETIZER_PM,
        'PublishYear': DISCRETIZER_PY,
        'pagesNumber': DISCRETIZER_PN,
    }
    for feature, discretizer in fitted_discretizers.items():
        df = discretize(df, feature, discretizer=discretizer)[0]

    # preprocess text features
    df_name = preprocess_text_feature(df, 'Name', vectorizer=VECTORIZER_NAME, delimiter='_', max_features=300)[0]
    df_desc = preprocess_text_feature(df, 'Description', vectorizer=VECTORIZER_DESC, max_features=300)[0]

    # discard the obsolete original features and unwanted features
    df = pd.concat([df.reset_index(drop=True), df_name, df_desc], axis=1)
    df = df.drop(columns=TEXT_FEATURES)
    # 'Language' is dropped leniently so that frames which no longer carry it are accepted too
    df = df.drop(columns='Language', errors='ignore')

    return df

In [None]:
test_df = preprocess_test_df(original_test_df)
test_df.to_csv(DATASET_DIR + "/test_df.csv", index=False)
test_df

In [None]:
# Now, we keep four versions of the transformed datasets
# a) Original
# b) Using one-hot encoding, with the full set of 'Authors' and 'Publisher' features
# c) Using one-hot encoding, with the chi2-selected set of 'Authors' and 'Publisher' features
# d) Using ordinal encoding
# Each model can choose their own version of dataset
train_df_ohe_full = ohe_transform(train_df, has_labels=True)
train_df_ohe_selected = train_df_ohe_full.drop(columns=insig_cat_features_ohe)  # drop the 'insignificant' OHE features
train_df_oe = ord_enc_transform(train_df)

# Write each encoded variant to its own CSV file under DATASET_DIR
for file_name, frame in [
    ("train_df_ohe_full.csv", train_df_ohe_full),
    ("train_df_ohe_selected.csv", train_df_ohe_selected),
    ("train_df_oe_50.csv", train_df_oe),
]:
    frame.to_csv(DATASET_DIR + "/" + file_name, index=False)

In [None]:
# Build the same encoded variants for the test set (no class labels to move)
test_df_ohe_full = ohe_transform(test_df)
test_df_ohe_selected = test_df_ohe_full.drop(columns=insig_cat_features_ohe)  # drop the 'insignificant' OHE features
test_df_oe = ord_enc_transform(test_df)

# Write each encoded variant to its own CSV file under DATASET_DIR
for file_name, frame in [
    ("test_df_ohe_full.csv", test_df_ohe_full),
    ("test_df_ohe_selected.csv", test_df_ohe_selected),
    ("test_df_oe.csv", test_df_oe),
]:
    frame.to_csv(DATASET_DIR + "/" + file_name, index=False)

# 3. Final predictions on the test set

In [None]:
X_test_ohe = ohe_transform(test_df)
X_test_oe = ord_enc_transform(test_df)

In [None]:
# Map each model to the encoded test set it is given, then collect one prediction array per model
# (in the insertion order of the mapping)
model_sets = {snb: X_test_oe, RBF_svm: X_test_ohe, logr: X_test_ohe, hgb: X_test_oe}
y_test_preds = [model.predict(X_test_encoded) for model, X_test_encoded in model_sets.items()]


In [None]:
y_test_preds