# COMP9727 Recommender Systems 24T2 Assignment

Name: Jinghan Wang  
Student ID: z5286124

In [None]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tabulate import tabulate

In [None]:
# Fetch the NLTK resources used below: the stop-word corpus (stopwords.words)
# and the 'punkt' models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

## Part 1 Topic (Genre) Classification
### Data Preprocessing

In [None]:
# Load the dataset
dictionary = dict()
# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# change the row order (and therefore the cross-validation folds) between runs.
file_list = sorted(os.path.join('dataset', file) for file in os.listdir('dataset') if file.endswith('.tsv'))

# Concatenate all the tsv files into one dataframe
df = pd.concat([pd.read_csv(file, sep='\t') for file in file_list], ignore_index=True)

# Combine all columns into one column 'text'
# NOTE(review): 'Genre' is part of this text and is also the label predicted in
# Part 1, so the classifiers can read the answer from their input - confirm this
# is intended by the assignment.
df['text'] = df['Title'].astype(str) + ' ' + df['Release Year'].astype(str) + ' ' + df['Genre'].astype(str) + ' ' + df['Director'].astype(str) + ' ' + df['Cast'].astype(str) + ' ' + df['Plot'].astype(str) + ' ' + df['Origin/Ethnicity'].astype(str)

# Shared preprocessing resources: one stemmer and the two candidate stop-word sets
ps = PorterStemmer()
nltk_stop_words = set(stopwords.words('english'))
sklearn_stop_words = set(ENGLISH_STOP_WORDS)

def preprocess_text(text, true=False, stop_words=nltk_stop_words):
    """Lower-case, clean, tokenize, filter and stem a document.

    Args:
        text: Raw document string.
        true: When truthy, every character that is neither a word character
            nor whitespace is removed. Otherwise a fixed set of punctuation
            and symbols (quote, period, comma, !, ?, @, #, &, $, %, -, +, *,
            /, =) is kept as well.
        stop_words: Collection of tokens discarded before stemming.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    pattern = r'[^\w\s]' if true else r'[^\w\s\'.,!?@#&$%\-+*/=]'
    cleaned = re.sub(pattern, '', text.lower())
    stems = [ps.stem(token) for token in word_tokenize(cleaned) if token not in stop_words]
    return ' '.join(stems)

### Evaluation of Punctuation Retention in NLP Models

In the tutorial code, the expression `[^\w\s]` is used to remove all characters that are neither whitespace nor word characters (letters, digits and underscore). However, in the context of **NLP**, punctuation can convey specific meanings, and symbols like `$`, `=` etc., have special significance. Removing them entirely can affect the effectiveness of NLP. Therefore, I tested two scenarios using the **MNB** and **BNB** models: one where all punctuation is removed, and one where punctuation is retained. Using _5-fold cross-validation_, the mean and standard deviation of **f1_macro**, **f1_micro**, and **accuracy** were calculated to evaluate the effectiveness in both cases.

#### Scenario 1: Remove all punctuation

In [None]:
# Scenario 1: strip every punctuation character (the `true` flag of preprocess_text)
df['text'] = df['text'].apply(preprocess_text, true=True)

# Shared 5-fold stratified splitter; cross-validation is used instead of a single train-test split
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Bag-of-words pipelines for the two Naive Bayes variants
bnb = Pipeline([('preprocessor', CountVectorizer()), ('classifier', BernoulliNB())])
mnb = Pipeline([('preprocessor', CountVectorizer()), ('classifier', MultinomialNB())])

def get_list(mnb_score, bnb_score):
    """Cross-validate both Naive Bayes pipelines on df['text'] against df['Genre'].

    Each pipeline is fitted once per fold and all three metrics are scored on
    that same fit (previously every metric triggered its own cross-validation).

    Args:
        mnb_score: The MultinomialNB pipeline (despite the name, an estimator).
        bnb_score: The BernoulliNB pipeline (despite the name, an estimator).

    Returns:
        A list of six per-fold score arrays in the order
        [BNB macro F1, MNB macro F1, BNB accuracy, MNB accuracy,
         BNB micro F1, MNB micro F1].
    """
    metrics = ('f1_macro', 'accuracy', 'f1_micro')
    bnb_cv = cross_validate(bnb_score, df['text'], df['Genre'], cv=cv, scoring=metrics)
    mnb_cv = cross_validate(mnb_score, df['text'], df['Genre'], cv=cv, scoring=metrics)

    results = []
    for metric in metrics:
        # cross_validate stores the per-fold test scores under 'test_<metric>'
        results.append(bnb_cv[f'test_{metric}'])
        results.append(mnb_cv[f'test_{metric}'])
    return results

# Cross-validated scores for Scenario 1 (punctuation removed)
remove = get_list(mnb, bnb)

#### Scenario 2: Retain all punctuation

In [None]:
# Rebuild the raw combined text first: Scenario 1 overwrote df['text'] with
# punctuation-free, stemmed text, so re-processing that column here would never
# see any punctuation and the two scenarios would not be comparable.
text_columns = ['Title', 'Release Year', 'Genre', 'Director', 'Cast', 'Plot', 'Origin/Ethnicity']
raw_text = df[text_columns].astype(str).agg(' '.join, axis=1)

# Scenario 2: keep the punctuation/symbol set allowed by preprocess_text
df['text'] = raw_text.apply(lambda text : preprocess_text(text, False))
retain = get_list(mnb, bnb)

In [None]:
def print_scores(first, second, item1, item2, t):
    """Print a grid table of mean cross-validation scores for two variants.

    Args:
        first: Six per-fold score arrays for the first variant, indexed as
            0/1 = macro F1 (BNB/MNB), 2/3 = accuracy (BNB/MNB),
            4/5 = micro F1 (BNB/MNB).
        second: Same layout for the second variant.
        item1: Label shown for rows of the first variant.
        item2: Label shown for rows of the second variant.
        t: Header of the column that holds the variant label.
    """
    # (macro F1, micro F1, accuracy) positions of each model inside a result list
    positions = {"BernoulliNB": (0, 4, 2), "MultinomialNB": (1, 5, 3)}

    def build_rows(results, label):
        return [
            [model, label] + [f"{results[i].mean():.4f}" for i in idx]
            for model, idx in positions.items()
        ]

    table = build_rows(first, item1) + build_rows(second, item2)
    # Group the rows by model name (stable sort keeps first-variant rows on top)
    table.sort(key=lambda x: x[0])

    headers = ["Model", t, "Macro F1 Mean", "Micro F1 Mean", "Accuracy Mean"]
    print(tabulate(table, headers, tablefmt="grid"))

# Compare the two punctuation scenarios side by side
print_scores(remove, retain, "Remove Punctuation", "Retain Punctuation", "Punctuation")

#### Conclusion
From the results, we can draw the following conclusions:

- Retaining punctuation provided a slight improvement in the performance of **BNB**, as seen in the small increase in the mean scores and a slight reduction in the standard deviation.
- For **MNB**, retaining punctuation did not significantly affect performance, as the differences in mean scores and standard deviations were minimal.

Therefore, we will retain punctuation in the following analysis.

### Evaluation of Different Stop Words in NLP Models

One of the primary differences between **NLTK** and **Scikit-learn** in handling stop words lies in the size and scope of their respective stop words lists. This divergence stems from the distinct goals and applications each library is designed to serve.

Size and Scope
- **NLTK**: **NLTK** provides a smaller English list (roughly 180 words) made up mostly of pronouns, prepositions, auxiliary verbs and contractions.
- **Scikit-learn**: **Scikit-learn** ships a larger list (`ENGLISH_STOP_WORDS`, 318 words) that additionally contains number words and many common adverbs and verbs, some of which may still be informative in certain contexts.

The choice of stop words list has a significant impact on the performance of NLP models. In this analysis, I will evaluate the performance of **MNB** and **BNB** models using two different stop words lists: **NLTK** and **Scikit-learn**. Similar to the previous tests, I will retain all punctuation and use *5-fold* *cross-validation* to compute the mean and standard deviation of **f1_macro**, **f1_micro**, and **accuracy** to assess the performance of the two stop words lists.

#### Scenario 1: NLTK Stop Words

In [None]:
# Start again from the raw columns so this scenario is not applied on top of
# text that earlier cells already cleaned and stemmed.
text_columns = ['Title', 'Release Year', 'Genre', 'Director', 'Cast', 'Plot', 'Origin/Ethnicity']
raw_text = df[text_columns].astype(str).agg(' '.join, axis=1)

# Pass the stop-word set by keyword: as the second positional argument it would
# be bound to the punctuation flag `true` (stripping punctuation) and the
# stop_words parameter would silently keep its default.
df['text'] = raw_text.apply(lambda text : preprocess_text(text, stop_words=nltk_stop_words))
nltk_s = get_list(mnb, bnb)

#### Scenario 2: Scikit-learn Stop Words

In [None]:
# Start again from the raw columns: re-processing df['text'] would only see
# text from which the NLTK stop words had already been removed.
text_columns = ['Title', 'Release Year', 'Genre', 'Director', 'Cast', 'Plot', 'Origin/Ethnicity']
raw_text = df[text_columns].astype(str).agg(' '.join, axis=1)

# Pass the stop-word set by keyword: as the second positional argument it would
# be bound to the punctuation flag `true`, leaving the NLTK default in effect.
df['text'] = raw_text.apply(lambda text : preprocess_text(text, stop_words=sklearn_stop_words))
sklearn_s = get_list(mnb, bnb)

In [None]:
# Compare the two stop-word lists side by side
print_scores(nltk_s, sklearn_s, "NLTK Stop Words", "Scikit-learn Stop Words", "Stop Words")

#### Conclusion
From the results, we can draw the following conclusions:

The choice of stop words list (**NLTK** vs. **Scikit-learn**) has a marginal impact on the performance of **BNB** and almost no impact on **MNB**. Given the slight edge in Macro F1 and the negligible difference in other metrics, **BNB** with **NLTK** stop words can be considered marginally better. However, the overall difference is minimal, and both stop words lists are suitable for use with these models in **NLP** tasks.

Therefore, we will use the **NLTK** stop words list in the following analysis.

### Evaluation of Top-N Words in NLP Models

In this analysis, I will evaluate the impact of the number of top words (features) on the performance of **BNB** and **MNB** models. I choose the micro **F1 score** as the key score to evaluate, which can provide a comprehensive measure by accounting for each instance equally, ensuring a robust evaluation across all classes in the context of **Top-N** feature selection.

I will vary the number of top words from $1$ to $50,000$ in increments of $1000$ and evaluate the performance of the models using *5-fold cross-validation*. The mean micro **F1 score** will be calculated for each model at different **Top-N** values to determine the optimal number of features for the models.

In [None]:
meanBNB = []
meanMNB = []

# Rebuild the raw combined text before applying the chosen preprocessing
# (punctuation retained, NLTK stop words - the defaults of preprocess_text);
# df['text'] still holds the output of the previous experiment.
text_columns = ['Title', 'Release Year', 'Genre', 'Director', 'Cast', 'Plot', 'Origin/Ethnicity']
raw_text = df[text_columns].astype(str).agg(' '.join, axis=1)
df['text'] = raw_text.apply(preprocess_text)

# Coarse sweep over the vocabulary size: 1, 1001, 2001, ..., 49001
r = range(1, 50000, 1000)

for k in r:
    bnb = Pipeline(steps=[('preprocessor', CountVectorizer(max_features=k)), ('classifier', BernoulliNB())])
    mnb = Pipeline(steps=[('preprocessor', CountVectorizer(max_features=k)), ('classifier', MultinomialNB())])

    # Mean micro F1 across the folds for each model at this vocabulary size
    meanBNB.append(cross_val_score(bnb, df['text'], df['Genre'], cv=cv, scoring='f1_micro').mean())
    meanMNB.append(cross_val_score(mnb, df['text'], df['Genre'], cv=cv, scoring='f1_micro').mean())

In [None]:
# Mean micro F1 of both models against the vocabulary size
plt.figure(figsize=(14, 8))
for series, model_name in ((meanBNB, 'BNB'), (meanMNB, 'MNB')):
    plt.plot(r, series, label=model_name)
plt.xlabel('Max_Features')
plt.ylabel('Mean F1 Score')
plt.legend()
plt.grid(True)

According to the plots, it appears that higher scores are achieved when the maximum features are between $1$ and $2000$. To get more precise metrics within this range, we will conduct an additional test with increments of $100$.

In [None]:
# Finer sweep over the vocabulary size: 1, 101, 201, ..., 1901
r = range(1, 2000, 100)
meanBNB, meanMNB = [], []

for k in r:
    bnb = Pipeline([('preprocessor', CountVectorizer(max_features=k)), ('classifier', BernoulliNB())])
    mnb = Pipeline([('preprocessor', CountVectorizer(max_features=k)), ('classifier', MultinomialNB())])

    # Record the mean micro F1 across the folds, BNB first and then MNB
    for pipeline, means in ((bnb, meanBNB), (mnb, meanMNB)):
        fold_scores = cross_val_score(pipeline, df['text'], df['Genre'], cv=cv, scoring='f1_micro')
        means.append(fold_scores.mean())

In [None]:
# Mean micro F1 of both models for the finer vocabulary-size sweep
plt.figure(figsize=(14, 8))
for series, model_name in ((meanBNB, 'BNB'), (meanMNB, 'MNB')):
    plt.plot(r, series, label=model_name)
plt.xlabel('Max_Features')
plt.ylabel('Mean F1 Score')
plt.legend()
plt.grid(True)

### Analysis of Optimal Max Features

From the additional testing conducted with `max_features` ranging from $1$ to $2000$ in increments of $100$, we observe the following trends:

- **Bernoulli Naive Bayes (BNB)**:
  - The mean micro F1 score for BNB peaks at around 500 max features.
  - Beyond 500 max features, the scores tend to gradually decline, indicating that including more features does not necessarily improve model performance and may even introduce noise.

- **Multinomial Naive Bayes (MNB)**:
  - The mean micro F1 score for MNB also performs well around 500 max features, although it is slightly lower than its highest score observed.
  - This suggests that while MNB can potentially benefit from slightly more features, the performance gain is marginal beyond 500 features.

### Conclusion

Given these observations, we can approximate that limiting the model to the top $500$ most frequent words is a reasonable and effective design choice. This balance helps maintain high model performance while avoiding the inclusion of excessive and potentially noisy features.

### Evaluating BNB and MNB Models
#### Cross-Validated Scores with the Top 500 Features

In [None]:
# BernoulliNB on the 500 most frequent terms
bnb = Pipeline(steps=[('preprocessor', CountVectorizer(max_features=500)), ('classifier', BernoulliNB())])

# One cross-validation pass scores all three metrics on the same fitted models,
# instead of refitting the pipeline once per metric.
bnb_cv = cross_validate(bnb, df['text'], df['Genre'], cv=cv, scoring=('f1_micro', 'f1_macro', 'accuracy'))
bnb_score_micro = bnb_cv['test_f1_micro']
bnb_score_macro = bnb_cv['test_f1_macro']
bnb_score_acc = bnb_cv['test_accuracy']

In [None]:
# MultinomialNB on the 500 most frequent terms
mnb = Pipeline(steps=[('preprocessor', CountVectorizer(max_features=500)), ('classifier', MultinomialNB())])

# One cross-validation pass scores all three metrics on the same fitted models,
# instead of refitting the pipeline once per metric.
mnb_cv = cross_validate(mnb, df['text'], df['Genre'], cv=cv, scoring=('f1_micro', 'f1_macro', 'accuracy'))
mnb_score_micro = mnb_cv['test_f1_micro']
mnb_score_macro = mnb_cv['test_f1_macro']
mnb_score_acc = mnb_cv['test_accuracy']

In [None]:
def print_plot(score1, score2, t):
    """Plot per-fold scores of BNB and MNB on the current axes.

    Args:
        score1: Per-fold scores of the BNB model (drawn in red).
        score2: Per-fold scores of the MNB model (drawn in blue).
        t: Title of the plot.
    """
    plt.plot(score1, color='r', label='BNB')
    plt.plot(score2, color='b', label='MNB')
    plt.xlabel('Fold')
    plt.ylabel('Score')  # was misspelled 'Scode'
    plt.title(t)
    # Dashed horizontal lines mark each model's mean across the folds
    plt.axhline(y=score1.mean(), linestyle='--', color='r', label='BNB mean')
    plt.axhline(y=score2.mean(), linestyle='--', color='b', label='MNB mean')
    plt.grid(True)
    plt.legend()

# One panel per metric, laid out on a 2x2 grid
plt.figure(figsize=(14, 8))
panels = [
    (bnb_score_micro, mnb_score_micro, 'Comparison of BNB and MNB Models on F1 Micro Score'),
    (bnb_score_macro, mnb_score_macro, 'Comparison of BNB and MNB Models on F1 Macro Score'),
    (bnb_score_acc, mnb_score_acc, 'Comparison of BNB and MNB Models on Accuracy'),
]
for position, (bnb_scores, mnb_scores, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    print_plot(bnb_scores, mnb_scores, panel_title)

plt.tight_layout()
plt.show()

#### Conclusion
Based on the plots and the data in the table, we can observe that Bernoulli Naive Bayes (**BNB**) scores are generally higher than those of Multinomial Naive Bayes (**MNB**) across various metrics, both for most subsets and on average.

Hypothesis on **BNB** vs. **MNB** Performance

- Bernoulli Naive Bayes:   
This model works with binary features, meaning it only considers whether a word is present in a document, not how often it appears.  This binary approach is particularly effective when the mere presence of certain keywords strongly indicates the class, regardless of their frequency.
- Multinomial Naive Bayes:   
This model relies on the frequency of words.  While this can be beneficial when the frequency of terms is a strong class indicator, it might be less effective when the presence of terms is more important than their frequency.

### New Model: Gradient Boosting Machine (GBM)

Advantages:
- It often results in better performance compared to random forests and individual decision trees.
- Effective in handling various types of data including categorical and numerical.

When working with Gradient Boosting Machines (**GBM**), one crucial hyperparameter to tune is the number of boosting stages, specified by n_estimators. Increasing n_estimators generally enhances model performance by allowing the model to learn more complex patterns through additional combinations. However, too many boosting stages can lead to overfitting, where the model performs well on training data but poorly on unseen data.

To find an optimal value for n_estimators, we can evaluate the model’s performance (using metrics such as the **micro F1 score**) across a range of boosting stages from $1$ to $181$ in increments of $20$ (`range(1, 201, 20)`).

In [None]:
estimator_list = []
# Candidate numbers of boosting stages: 1, 21, 41, ..., 181
estimator_range = range(1, 201, 20)

for i in estimator_range:
    # Fix the seed so the sweep is reproducible; the CV splitter is already seeded
    gbm = Pipeline(steps=[('preprocessor', CountVectorizer(max_features=500)), ('classifier', GradientBoostingClassifier(n_estimators=i, random_state=42))])
    estimator_list.append(cross_val_score(gbm, df['text'], df['Genre'], cv=cv, scoring='f1_micro').mean())

# Mean micro F1 against the number of boosting stages
plt.figure(figsize=(14, 8))
plt.plot(estimator_range, estimator_list, label='GBM')
plt.xlabel('Estimators')
plt.ylabel('Mean F1 Score')
plt.legend()
plt.grid(True)

Based on the incremental tests, the Gradient Boosting Machine (**GBM**) achieves its best performance when `n_estimators` is around $60$ (the closest tested value is $61$). Around this value the scores remain consistently high, and beyond it the performance starts to decline. Therefore, we will select `n_estimators=60` as the hyperparameter for **GBM** and compare its performance with **BNB** and **MNB**.


In [None]:
# GBM on the 500 most frequent terms; the seed makes the fitted trees reproducible
gbm = Pipeline(steps=[('preprocessor', CountVectorizer(max_features=500)), ('classifier', GradientBoostingClassifier(n_estimators=60, random_state=42))])

# One cross-validation pass scores all three metrics on the same fitted models,
# instead of training the (slow) boosting model once per metric.
gbm_cv = cross_validate(gbm, df['text'], df['Genre'], cv=cv, scoring=('f1_micro', 'f1_macro', 'accuracy'))
gbm_score_micro = gbm_cv['test_f1_micro']
gbm_score_macro = gbm_cv['test_f1_macro']
gbm_score_acc = gbm_cv['test_accuracy']

In [None]:
def print_plot(score1, score2, score3, t):
    """Plot per-fold scores of BNB, MNB and GBM on the current axes.

    Args:
        score1: Per-fold scores of the BNB model (drawn in red).
        score2: Per-fold scores of the MNB model (drawn in blue).
        score3: Per-fold scores of the GBM model (drawn in green).
        t: Title of the plot.
    """
    plt.plot(score1, color='r', label='BNB')
    plt.plot(score2, color='b', label='MNB')
    plt.plot(score3, color='g', label='GBM')
    plt.xlabel('Fold')
    plt.ylabel('Score')  # was misspelled 'Scode'
    plt.title(t)
    # Dashed horizontal lines mark each model's mean across the folds
    plt.axhline(y=score1.mean(), linestyle='--', color='r', label='BNB mean')
    plt.axhline(y=score2.mean(), linestyle='--', color='b', label='MNB mean')
    plt.axhline(y=score3.mean(), linestyle='--', color='g', label='GBM mean')
    plt.grid(True)
    plt.legend()

# One panel per metric, laid out on a 2x2 grid
plt.figure(figsize=(14, 8))
panels = [
    (bnb_score_micro, mnb_score_micro, gbm_score_micro, 'Comparison of GBM, BNB and MNB Models on F1 Micro Score'),
    (bnb_score_macro, mnb_score_macro, gbm_score_macro, 'Comparison of GBM, BNB and MNB Models on F1 Macro Score'),
    (bnb_score_acc, mnb_score_acc, gbm_score_acc, 'Comparison of GBM, BNB and MNB Models on Accuracy'),
]
for position, (bnb_scores, mnb_scores, gbm_scores, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    print_plot(bnb_scores, mnb_scores, gbm_scores, panel_title)

plt.tight_layout()
plt.show()

#### Conclusion
The results indicate that the **GBM** model scores are significantly higher than both the **MNB** and **BNB** models.

Hypothesis on the **GBM** Excellent Performance 
- Boosting Mechanism:   
**GBM** builds models sequentially, where each new model attempts to correct the errors of the previous ones.  This iterative approach allows **GBM** to handle complex patterns in the data more effectively than a single model.
- Combination of Weak Learners:   
By combining multiple weak learners, **GBM** creates a strong predictive model that reduces both bias and variance.

## Part 2. Recommendation Methods
### Data Preprocessing
Use **NLTK** stop words and retain all punctuation. Preprocess the text data by converting it to lowercase, removing stop words, and stemming the words.

In [None]:
ps = PorterStemmer()
# Must be a flat set of words: wrapping the set in a list makes every
# `word not in nltk_stop_words` test true, so no stop word is ever removed.
nltk_stop_words = set(stopwords.words('english'))

# Load the dataset
dictionary = dict()

# Split the data into training and validation sets:
# the first 150 rows of every file are used for training, the rest for validation
train = []
valid = []
# Sorted so the row order does not depend on the arbitrary order of os.listdir()
for file in sorted(os.listdir('dataset')):
    if file.endswith('.tsv'):
        content = pd.read_csv(os.path.join('dataset', file), sep='\t')
        train.append(content.iloc[:150])
        valid.append(content.iloc[150:])

train = pd.concat(train, ignore_index=True)
valid = pd.concat(valid, ignore_index=True)

# Combine all columns of each movie into one document
train['text'] = train['Title'].astype(str) + ' ' + train['Release Year'].astype(str) + ' ' + train['Genre'].astype(str) + ' ' + train['Director'].astype(str) + ' ' + train['Cast'].astype(str) + ' ' + train['Plot'].astype(str) + ' ' + train['Origin/Ethnicity'].astype(str)

valid['text'] = valid['Title'].astype(str) + ' ' + valid['Release Year'].astype(str) + ' ' + valid['Genre'].astype(str) + ' ' + valid['Director'].astype(str) + ' ' + valid['Cast'].astype(str) + ' ' + valid['Plot'].astype(str) + ' ' + valid['Origin/Ethnicity'].astype(str)

def preprocess_text(text):
    """Lower-case, clean, tokenize, filter and stem a document.

    Characters other than word characters, whitespace and a fixed set of
    punctuation/symbols are removed; tokens found in the module-level
    ``nltk_stop_words`` are dropped and the rest are Porter-stemmed.

    Args:
        text: Raw document string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^\w\s\'.,!?@#&$%\-+*/=]', '', text.lower())
    stems = [ps.stem(token) for token in word_tokenize(cleaned) if token not in nltk_stop_words]
    return ' '.join(stems)

# Replace the raw combined text of both splits with its preprocessed form
train['text'] = train['text'].apply(preprocess_text)
valid['text'] = valid['text'].apply(preprocess_text)

In [None]:
# Fit the TF-IDF vocabulary on the training documents only, then reuse it
# to vectorize the validation documents.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train['text'])
X_test = vectorizer.transform(valid['text'])

# Store one TF-IDF row per movie next to its metadata
# (create_user_profile reads this column via .toarray()).
train['Vector'] = list(X_train)
valid['Vector'] = list(X_test)

### Building Model (Gradient Boosting Classifier)

Use the hyperparameter selected in the previous section:
`n_estimators = 60` 

In [None]:
n=60
# Seed the boosting model so the predicted genres (and everything derived
# from them below) are reproducible between runs.
model = Pipeline(steps=[('preprocessor', vectorizer), ('classifier', GradientBoostingClassifier(n_estimators=n, random_state=42))])
model.fit(train['text'], train['Genre'])

# Predicted genre of every movie; the recommender groups movies by this column
train['pred'] = model.predict(train['text'])
valid['pred'] = model.predict(valid['text'])

### Processing the User Data

This code constructs a user profile by generating a dictionary that contains the term frequency vectors for each category the user likes. Specifically, the code filters documents in each category that match the user’s keywords, calculates the average TF-IDF vector for these relevant documents, and stores these average vectors in the user profile.

In [None]:
def create_user_profile(user, data):
    """Build one averaged TF-IDF vector per genre the user is interested in.

    Args:
        user: DataFrame with the columns 'Genre' and 'Keywords'
            (space-separated keywords per genre).
        data: DataFrame with the columns 'pred' (predicted genre), 'text'
            (preprocessed document) and 'Vector' (TF-IDF row of the document).

    Returns:
        A dict mapping each genre that has at least one predicted document to
        the mean TF-IDF vector of the documents containing any of the user's
        keywords, or to a zero vector when no document matches.
    """
    user_profile = {}
    for _, dframe in user.iterrows():
        genre_docs = data[data['pred'] == dframe['Genre']]
        if genre_docs.empty:
            continue

        # The documents are lower-cased and stemmed, so the keywords must go
        # through the same preprocessing or words such as 'Romantic' could never
        # match. Building the set once also avoids re-splitting per word.
        keywords = set(preprocess_text(dframe['Keywords']).split())
        matches = genre_docs['text'].apply(lambda doc: not keywords.isdisjoint(doc.split()))
        relevant_docs = genre_docs[matches]

        if relevant_docs.empty:
            # No matching document: fall back to a zero vector of vocabulary size
            user_profile[dframe['Genre']] = np.zeros(vectorizer.transform(['']).shape[1])
        else:
            vectors = np.array([vec.toarray().flatten() for vec in relevant_docs['Vector']])
            user_profile[dframe['Genre']] = np.mean(vectors, axis=0)

    return user_profile

In [None]:
# Read the two provided user files (tab-separated, no header row) and
# derive a genre -> profile-vector mapping for each from the training split.
user_frames = [
    pd.read_csv(os.path.join('./', file_name), sep='\t', header=None, names=['Genre', 'Keywords'])
    for file_name in ('user1.tsv', 'user2.tsv')
]
user1, user2 = user_frames

user1_profile, user2_profile = [create_user_profile(frame, train) for frame in user_frames]

In [None]:
# Hand-written third user. The 'drama' keywords are deliberately unrelated to
# the genre (see the discussion below the top-word listings).
user3_keywords = {
    'comedy': 'funny laugh joke humor',
    'drama': 'Ghastly Horrifying Creepy Haunted ghastly',
    'horror': 'ghost scary horror fear',
    'romance': 'romantic love kiss relationship',
    # Genre labels are compared with ==, so the key must match the dataset's
    # lower-case 'sci-fi'; 'sci-Fi' silently dropped this genre from the profile.
    'sci-fi': 'space future technology alien',
    'thriller': 'suspense mystery crime detective',
}

user3_keywords_df = pd.DataFrame(list(user3_keywords.items()), columns=['Genre', 'Keywords'])
user3_profile = create_user_profile(user3_keywords_df, train)

In [None]:
def get_top_words(vec, tfidf_vector, top=20):
    """Return the highest-weighted terms of a TF-IDF vector.

    Args:
        vec: Fitted vectorizer exposing ``get_feature_names_out()``.
        tfidf_vector: 1-D array of term weights aligned with the feature names.
        top: Number of terms to return.

    Returns:
        A list of ``(term, weight)`` pairs ordered by descending weight.
    """
    names = vec.get_feature_names_out()
    # argsort is ascending, so reverse it before taking the first `top` positions
    best_positions = np.argsort(tfidf_vector)[::-1][:top]
    return [(names[position], tfidf_vector[position]) for position in best_positions]


def print_top_words(a, index):
    """Print the top-weighted terms of every genre in a user profile.

    Args:
        a: Dict mapping a genre to its profile TF-IDF vector.
        index: User number shown in the heading.
    """
    print(f"User {index} Top 20 Word")
    for g, profile_vector in a.items():
        ranked_terms = [str(term) for term, _ in get_top_words(vectorizer, profile_vector)]
        print(f"Genre: {g}\n\t", end='')
        print(', '.join(ranked_terms))

In [None]:
# Top profile terms per genre for user 1
print_top_words(user1_profile, 1)

In [None]:
# Top profile terms per genre for user 2
print_top_words(user2_profile, 2)

In [None]:
# Top profile terms per genre for the hand-written user 3
print_top_words(user3_profile, 3)

#### Conclusion of Top 20 Words
Upon examining the top 20 words, I find them relatively reasonable. Firstly, keywords related to the themes are present in the results. There are also words like “hi,” which appear prominently across multiple themes. After associating these words with their respective themes, I believe they are correctly included.

Validating the System with User 3

I deliberately added some words to User 3’s profile that are either unrelated or even contradictory to the themes (for the drama genre, I added five contradictory words). These words did not appear in the results, which I find appropriate. It indicates that the system does not overly prioritize words just because they are in the user’s interest list, ensuring the relevance of the recommendations.

### Choosing and Justifying Metrics

To evaluate the performance of the recommendation method, we should consider the following metrics:

1.	Precision@N: Measures the proportion of recommended movies in the top N that the user actually likes. 
2.	Recall@N: Measures the proportion of all movies that the user likes which are included in the top N recommendations. 
3.	F1-Score@N: The harmonic mean of Precision@N and Recall@N. This gives a balanced measure of both precision and recall.
4.	Hit Rate: The proportion of users who liked at least one movie in the top N recommendations. 
5.	Diversity: Measures the variety of genres or types of movies in the recommendations. 

Choosing the Value of N

N should be chosen based on how the movies are presented to the user. Considering the variety of movies and the need to get useful feedback, we should choose a value for N that balances between providing enough options for the user to like some movies, but not overwhelming them. A reasonable choice might be to show N = 10 movies per genre. This gives the user a diverse set of options while still keeping the list manageable.

Evaluation Method

We will evaluate the performance of the recommendation method by testing how well the top N movies suggested for Week 4 match the interests of each user. Assume each user likes all and only those movies in the top N recommendations that matched their profile for the predicted genre.

In [None]:
def recommend_movies(user_profile, test_data, vec, N=10):
    """Recommend the N movies most similar to the user's genre profiles.

    Every movie is scored once, against the profile vector of its predicted
    genre, and the score is attached to the row before the genres are combined.
    The earlier version recomputed the scores after combining, which failed
    when a profile genre had no movies in ``test_data``.

    Args:
        user_profile: Dict mapping a genre to its profile TF-IDF vector.
        test_data: DataFrame with the columns 'pred', 'text' and 'Title'.
        vec: Fitted vectorizer used to transform the 'text' column.
        N: Number of recommendations to return.

    Returns:
        The top-N rows of ``test_data`` with an added 'similarity' column,
        ordered by descending similarity. Empty when no movie is predicted to
        belong to any genre of the profile.
    """
    scored_by_genre = []

    for genre, profile_vector in user_profile.items():
        genre_movies = test_data[test_data['pred'] == genre]
        if genre_movies.empty:
            continue
        genre_tfidf = vec.transform(genre_movies['text'])
        similarities = cosine_similarity(profile_vector.reshape(1, -1), genre_tfidf).flatten()
        # Best match first, so de-duplication below keeps a genre's best copy
        scored = genre_movies.assign(similarity=similarities)
        scored_by_genre.append(scored.sort_values(by='similarity', ascending=False))

    if not scored_by_genre:
        # pd.concat([]) raises; return an empty frame with the expected columns
        return test_data.iloc[0:0].assign(similarity=pd.Series(dtype=float))

    # Combine all genre recommendations and remove duplicate titles
    combined_recommendations = pd.concat(scored_by_genre).drop_duplicates(subset=['Title'])

    # Sort by similarity score (descending order) and select top N
    return combined_recommendations.sort_values(by='similarity', ascending=False).head(N)

# Top-10 recommendations from the validation split for each user profile
user1_recommendations = recommend_movies(user1_profile, valid, vectorizer)
user2_recommendations = recommend_movies(user2_profile, valid, vectorizer)
user3_recommendations = recommend_movies(user3_profile, valid, vectorizer)

In [None]:
def print_recommendations(recommendations):
    """Print one indented '(genre) title ,' line per recommended movie.

    Args:
        recommendations: DataFrame with the columns 'Genre' and 'Title'.
    """
    for genre, title in zip(recommendations['Genre'], recommendations['Title']):
        print(f"\t({genre}) {title} ,")

In [None]:
# Recommended titles for user 1
print("User 1 Recommendations:")
print_recommendations(user1_recommendations)

In [None]:
# Recommended titles for user 2
print("User 2 Recommendations:")
print_recommendations(user2_recommendations)

In [None]:
# Recommended titles for user 3
print("User 3 Recommendations:")
print_recommendations(user3_recommendations)

We evaluate the performance of the movie recommendation system by calculating precision, recall, F1-score, and hit rate. This evaluation is based on user feedback simulated using randomly generated ‘liked’ labels for the movies in the validation dataset.

In [None]:
# Simulated feedback: each validation movie is liked (1) or not (0) with equal
# probability. A seeded generator keeps the reported metrics reproducible.
valid['liked'] = np.random.default_rng(42).choice([0, 1], size=len(valid))

def evaluate_recommendations(user_profile, va, vec, N=10):
    """Score the per-genre top-N recommendations against the 'liked' labels.

    Args:
        user_profile: Dict mapping a genre to its profile TF-IDF vector.
        va: DataFrame with the columns 'pred', 'text' and 'liked'.
        vec: Fitted vectorizer used to transform the 'text' column.
        N: Number of recommendations considered per genre.

    Returns:
        A tuple ``(precision, recall, f1, hit_rate)``. Precision, recall and F1
        are averaged over the genres that have at least one predicted movie
        (0.0 when there is none); the hit rate is the share of profile genres
        whose top N contains at least one liked movie.
    """
    precision_scores = []
    recall_scores = []
    f1_scores = []
    hit_rate_count = 0

    for genre, profile_vector in user_profile.items():
        genre_movies = va[va['pred'] == genre]
        if genre_movies.empty:
            continue

        genre_tfidf = vec.transform(genre_movies['text'])
        similarities = cosine_similarity(profile_vector.reshape(1, -1), genre_tfidf).flatten()
        # Positions of the N highest similarities, best first
        top_n_indices = similarities.argsort()[-N:][::-1]
        top_n_movies = genre_movies.iloc[top_n_indices]

        liked_movies = top_n_movies[top_n_movies['liked'] == 1]
        total_liked = int((genre_movies['liked'] == 1).sum())

        precision = len(liked_movies) / N
        # A genre without any liked movie has nothing to recall; avoid dividing by zero
        recall = len(liked_movies) / total_liked if total_liked else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

        if not liked_movies.empty:
            hit_rate_count += 1

    # np.mean([]) would yield NaN with a warning, so report 0.0 when no genre was scored
    precision_avg = np.mean(precision_scores) if precision_scores else 0.0
    recall_avg = np.mean(recall_scores) if recall_scores else 0.0
    f1_avg = np.mean(f1_scores) if f1_scores else 0.0
    hit_rate = hit_rate_count / len(user_profile) if user_profile else 0.0

    return precision_avg, recall_avg, f1_avg, hit_rate

In [None]:
# Evaluate all three profiles first, then report them in user order
user_metrics = [
    evaluate_recommendations(profile, valid, vectorizer, N=10)
    for profile in (user1_profile, user2_profile, user3_profile)
]
(
    (precision1, recall1, f1_1, hit_rate1),
    (precision2, recall2, f1_2, hit_rate2),
    (precision3, recall3, f1_3, hit_rate3),
) = user_metrics

for user_no, (prec, rec, f1_value, hits) in enumerate(user_metrics, start=1):
    print(f"User {user_no} - Precision@10: {prec}, Recall@10: {rec}, F1-Score@10: {f1_value}, Hit Rate: {hits}")

### Analysis of Recommendation Performance

According to the results, we can analyze the performance of the recommendation method for each user based on the following metrics:
1. **Precision@10**: 
   - This metric indicates the proportion of recommended movies in the top 10 that the users actually liked.
   - User 1 has a precision of 0.48, meaning 48% of the recommended movies were liked by the user.
   - User 2 has a precision of 0.5, meaning 50% of the recommended movies were liked by the user.
   - User 3 has a precision of 0.58, meaning 58% of the recommended movies were liked by the user.

2. **Recall@10**: 
   - This metric indicates the proportion of all movies that the users liked which are included in the top 10 recommendations.
   - Recall values for all users are relatively low, indicating that a small fraction of the movies they liked was captured in the top 10 recommendations.

3. **F1-Score@10**: 
   - The F1-Score is the harmonic mean of precision and recall, providing a balance between the two.
   - The F1-Scores are low for all users, reflecting the low recall despite the higher precision values.

4. **Hit Rate**: 
   - In this evaluation, the hit rate is the proportion of genres in a user's profile for which at least one of the top 10 recommended movies was liked.
   - The hit rate is 1.0 for all users, indicating that every genre of every user had at least one liked movie among its recommendations, which is a positive sign.


#### Comparison and Analysis of User Data

1. **User 1 and User 3**:
   - **Data Breadth**: Both User 1 and User 3 provided a wide range of genres and lots of keywords for each genre.
   - **Keyword Coverage**: The extensive keywords increased the likelihood of matching words in the top 20 lists, making it easier to infer the user's preferences accurately.
   - **Resilience to Noise**: Despite the presence of misleading keywords in User 3's drama category, the other relevant keywords still allowed the system to infer the user's preferences effectively.
   - **Impact on Recommendations**: The broader range of data and keywords allowed the recommendation system to hit more of the user’s liked movies, resulting in higher precision and recall.

2. **User 2**:
   - **Limited Scope**: User 2 provided very few genres and keywords, which restricted the data available for generating recommendations.
   - **Keyword Scarcity**: The limited number of keywords made it difficult for the system to accurately identify user preferences, as higher frequency words in other genres could easily overshadow the sparse keywords provided.
   - **Hit Rate**: Despite the lower precision and recall, the system was still able to recommend at least one movie that matched the user's preferences, resulting in a hit rate of 1.0.

#### Summary

- The **precision** values (0.48–0.58) indicate that about half of the recommended movies were liked by the users. Because the ‘liked’ labels are generated at random with equal probability, this is close to what chance alone would produce.
- The **recall** values are low, suggesting that the recommendations do not cover a significant portion of all the movies that users liked.
- The **F1-Scores** reflect the imbalance between precision and recall, highlighting the need to improve the coverage of the recommendations.
- The **hit rate** being 1.0 means that users are finding at least one movie they like in the recommendations.

## Part 3. User Evaluation

I will randomly select 200 entries from the training data and divide them into four groups of 50 entries each, so that each group contains a mix of genres. Then, I will generate a table with all movie information for my friend to evaluate, and record which movies they like or dislike.

Randomly Select 200 Entries from train_data, Divide into Four Groups:

In [None]:
# Shuffle with a fixed seed: the ratings recorded below refer to specific
# titles, so the four weekly groups must be the same on every run.
# NOTE(review): the recorded ratings were presumably collected for an earlier,
# unseeded shuffle - verify that the rated titles still fall in weeks 1-3.
shuffled_train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Four consecutive groups of 50 movies (weeks 1-4)
w1 = shuffled_train.iloc[:50]
w2 = shuffled_train.iloc[50:100]
w3 = shuffled_train.iloc[100:150]
w4 = shuffled_train.iloc[150:200]

In [None]:
# Titles shown to the evaluator in week 1
print(w1['Title'].tolist())

In [None]:
# Titles shown to the evaluator in week 2
print(w2['Title'].tolist())

In [None]:
# Titles shown to the evaluator in week 3
print(w3['Title'].tolist())

My friend evaluates each movie and marks it as liked (1) or disliked (0)

In [None]:
# Recorded feedback per week: title -> 1 (liked) or 0 (disliked).
# Keys are compared with the 'Title' column by exact equality in the
# preprocessing cell, so they are kept exactly as written here.
# Only five titles per week are rated; the preprocessing cell initialises
# 'liked' to 0 for every other movie.
w1_p = {
    'Barrens, TheThe Barrens': 0, 'Santa Claus Conquers the Martians': 1, 'Brake': 0, "Perrier's Bounty": 1, 'All That Heaven Allows':0
}

w2_p = {
    'Where the Wild Things Are':1, 'The Next Three Days':1, 'Somewhere Slow':0, ' The Scarecrow':1, 'Case Closed: Captured in Her Eyes':0
}

w3_p = {
    'Crazy !C.R.A.Z.Y.': 1, 'Piranha 3DD': 0, 'Circle': 1, 'Pink and Gray': 0, 'The Secret Life of Pets': 0
}

### Data Preprocessing

In [None]:
# Weeks 1-3 form the training set; every movie starts as "not liked"
train_data = pd.concat([w1, w2, w3])
train_data['liked'] = 0

# Apply the recorded ratings week by week (a later week overrides an earlier
# one if the same title was rated twice)
for ratings in (w1_p, w2_p, w3_p):
    for title, rating in ratings.items():
        train_data.loc[train_data['Title'] == title, 'liked'] = rating

train_data['liked'] = train_data['liked'].astype(int)

# Document columns used in this part (no 'Release Year')
part3_columns = ['Title', 'Genre', 'Director', 'Cast', 'Plot', 'Origin/Ethnicity']

train_data['text'] = train_data[part3_columns].astype(str).agg(' '.join, axis=1)
train_data['text'] = train_data['text'].apply(preprocess_text)

# Week 4 is held out for the recommendations
week4_data = w4.copy()
week4_data['text'] = week4_data[part3_columns].astype(str).agg(' '.join, axis=1)
week4_data['text'] = week4_data['text'].apply(preprocess_text)

### Building Model

In [None]:
# Fit a fresh TF-IDF vocabulary on the rated movies of weeks 1-3
# (this rebinds the module-level `vectorizer` used in Part 2).
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['text'])

In [None]:
# Liked / not-liked classifier on the TF-IDF features; seeded so the
# week-4 predictions are reproducible.
model = Pipeline(steps=[('classifier', GradientBoostingClassifier(n_estimators=60, random_state=42))])
model.fit(X_train, train_data['liked'])
X_week4 = vectorizer.transform(week4_data['text'])

# NOTE(review): predict() returns hard 0/1 labels, so ranking by this column
# cannot order movies within the same label - consider ranking by a score.
week4_predictions = model.predict(X_week4)
week4_data['predicted_liked'] = week4_predictions


In [None]:
def evaluate_recommendations(recommendations, N=10):
    """Print the N rows of *recommendations* with the largest 'predicted_liked'.

    Args:
        recommendations: DataFrame with 'predicted_liked', 'Genre' and
            'Title' columns.
        N: number of rows to print.

    Returns:
        None. A header line is printed, followed by one tab-indented
        "(genre) title" line per selected row.
    """
    best = recommendations.nlargest(N, 'predicted_liked')
    print("Top N Recommendations:")
    for genre, title in zip(best['Genre'], best['Title']):
        print(f"\t({genre}) {title}")

# Show the ten week-4 movies with the highest 'predicted_liked' values.
evaluate_recommendations(recommendations=week4_data, N=10)

| Information                             | Love or not |
|-----------------------------------------|-------------|
| (romance) All That Heaven Allows        | Yes         |
| (sci-fi) War for the Planet of the Apes | No          |
| (drama) Call Me by Your Name            | No          |
| (horror) Nurse 3D                       | No          |
| (romance) Loving Couples                | Yes         |
| (comedy) House of Wolves                | No          |
| (romance) Picture Perfect               | Yes         |
| (sci-fi) Looker                         | No          |
| (sci-fi) The Space Between Us           | Yes         |
| (drama) Red Amnesia                     | Yes         |

The friend liked 5 of the 10 recommended movies, i.e. a 50% hit rate. When interpreting this result, several factors should be considered:

- Subjective Bias: The friend’s personal biases and subjective opinions might have influenced the evaluations, which is an expected variability in human preferences.
- Sample Size: The sample size of 200 movies is a good starting point, but larger datasets would provide more robust training and more accurate predictions.
- Model Training: With a large enough amount of data, the GBM model can indeed predict people’s preferences or inclinations. The larger and more diverse the training data, the better the model can generalize and make accurate predictions.

#### Conclusion

The study highlights the potential of the GBM model in predicting user preferences, even with a relatively small dataset and the inherent subjectivity of human evaluators. By expanding the dataset and including more diverse user evaluations, the model’s accuracy can be significantly improved. This suggests that GBM, with enough data, can be a powerful tool in recommendation systems, capable of predicting user likes and dislikes with considerable accuracy.​⬤