# Content-based Filtering

## Dependencies

In [1]:
# preprocessing
import pandas as pd
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.data.path.append("/home/albot/coding/repos/Machine-learning-AI24/data/movielens/nltk_data/")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# # RUN DOWNLOADS ONLY ONCE
# nltk.download("vader_lexicon")
# nltk.download("stopwords")

## Preprocessing

### Movies

In [3]:
movies = pd.read_csv("../data/movielens/movies.csv")

In [4]:
movies.head(2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [5]:
movies["genres"] = movies["genres"].replace("(no genres listed)", np.nan)
movies["genres"] = movies["genres"].str.lower().str.replace("|", " ").str.replace("-", "").str.strip()
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)")
movies["title"] = movies["title"].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True).str.strip()
movies.dropna(how="any", inplace=True) # dropping 7300 NaN values

In [6]:
movies.head(2)

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,adventure animation children comedy fantasy,1995
1,2,Jumanji,adventure children fantasy,1995


In [7]:
movies_gen = movies.copy()
movies_gen = movies_gen.drop(columns=["title", "year"])

In [8]:
movies_gen.head(2)

Unnamed: 0,movie_id,genres
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy


### Tags

In [9]:
stop_words = stopwords.words("english")
more_stopwords = []  # add internet slang, etc.
# stop_words = stop_words + more_stopwords
stemmer = nltk.SnowballStemmer("english")


# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers."""
    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    return text


def remove_stopwords(text):
    text = " ".join(word for word in text.split(" ") if word not in stop_words)
    return text


def stem_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [10]:
tags = pd.read_csv("../data/movielens/tags.csv")
movies_tag = tags.copy()
movies_tag = movies_tag.drop(columns=["user_id", "timestamp"])
movies_tag = movies_tag.groupby('movie_id')['tag'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
movies_tag["tag"] = movies_tag["tag"].apply(remove_stopwords)
movies_tag["tag"] = movies_tag["tag"].apply(clean_text)
movies_tag["tag"] = movies_tag["tag"].apply(stem_text)

In [11]:
movies_tag

Unnamed: 0,movie_id,tag
0,1,anim friendship toy anim disney pixar toy cgi ...
1,2,anim base book fantasi magic board game monkey...
2,3,sequel moldi old old age old men wed old peopl...
3,4,charact chick flick girl movi charact chick fl...
4,5,famili pregnanc wed wall age babi daughter di...
...,...,...
53447,288765,postapocalypt surviv tw suicid apocalyps bad s...
53448,288779,don camillo seri
53449,288849,addict anim short film
53450,288937,anim


### Combining tags and movies

In [12]:
movies_gentag = movies_gen.merge(movies_tag, on="movie_id", how="left")
movies_gentag["semantics"] = movies_gentag.apply(lambda x: f"{x["genres"]} {x["tag"]}" if pd.notna(x["tag"]) else x["genres"], axis=1)
movies_gentag = movies_gentag.drop(columns=["genres", "tag"])

In [13]:
movies_gentag

Unnamed: 0,movie_id,semantics
0,1,adventure animation children comedy fantasy an...
1,2,adventure children fantasy anim base book fant...
2,3,comedy romance sequel moldi old old age old me...
3,4,comedy drama romance charact chick flick girl ...
4,5,comedy famili pregnanc wed wall age babi daug...
...,...,...
79232,288967,action drama
79233,288971,action horror
79234,288975,documentary
79235,288977,crime thriller


### Dimensionality reduction

?

### TF-IDF

In [14]:
tfidf = TfidfVectorizer(min_df=2)
tfidf_matrix = tfidf.fit_transform(movies_gentag["semantics"])
tfidf_vocab = tfidf.get_feature_names_out()

In [15]:
def get_recommendations(movie_id, n=5):
    idx = movies_gentag[movies_gentag["movie_id"] == movie_id].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    return movies_gentag.iloc[indices[0][1:]]

In [19]:
knn = NearestNeighbors(
    n_neighbors=6,
    metric="cosine",
    algorithm="brute"
)
knn.fit(tfidf_matrix)
temp = get_recommendations(344, 5)
temp = temp["movie_id"].tolist()
temp = movies[movies["movie_id"].isin(temp)]
temp = temp["title"].tolist()
for i in temp:
    print(i)

Ace Ventura: When Nature Calls
Dumb & Dumber
Mask, The
Liar Liar
Me, Myself & Irene


### LDA

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

cvec_vocab = CountVectorizer(vocabulary=tfidf.vocabulary_)
vocab_matrix = cvec_vocab.fit_transform(movies_gentag["semantics"])

lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=10,
    learning_method='online'
)
movie_topic_distributions = lda.fit_transform(vocab_matrix)

In [18]:
topic_df = pd.DataFrame(
    movie_topic_distributions, 
    index=movies_gentag['movie_id']
).reset_index()

In [19]:
feature_names = tfidf.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-11:-1]  # Get indices of top 10 words
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic {topic_idx}: {' '.join(top_words)}")

Topic 0: age femal come school gay romanc high lead strong teenag
Topic 1: visual surreal appeal nuditi cinematographi atmospher beauti topless realiti altern
Topic 2: time travel action art polit postapocalypt martial spi car apocalyps
Topic 3: stori true base oscar best great pictur act inspir tom
Topic 4: death horror man murder shot woman femal charact car zombi
Topic 5: dark film social black commentari cult satir quirki noir white
Topic 6: relationship refer sex nuditi woman femal film male father mother
Topic 7: anim children animation comedy adventure fantasy disney short famili christma
Topic 8: horror end twist psycholog thriller mysteri suspens plot mental killer
Topic 9: crime thriller drama action mystery polic murder reveng prison corrupt
Topic 10: new thoughtprovok york citi dialogu philosophi philosoph driven russel smart
Topic 11: scifi space alien scienc intellig futur dystopia artifici robot travel
Topic 12: relationship famili emot bittersweet life love depress dram

In [20]:
topic_df

Unnamed: 0,movie_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,1,0.005246,0.001295,0.026599,0.108583,0.090583,0.000026,0.047736,0.502813,0.000026,...,0.001574,0.006249,0.002641,0.154539,0.039604,0.000026,0.004207,0.000727,0.006436,0.001062
1,2,0.022383,0.000045,0.117607,0.005098,0.198297,0.002464,0.071008,0.109638,0.000045,...,0.000045,0.002517,0.003081,0.011741,0.062544,0.000045,0.013748,0.010193,0.369414,0.000045
2,3,0.033043,0.000820,0.000820,0.053293,0.050435,0.000820,0.164099,0.000820,0.000820,...,0.378868,0.000820,0.000820,0.126785,0.051542,0.000820,0.000820,0.113549,0.000820,0.019370
3,4,0.450037,0.002000,0.002000,0.002000,0.002000,0.002000,0.217395,0.002000,0.002000,...,0.002000,0.002000,0.002000,0.002000,0.002000,0.002000,0.002000,0.190348,0.110220,0.002000
4,5,0.032472,0.000495,0.000495,0.000495,0.000495,0.000495,0.198269,0.000495,0.000495,...,0.063194,0.000495,0.206060,0.418551,0.000495,0.023666,0.000495,0.000495,0.051352,0.000495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79232,288967,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.350000,0.016667,0.016667
79233,288971,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.350000,...,0.016667,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667,0.016667,0.016667
79234,288975,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,...,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.525000
79235,288977,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667


### LSA

In [20]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20)
lsa = svd.fit_transform(tfidf_matrix)

In [21]:
lsa_tfidf_df = pd.DataFrame(
    lsa,
    index=movies_gentag["movie_id"],
    columns=[f"topic_{i}" for i in range(20)]
)
lsa_tfidf_df

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.010467,0.008603,0.001645,0.010171,0.001096,-0.015361,0.066063,-0.022555,0.002147,0.060716,0.001570,0.026296,-0.013870,-0.014571,0.029628,0.006542,0.145076,-0.066901,0.031462,-0.066254
2,0.009144,0.004357,0.001980,0.020720,0.003380,0.000479,0.041285,0.000624,0.000607,0.037076,0.022250,0.040199,-0.006462,0.004420,0.020473,0.011465,0.132884,-0.058629,-0.021109,-0.096284
3,0.025716,0.032206,-0.001016,-0.001024,0.036179,0.002528,-0.000846,0.000787,0.002445,0.001764,0.004660,0.015921,-0.001002,0.003337,0.009439,0.002077,0.065313,-0.029090,0.027197,-0.023695
4,0.097502,0.037161,-0.003469,-0.011820,0.068959,0.007480,-0.002176,-0.000491,0.003117,0.003132,0.007301,0.027638,-0.001062,0.006927,0.018138,0.003327,0.106745,-0.045062,-0.035105,-0.108186
5,0.012577,0.016782,0.000144,0.002589,-0.001603,-0.000305,0.002448,0.000613,0.002171,0.006351,0.006137,0.028695,-0.001636,0.004006,0.012642,0.004215,0.098431,-0.045149,0.025359,-0.053894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288967,0.504790,-0.255808,-0.007581,0.193000,-0.022367,-0.402980,0.194619,0.579540,-0.187268,-0.071348,-0.169475,-0.005313,0.026108,-0.178153,-0.024510,0.042925,-0.017187,-0.025300,0.000452,0.001304
288971,0.097491,0.018338,0.002927,0.612686,0.021545,0.191931,0.301994,0.644114,-0.053065,-0.054036,-0.213075,0.003816,-0.011663,-0.130229,-0.006041,0.015172,-0.028415,-0.016985,0.000891,0.008062
288975,0.028118,0.022348,0.998436,-0.010612,0.002509,0.002319,-0.008292,0.000981,-0.006160,-0.003641,-0.004724,-0.033518,0.001944,-0.002997,-0.005783,0.000949,-0.001055,0.000925,0.001389,-0.001975
288977,0.131046,-0.056760,0.005540,0.625000,0.026986,-0.367987,-0.296535,-0.300350,0.456300,0.022649,0.007691,-0.028666,-0.206549,0.136139,-0.014501,0.007179,-0.030084,0.021825,0.010413,0.010164


In [22]:
lsa_tfidf_df.shape

(79237, 20)

COherence score med coherencemodel?? Fast LSA i stället för NMF i nedan exempel. 

In [None]:
# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(1, 10, 1))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []
for num in topic_nums:
    # NMF model initialization
    nmf = Nmf(
        corpus=corpus,
        num_topics=num,
        id2word=dictionary,
        chunksize=2000,
        passes=5,
        kappa=0.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42
    )
    
    # Coherence model calculation
    cm = CoherenceModel(
        model=nmf,
        texts=sep_prepr_titles,
        dictionary=dictionary,
        coherence='c_v'
    )
    
    # Appending coherence scores to the list
    coherence_scores.append(round(cm.get_coherence(), 5))

The coherence score is a crucial metric in topic modeling that quantifies the quality and interpretability of topics generated by the NMF. It serves as an objective measure to assess the semantic consistency and relevance of words within a topic. By comparing the pairwise semantic similarity of the top words in a topic, coherence scores indicate whether the terms are closely related in meaning. Higher scores signify more coherent and interpretable topics, while lower scores suggest mixed or less focused topics.

In [None]:
# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]

# Plot the results
fig = plt.figure(figsize=(15, 7))

plt.plot(
    topic_nums,
    coherence_scores,
    linewidth=3,
    color='#4287f5'
)
plt.xlabel("Topic Num", fontsize=14)
plt.ylabel("Coherence Score", fontsize=14)
plt.title('Coherence Score by Topic Number - Best Number of Topics: {}'.format(best_num_topics), fontsize=18)
plt.xticks(np.arange(5, max(topic_nums) + 1, 5), fontsize=12)
plt.yticks(fontsize=12)
plt.show()