# Dependencies

In [3]:
# preprocessing
import pandas as pd
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.data.path.append("/home/albot/coding/repos/Machine-learning-AI24/data/movielens/nltk_data/")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# modelling
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [4]:
# # RUN DOWNLOADS ONLY ONCE
# nltk.download("vader_lexicon")
# nltk.download("stopwords")

# Preprocessing

## Movies

In [5]:
# Movie catalogue, read relative to the notebook's working directory.
movies = pd.read_csv(filepath_or_buffer="../data/movielens/movies.csv")

In [6]:
# Peek at the first two rows to check the loaded columns.
movies.head(2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [7]:
# Treat the "(no genres listed)" placeholder as missing so those rows are
# dropped together with the other incomplete ones below.
movies["genres"] = movies["genres"].replace("(no genres listed)", np.nan)
# "|" and "-" are literal characters here, not regular expressions; saying so
# explicitly keeps the result independent of the pandas default for `regex`.
movies["genres"] = (
    movies["genres"]
    .str.lower()
    .str.replace("|", " ", regex=False)
    .str.replace("-", "", regex=False)
    .str.strip()
)
# Pull the four-digit release year out of the title. When this cell is re-run
# the titles no longer contain the year, so fall back to the value extracted
# earlier instead of overwriting it with NaN (which would make the dropna
# below delete every row).
year = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
if "year" in movies.columns:
    year = year.fillna(movies["year"])
movies["year"] = year
# Strip every [...] and (...) group (year, alternative titles) from the title.
movies["title"] = movies["title"].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True).str.strip()
movies = movies.dropna(how="any")  # dropping 7300 NaN values

In [8]:
# Preview the same two rows after cleaning: genres normalised, year split out of the title.
movies.head(2)

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,adventure animation children comedy fantasy,1995
1,2,Jumanji,adventure children fantasy,1995


In [9]:
# Genre-only view of the catalogue: keep everything except title and year.
movies_gen = movies.drop(columns=["title", "year"])

In [10]:
# Confirm that only movie_id and genres remain.
movies_gen.head(2)

Unnamed: 0,movie_id,genres
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy


## Tags

In [11]:
# Stored as a set: remove_stopwords tests membership once per token, and a set
# answers that in constant time where the original list needed a linear scan.
stop_words = set(stopwords.words("english"))
more_stopwords = []  # add internet slang, etc.
# stop_words |= set(more_stopwords)
stemmer = nltk.SnowballStemmer("english")


# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    """Normalise a raw string for bag-of-words use.

    The input is coerced with ``str`` and lower-cased, then the following are
    stripped in order: ``[...]`` spans, http(s)/www links, ``<...>`` tags,
    ASCII punctuation, newline characters, and any word containing a digit.
    Nothing is inserted in place of removed text, so surrounding whitespace is
    left exactly as it was.
    """
    removal_patterns = (
        r"\[.*?\]",
        r"https?://\S+|www\.\S+",
        r"<.*?>+",
        r"[%s]" % re.escape(string.punctuation),
        r"\n",
        r"\w*\d\w*",
    )
    cleaned = str(text).lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


def remove_stopwords(text):
    """Drop every single-space-separated token that appears in ``stop_words``."""
    kept = [token for token in text.split(" ") if token not in stop_words]
    return " ".join(kept)


def stem_text(text):
    """Replace each single-space-separated token with its stem from ``stemmer``."""
    stems = map(stemmer.stem, text.split(' '))
    return ' '.join(stems)

In [12]:
tags = pd.read_csv("../data/movielens/tags.csv")
# One row per movie holding all of its user tags joined into a single string.
# Missing tags are dropped first; otherwise astype(str) would turn them into
# the literal word "nan" and feed it into the vocabulary.
movies_tag = (
    tags.dropna(subset=["tag"])
    .groupby("movie_id")["tag"]
    .apply(lambda x: " ".join(x.astype(str)))
    .reset_index()
)
# Lower-case before stop-word filtering: the stop-word list is lower-case, so
# capitalised words such as "The" would otherwise slip through. Filtering still
# happens before clean_text strips punctuation, so contractions like "don't"
# keep matching the list.
movies_tag["tag"] = (
    movies_tag["tag"]
    .str.lower()
    .apply(remove_stopwords)
    .apply(clean_text)
    .apply(stem_text)
)

In [13]:
# Inspect the processed per-movie tag strings.
movies_tag

Unnamed: 0,movie_id,tag
0,1,anim friendship toy anim disney pixar toy cgi ...
1,2,anim base book fantasi magic board game monkey...
2,3,sequel moldi old old age old men wed old peopl...
3,4,charact chick flick girl movi charact chick fl...
4,5,famili pregnanc wed wall age babi daughter di...
...,...,...
53447,288765,postapocalypt surviv tw suicid apocalyps bad s...
53448,288779,don camillo seri
53449,288849,addict anim short film
53450,288937,anim


## Combining tags and movies

In [14]:
# Left join keeps every movie that has genres, tagged or not.
movies_gentag = movies_gen.merge(movies_tag, on="movie_id", how="left")
# Build "<genres> <tags>" in one vectorised pass. str.cat yields NaN wherever
# the tag is missing, and those rows fall back to the genres alone. This also
# avoids reusing double quotes inside an f-string replacement field, which is
# a SyntaxError on Python versions before 3.12.
genres_and_tags = movies_gentag["genres"].str.cat(movies_gentag["tag"], sep=" ")
movies_gentag["semantics"] = genres_and_tags.fillna(movies_gentag["genres"])
movies_gentag = movies_gentag.drop(columns=["genres", "tag"])

In [15]:
# Inspect the combined genre + tag text that feeds the vectoriser.
movies_gentag

Unnamed: 0,movie_id,semantics
0,1,adventure animation children comedy fantasy an...
1,2,adventure children fantasy anim base book fant...
2,3,comedy romance sequel moldi old old age old me...
3,4,comedy drama romance charact chick flick girl ...
4,5,comedy famili pregnanc wed wall age babi daug...
...,...,...
79232,288967,action drama
79233,288971,action horror
79234,288975,documentary
79235,288977,crime thriller


## Dimensionality reduction

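**TODO:** open question — decide whether to reduce the dimensionality of the feature matrix before the nearest-neighbour search.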

## TF-IDF

In [16]:
# TF-IDF weighting of the combined text; min_df=2 ignores terms that occur in
# fewer than two documents.
tfidf = TfidfVectorizer(min_df=2)
semantics = tfidf.fit_transform(raw_documents=movies_gentag["semantics"])
features = tfidf.get_feature_names_out()

In [17]:
def get_recommendations(movie_id, n=5):
    """Return the rows of ``movies_gentag`` nearest to ``movie_id``.

    The query vector is the row of ``semantics`` at the movie's position in
    ``movies_gentag``; neighbours come from the module-level ``knn`` model.

    Args:
        movie_id: Value to look up in ``movies_gentag["movie_id"]``.
        n: Number of recommendations wanted.

    Returns:
        A slice of ``movies_gentag`` with at most ``n`` rows, nearest first,
        never containing the query movie itself.

    Raises:
        IndexError: If ``movie_id`` is not present in ``movies_gentag``.
    """
    matches = np.flatnonzero((movies_gentag["movie_id"] == movie_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in movies_gentag")
    # Use the positional row number: `semantics` is addressed by position, so
    # an index label would only be correct while the frame has a 0..n-1 index.
    idx = int(matches[0])
    # Ask for one extra neighbour because the query movie matches itself, but
    # never for more rows than the matrix holds.
    k = min(n + 1, semantics.shape[0])
    _, indices = knn.kneighbors(semantics[idx], n_neighbors=k)
    # Remove the query by position instead of assuming it is ranked first:
    # movies with identical feature vectors tie at distance 0 and may precede it.
    neighbours = [i for i in indices[0] if i != idx][:n]
    return movies_gentag.iloc[neighbours]

In [18]:
# Brute-force cosine nearest neighbours over the TF-IDF matrix.
knn = NearestNeighbors(
    n_neighbors=6,
    metric="cosine",
    algorithm="brute"
)
knn.fit(semantics)

# Smoke test: five recommendations for movie_id 1.
recommended_ids = get_recommendations(1, 5)["movie_id"].tolist()
# Look titles up by id in recommendation order; filtering `movies` with isin()
# would list them in catalogue order and lose the similarity ranking.
titles_by_id = movies.set_index("movie_id")["title"]
for title in titles_by_id.loc[recommended_ids]:
    print(title)

Bug's Life, A
Toy Story 2
Finding Nemo
Toy Story 3
Finding Dory


## LDA

In [None]:
# LDA is fitted on raw term counts. Passing the TF-IDF vocabulary keeps the
# column order identical to `tfidf`, so its feature names also label the LDA
# components.
count_vectorizer = CountVectorizer(vocabulary=tfidf.vocabulary_)
count_matrix = count_vectorizer.fit_transform(movies_gentag["semantics"])

lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=10,
    learning_method='online',
    random_state=42,  # fixed seed so the topics are reproducible across runs
)
movie_topic_distributions = lda.fit_transform(count_matrix)

In [None]:
# One row per movie: movie_id followed by one column per topic.
topic_df = pd.DataFrame(data=movie_topic_distributions, index=movies_gentag['movie_id'])
topic_df = topic_df.reset_index()

In [None]:
# Print the ten highest-weighted terms of every topic.
feature_names = tfidf.get_feature_names_out()
for topic_idx, weights in enumerate(lda.components_):
    ranked = weights.argsort()[::-1][:10]  # term indices, heaviest first
    top_terms = ' '.join(feature_names[i] for i in ranked)
    print(f"Topic {topic_idx}: {top_terms}")

Topic 0: crime thriller polic drama drug action reveng violenc noir robert
Topic 1: anim children animation comedy fantasy adventure disney famili short christma
Topic 2: action war drama art fight adventure martial sword militari ryan
Topic 3: bad action car game plot sequel intellig comput predict video
Topic 4: age school come romanc gay femal high teenag friendship movi
Topic 5: refer relationship man femal death woman titl charact male shot
Topic 6: horror thriller mystery killer murder drama serial suspens monster supernatur
Topic 7: end twist plot mysteri thriller suspens mindfuck clever great script
Topic 8: base book western spi novel england jame japan british espionag
Topic 9: scifi alien dystopia scienc futur artifici robot postapocalypt surviv apocalyps
Topic 10: psycholog time travel thoughtprovok mental ill disturb loop bruce mindfuck
Topic 11: best oscar great act classic pictur inspir cinematographi emot bittersweet
Topic 12: comedi comedy funni humor dark black satir 

In [24]:
# Inspect the per-movie topic table.
topic_df

Unnamed: 0,movie_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,1,0.000759,0.496701,0.008637,0.025874,0.011835,0.134649,0.007886,0.000026,0.001047,...,0.008767,0.044592,0.126455,0.120974,0.001911,0.000026,0.006353,0.000026,0.000026,0.000818
1,2,0.005144,0.143153,0.000045,0.085996,0.175865,0.246460,0.057536,0.000045,0.011392,...,0.062839,0.013569,0.006969,0.184194,0.000045,0.000045,0.002588,0.002865,0.000045,0.001163
2,3,0.031587,0.000820,0.238888,0.000820,0.099298,0.163369,0.000820,0.000820,0.000820,...,0.000820,0.069049,0.257968,0.000820,0.000820,0.000820,0.112937,0.017068,0.000820,0.000820
3,4,0.043838,0.002000,0.002000,0.002000,0.590157,0.002000,0.002000,0.002000,0.152831,...,0.002000,0.002000,0.002000,0.002000,0.002000,0.002000,0.002000,0.181174,0.002000,0.002000
4,5,0.107979,0.059186,0.000495,0.018554,0.031720,0.023341,0.000495,0.000495,0.000495,...,0.000495,0.095361,0.281395,0.000495,0.000495,0.000495,0.012317,0.364702,0.000495,0.000495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79232,288967,0.016667,0.016667,0.683333,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667
79233,288971,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667
79234,288975,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,...,0.025000,0.025000,0.025000,0.025000,0.525000,0.025000,0.025000,0.025000,0.025000,0.025000
79235,288977,0.683333,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667
