# Dependencies

In [1]:
# preprocessing
import pandas as pd
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.data.path.append("/home/albot/coding/repos/Machine-learning-AI24/data/movielens/nltk_data/")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# # RUN DOWNLOADS ONLY ONCE
# nltk.download("vader_lexicon")
# nltk.download("stopwords")

# Preprocessing

## Movies

In [3]:
movies = pd.read_csv("../data/movielens/movies.csv")

In [4]:
movies.head(2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [5]:
movies["genres"] = movies["genres"].replace("(no genres listed)", np.nan)
movies["genres"] = movies["genres"].str.lower().str.replace("|", " ").str.replace("-", "").str.strip()
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)")
movies["title"] = movies["title"].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True).str.strip()
movies.dropna(how="any", inplace=True) # dropping 7300 NaN values

In [6]:
movies.head(2)

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,adventure animation children comedy fantasy,1995
1,2,Jumanji,adventure children fantasy,1995


In [7]:
movies_gen = movies.copy()
movies_gen = movies_gen.drop(columns=["title", "year"])

In [8]:
movies_gen.head(2)

Unnamed: 0,movie_id,genres
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy


## Tags

In [9]:
stop_words = stopwords.words("english")
more_stopwords = []  # add internet slang, etc.
# stop_words = stop_words + more_stopwords
stemmer = nltk.SnowballStemmer("english")


# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers."""
    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    return text


def remove_stopwords(text):
    text = " ".join(word for word in text.split(" ") if word not in stop_words)
    return text


def stem_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [10]:
tags = pd.read_csv("../data/movielens/tags.csv")
movies_tag = tags.copy()
movies_tag = movies_tag.drop(columns=["user_id", "timestamp"])
movies_tag = movies_tag.groupby('movie_id')['tag'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
movies_tag["tag"] = movies_tag["tag"].apply(remove_stopwords)
movies_tag["tag"] = movies_tag["tag"].apply(clean_text)
movies_tag["tag"] = movies_tag["tag"].apply(stem_text)

In [11]:
movies_tag

Unnamed: 0,movie_id,tag
0,1,anim friendship toy anim disney pixar toy cgi ...
1,2,anim base book fantasi magic board game monkey...
2,3,sequel moldi old old age old men wed old peopl...
3,4,charact chick flick girl movi charact chick fl...
4,5,famili pregnanc wed wall age babi daughter di...
...,...,...
53447,288765,postapocalypt surviv tw suicid apocalyps bad s...
53448,288779,don camillo seri
53449,288849,addict anim short film
53450,288937,anim


## Combining tags and movies

In [12]:
movies_gentag = movies_gen.merge(movies_tag, on="movie_id", how="left")
movies_gentag["semantics"] = movies_gentag.apply(lambda x: f"{x["genres"]} {x["tag"]}" if pd.notna(x["tag"]) else x["genres"], axis=1)
movies_gentag = movies_gentag.drop(columns=["genres", "tag"])

In [13]:
movies_gentag

Unnamed: 0,movie_id,semantics
0,1,adventure animation children comedy fantasy an...
1,2,adventure children fantasy anim base book fant...
2,3,comedy romance sequel moldi old old age old me...
3,4,comedy drama romance charact chick flick girl ...
4,5,comedy famili pregnanc wed wall age babi daug...
...,...,...
79232,288967,action drama
79233,288971,action horror
79234,288975,documentary
79235,288977,crime thriller


## Dimensionality reduction

?

## TF-IDF

In [14]:
tfidf = TfidfVectorizer(min_df=2)
tfidf_matrix = tfidf.fit_transform(movies_gentag["semantics"])
tfidf_vocab = tfidf.get_feature_names_out()

In [15]:
def get_recommendations(movie_id, n=5):
    idx = movies_gentag[movies_gentag["movie_id"] == movie_id].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    return movies_gentag.iloc[indices[0][1:]]

In [16]:
knn = NearestNeighbors(
    n_neighbors=6,
    metric="cosine",
    algorithm="brute"
)
knn.fit(tfidf_matrix)
temp = get_recommendations(1, 5)
temp = temp["movie_id"].tolist()
temp = movies[movies["movie_id"].isin(temp)]
temp = temp["title"].tolist()
for i in temp:
    print(i)

Bug's Life, A
Toy Story 2
Finding Nemo
Toy Story 3
Finding Dory


## LDA

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

cvec_vocab = CountVectorizer(vocabulary=tfidf.vocabulary_)
vocab_matrix = cvec_vocab.fit_transform(movies_gentag["semantics"])

lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=10,
    learning_method='online'
)
movie_topic_distributions = lda.fit_transform(vocab_matrix)

In [18]:
topic_df = pd.DataFrame(
    movie_topic_distributions, 
    index=movies_gentag['movie_id']
).reset_index()

In [19]:
feature_names = tfidf.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-11:-1]  # Get indices of top 10 words
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic {topic_idx}: {' '.join(top_words)}")

Topic 0: crime thriller action drama western polic reveng drug robert violenc
Topic 1: nuditi sex topless gay femal sexual frontal full drug lesbian
Topic 2: relationship music famili love drama life father child daughter mother
Topic 3: documentari movi sport michael multipl storylin tv john busi footag
Topic 4: horror thriller murder mystery killer zombi serial suspens mysteri supernatur
Topic 5: new polit york mental citi ill conspiraci journal woodi corrupt
Topic 6: documentary christma steven island spielberg musician treasur hunt disast cage
Topic 7: anim children animation fantasi adventure fantasy magic adventur marvel action
Topic 8: base book oscar best christian pictur prison drama adapt novel
Topic 9: drama comedy romance director woman romanc love romant bdr ryan
Topic 10: film social commentari black white french independ franc road movi
Topic 11: scifi space time travel alien scienc action futur bad robot
Topic 12: end twist psycholog plot great mindfuck mysteri nonlinea

In [20]:
topic_df

Unnamed: 0,movie_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,1,0.000026,0.000026,0.031419,0.006758,0.005171,0.006499,0.002024,0.472110,0.000026,...,0.012359,0.024183,0.000026,0.003014,0.150791,0.000026,0.002559,0.118264,0.158832,0.005294
1,2,0.000045,0.002400,0.005479,0.000045,0.025841,0.002634,0.000045,0.188918,0.026789,...,0.000045,0.119538,0.000045,0.031009,0.363904,0.000045,0.000045,0.012609,0.211642,0.008880
2,3,0.039279,0.000820,0.000820,0.545719,0.000820,0.000820,0.000820,0.000820,0.000820,...,0.000820,0.000820,0.000820,0.000820,0.176565,0.000820,0.000820,0.071807,0.092743,0.000820
3,4,0.002000,0.002000,0.173016,0.002000,0.002000,0.002000,0.002000,0.002000,0.170143,...,0.002000,0.002000,0.002000,0.002000,0.126653,0.002000,0.002000,0.002000,0.062627,0.002000
4,5,0.131335,0.000495,0.394944,0.000495,0.000495,0.000495,0.000495,0.045469,0.000495,...,0.000495,0.000495,0.000495,0.000495,0.013265,0.000495,0.000495,0.242618,0.039117,0.000495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79232,288967,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667,0.016667
79233,288971,0.016667,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667,0.016667
79234,288975,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.525000,0.025000,0.025000,...,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000
79235,288977,0.683333,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667


## LSA

In [21]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20)
lsa = svd.fit_transform(tfidf_matrix)

In [22]:
lsa_tfidf_df = pd.DataFrame(
    lsa,
    index=movies_gentag["movie_id"],
    columns=[f"topic_{i}" for i in range(20)]
)
lsa_tfidf_df

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.010467,0.008603,0.001645,0.010171,0.001096,-0.015361,0.066062,-0.022555,0.002147,0.060710,0.001573,0.026294,-0.013863,-0.014583,0.029596,0.006486,0.144650,-0.067236,0.030730,-0.066995
2,0.009144,0.004357,0.001980,0.020720,0.003380,0.000478,0.041285,0.000624,0.000609,0.037073,0.022255,0.040191,-0.006455,0.004426,0.020452,0.011410,0.132649,-0.058776,-0.021237,-0.096420
3,0.025716,0.032206,-0.001016,-0.001024,0.036179,0.002528,-0.000846,0.000787,0.002445,0.001764,0.004659,0.015924,-0.001000,0.003339,0.009445,0.002075,0.065315,-0.029040,0.027043,-0.023721
4,0.097502,0.037161,-0.003469,-0.011820,0.068959,0.007480,-0.002176,-0.000491,0.003117,0.003131,0.007301,0.027636,-0.001058,0.006934,0.018131,0.003325,0.106712,-0.045107,-0.034985,-0.108144
5,0.012577,0.016782,0.000144,0.002589,-0.001603,-0.000305,0.002448,0.000612,0.002171,0.006352,0.006137,0.028699,-0.001633,0.004019,0.012648,0.004242,0.098481,-0.045044,0.025416,-0.053627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288967,0.504790,-0.255808,-0.007581,0.193000,-0.022367,-0.402980,0.194619,0.579540,-0.187268,-0.071348,-0.169474,-0.005313,0.026109,-0.178151,-0.024508,0.042930,-0.017183,-0.025283,0.000525,0.001456
288971,0.097491,0.018338,0.002927,0.612686,0.021545,0.191931,0.301994,0.644114,-0.053065,-0.054036,-0.213074,0.003816,-0.011662,-0.130226,-0.006041,0.015176,-0.028422,-0.016970,0.000935,0.008119
288975,0.028118,0.022348,0.998436,-0.010612,0.002509,0.002319,-0.008292,0.000981,-0.006160,-0.003641,-0.004724,-0.033518,0.001944,-0.002998,-0.005783,0.000950,-0.001057,0.000918,0.001392,-0.001942
288977,0.131046,-0.056760,0.005540,0.625000,0.026986,-0.367987,-0.296535,-0.300350,0.456300,0.022648,0.007691,-0.028666,-0.206548,0.136140,-0.014504,0.007178,-0.030095,0.021828,0.010378,0.009950
