# Dependencies

In [2]:
# preprocessing
import pandas as pd
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.data.path.append("/home/albot/coding/repos/Machine-learning-AI24/data/movielens/nltk_data/")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
# # RUN DOWNLOADS ONLY ONCE
# nltk.download("vader_lexicon")
# nltk.download("stopwords")

# Preprocessing

## Movies

In [4]:
movies = pd.read_csv("../data/movielens/movies.csv")

In [5]:
movies.head(2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [6]:
movies["genres"] = movies["genres"].replace("(no genres listed)", np.nan)
movies["genres"] = movies["genres"].str.lower().str.replace("|", " ").str.replace("-", "").str.strip()
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)")
movies["title"] = movies["title"].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True).str.strip()
movies.dropna(how="any", inplace=True) # dropping 7300 NaN values

In [7]:
movies.head(2)

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,adventure animation children comedy fantasy,1995
1,2,Jumanji,adventure children fantasy,1995


In [8]:
movies_gen = movies.copy()
movies_gen = movies_gen.drop(columns=["title", "year"])

In [9]:
movies_gen.head(2)

Unnamed: 0,movie_id,genres
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy


## Tags

In [10]:
stop_words = stopwords.words("english")
more_stopwords = []  # add internet slang, etc.
# stop_words = stop_words + more_stopwords
stemmer = nltk.SnowballStemmer("english")


# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers."""
    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    return text


def remove_stopwords(text):
    text = " ".join(word for word in text.split(" ") if word not in stop_words)
    return text


def stem_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [11]:
tags = pd.read_csv("../data/movielens/tags.csv")
movies_tag = tags.copy()
movies_tag = movies_tag.drop(columns=["user_id", "timestamp"])
movies_tag = movies_tag.groupby('movie_id')['tag'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
movies_tag["tag"] = movies_tag["tag"].apply(remove_stopwords)
movies_tag["tag"] = movies_tag["tag"].apply(clean_text)
movies_tag["tag"] = movies_tag["tag"].apply(stem_text)

In [12]:
movies_tag

Unnamed: 0,movie_id,tag
0,1,anim friendship toy anim disney pixar toy cgi ...
1,2,anim base book fantasi magic board game monkey...
2,3,sequel moldi old old age old men wed old peopl...
3,4,charact chick flick girl movi charact chick fl...
4,5,famili pregnanc wed wall age babi daughter di...
...,...,...
53447,288765,postapocalypt surviv tw suicid apocalyps bad s...
53448,288779,don camillo seri
53449,288849,addict anim short film
53450,288937,anim


## Combining tags and movies

In [13]:
movies_gentag = movies_gen.merge(movies_tag, on="movie_id", how="left")
movies_gentag["semantics"] = movies_gentag.apply(lambda x: f"{x["genres"]} {x["tag"]}" if pd.notna(x["tag"]) else x["genres"], axis=1)
movies_gentag = movies_gentag.drop(columns=["genres", "tag"])

In [14]:
movies_gentag

Unnamed: 0,movie_id,semantics
0,1,adventure animation children comedy fantasy an...
1,2,adventure children fantasy anim base book fant...
2,3,comedy romance sequel moldi old old age old me...
3,4,comedy drama romance charact chick flick girl ...
4,5,comedy famili pregnanc wed wall age babi daug...
...,...,...
79232,288967,action drama
79233,288971,action horror
79234,288975,documentary
79235,288977,crime thriller


## Dimensionality reduction

?

## TF-IDF

In [15]:
tfidf = TfidfVectorizer(min_df=2)
tfidf_matrix = tfidf.fit_transform(movies_gentag["semantics"])
tfidf_vocab = tfidf.get_feature_names_out()

In [16]:
def get_recommendations(movie_id, n=5):
    idx = movies_gentag[movies_gentag["movie_id"] == movie_id].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    return movies_gentag.iloc[indices[0][1:]]

In [17]:
knn = NearestNeighbors(
    n_neighbors=6,
    metric="cosine",
    algorithm="brute"
)
knn.fit(tfidf_matrix)
temp = get_recommendations(1, 5)
temp = temp["movie_id"].tolist()
temp = movies[movies["movie_id"].isin(temp)]
temp = temp["title"].tolist()
for i in temp:
    print(i)

Bug's Life, A
Toy Story 2
Finding Nemo
Toy Story 3
Finding Dory


## LDA

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

cvec_vocab = CountVectorizer(vocabulary=tfidf.vocabulary_)
vocab_matrix = cvec_vocab.fit_transform(movies_gentag["semantics"])

lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=10,
    learning_method='online'
)
movie_topic_distributions = lda.fit_transform(vocab_matrix)

In [19]:
topic_df = pd.DataFrame(
    movie_topic_distributions, 
    index=movies_gentag['movie_id']
).reset_index()

In [20]:
feature_names = tfidf.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-11:-1]  # Get indices of top 10 words
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic {topic_idx}: {' '.join(top_words)}")

Topic 0: base polit book british england religion conspiraci london centuri period
Topic 1: drama comedy romance director romanc love woman relationship famili romant
Topic 2: end twist psycholog great plot act mindfuck dialogu good nonlinear
Topic 3: social commentari new york mental ill citi christian disturb psycholog
Topic 4: relationship refer woman man femal titl father male mother daughter
Topic 5: war oscar best world classic pictur ii histori drama histor
Topic 6: documentary stori true base documentari prison inspir drama biographi sport
Topic 7: superhero charact death shot comic femal man marvel car zombi
Topic 8: music age school come high french teenag friendship film franc
Topic 9: scifi time travel dystopia realiti futur altern postapocalypt apocalyps plot
Topic 10: western michael christma will johnni christoph depp smith tim mexico
Topic 11: tom robert spi action pixar espionag de hank martin bond
Topic 12: action crime thriller drama reveng violenc polic drug taranti

In [21]:
topic_df

Unnamed: 0,movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,0.000765,0.000026,0.020072,0.000026,0.123913,0.022423,0.005022,0.059304,0.021746,0.014606,0.010560,0.310898,0.000698,0.138508,0.006525,0.226871,0.000026,0.015811,0.019320,0.002878
1,2,0.001400,0.013510,0.000045,0.000045,0.207841,0.004588,0.007854,0.159374,0.006824,0.075321,0.002914,0.001282,0.000045,0.048172,0.002572,0.000045,0.020332,0.008315,0.435162,0.004362
2,3,0.043553,0.074919,0.084326,0.000820,0.196742,0.000820,0.000820,0.022923,0.037855,0.000820,0.000820,0.041366,0.000820,0.488479,0.000820,0.000820,0.000820,0.000820,0.000820,0.000820
3,4,0.002000,0.570696,0.002000,0.002000,0.148466,0.002000,0.002000,0.002000,0.002000,0.002000,0.002000,0.002000,0.044209,0.002000,0.002000,0.053636,0.002000,0.002000,0.152993,0.002000
4,5,0.000495,0.294878,0.000495,0.000495,0.168433,0.000495,0.000495,0.000495,0.109075,0.000495,0.000495,0.129567,0.000495,0.275142,0.000495,0.000495,0.000495,0.000495,0.015974,0.000495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79232,288967,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.683333,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667
79233,288971,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667,0.350000,0.016667,0.016667,0.016667
79234,288975,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.525000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000,0.025000
79235,288977,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.683333,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667


## LSA

In [23]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20)
lsa = svd.fit_transform(tfidf_matrix)

In [24]:
lsa_tfidf_df = pd.DataFrame(
    lsa,
    index=movies_gentag["movie_id"],
    columns=[f"topic_{i}" for i in range(20)]
)
lsa_tfidf_df

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.010467,0.008603,0.001645,0.010171,0.001096,-0.015361,0.066063,-0.022555,0.002147,0.060712,0.001568,0.026285,-0.013895,-0.014615,0.029594,0.006543,0.144832,-0.067410,0.031770,-0.067466
2,0.009144,0.004357,0.001980,0.020720,0.003380,0.000479,0.041285,0.000625,0.000609,0.037071,0.022264,0.040177,-0.006478,0.004434,0.020437,0.011453,0.132531,-0.058953,-0.019898,-0.099651
3,0.025716,0.032206,-0.001016,-0.001024,0.036179,0.002528,-0.000846,0.000787,0.002444,0.001764,0.004658,0.015925,-0.001003,0.003342,0.009446,0.002082,0.065328,-0.029058,0.027132,-0.023269
4,0.097502,0.037161,-0.003469,-0.011820,0.068959,0.007480,-0.002176,-0.000491,0.003117,0.003132,0.007301,0.027640,-0.001053,0.006929,0.018142,0.003335,0.106740,-0.045098,-0.035281,-0.106683
5,0.012577,0.016782,0.000144,0.002589,-0.001603,-0.000305,0.002448,0.000612,0.002171,0.006355,0.006134,0.028704,-0.001617,0.004015,0.012658,0.004238,0.098544,-0.045034,0.024732,-0.053409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288967,0.504790,-0.255808,-0.007581,0.193000,-0.022367,-0.402980,0.194619,0.579540,-0.187268,-0.071348,-0.169474,-0.005313,0.026109,-0.178149,-0.024510,0.042926,-0.017196,-0.025298,0.000491,0.001324
288971,0.097491,0.018338,0.002927,0.612686,0.021545,0.191931,0.301994,0.644114,-0.053065,-0.054036,-0.213074,0.003816,-0.011660,-0.130226,-0.006042,0.015173,-0.028426,-0.016988,0.000877,0.007951
288975,0.028118,0.022348,0.998436,-0.010612,0.002509,0.002319,-0.008292,0.000981,-0.006160,-0.003641,-0.004724,-0.033518,0.001944,-0.002997,-0.005782,0.000949,-0.001058,0.000922,0.001404,-0.001899
288977,0.131046,-0.056760,0.005540,0.625000,0.026986,-0.367987,-0.296535,-0.300350,0.456300,0.022649,0.007691,-0.028666,-0.206547,0.136141,-0.014502,0.007180,-0.030096,0.021821,0.010340,0.009730
