# Dependencies

In [263]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
nltk.data.path.append("/home/albot/coding/repos/Machine-learning-AI24/data/movielens/nltk_data/")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [264]:
# # RUN DOWNLOADS ONLY ONCE
# nltk.download("vader_lexicon")
# nltk.download("stopwords")

# Preprocessing

## Movies

In [265]:
movies = pd.read_csv("../data/movielens/movies.csv")

In [266]:
movies.head(2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [267]:
movies["genres"] = movies["genres"].replace("(no genres listed)", np.nan)
movies["genres"] = movies["genres"].str.lower().str.replace("|", " ").str.replace("-", "").str.strip()
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)")
movies["title"] = movies["title"].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True).str.strip()
movies.dropna(how="any", inplace=True) # dropping 7300 NaN values

In [268]:
movies.head(2)

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,adventure animation children comedy fantasy,1995
1,2,Jumanji,adventure children fantasy,1995


In [None]:
movies_gen = movies.copy()
movies_gen = movies_gen.drop(columns=["title", "year"])

In [276]:
movies_gen.head(2)

Unnamed: 0,movie_id,genres
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy


## Tags

In [270]:
stop_words = stopwords.words("english")
more_stopwords = []  # add internet slang, etc.
# stop_words = stop_words + more_stopwords
stemmer = nltk.SnowballStemmer("english")


# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers."""
    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    return text


def remove_stopwords(text):
    text = " ".join(word for word in text.split(" ") if word not in stop_words)
    return text


def stem_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [271]:
tags = pd.read_csv("../data/movielens/tags.csv")
movies_tag = tags.copy()
movies_tag = movies_tag.drop(columns=["user_id", "timestamp"])
movies_tag = movies_tag.groupby('movie_id')['tag'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
movies_tag["tag"] = movies_tag["tag"].apply(remove_stopwords)
movies_tag["tag"] = movies_tag["tag"].apply(clean_text)
movies_tag["tag"] = movies_tag["tag"].apply(stem_text)

In [272]:
movies_tag

Unnamed: 0,movie_id,tag
0,1,anim friendship toy anim disney pixar toy cgi ...
1,2,anim base book fantasi magic board game monkey...
2,3,sequel moldi old old age old men wed old peopl...
3,4,charact chick flick girl movi charact chick fl...
4,5,famili pregnanc wed wall age babi daughter di...
...,...,...
53447,288765,postapocalypt surviv tw suicid apocalyps bad s...
53448,288779,don camillo seri
53449,288849,addict anim short film
53450,288937,anim


## Combining tags and movies

In [273]:
movies_gentag = movies_gen.merge(movies_tag, on="movie_id", how="left")
movies_gentag["semantics"] = movies_gentag.apply(lambda x: f"{x["genres"]} {x["tag"]}" if pd.notna(x["tag"]) else x["genres"], axis=1)
movies_gentag = movies_gentag.drop(columns=["genres", "tag"])

In [274]:
movies_gentag

Unnamed: 0,movie_id,semantics
0,1,adventure animation children comedy fantasy an...
1,2,adventure children fantasy anim base book fant...
2,3,comedy romance sequel moldi old old age old me...
3,4,comedy drama romance charact chick flick girl ...
4,5,comedy famili pregnanc wed wall age babi daug...
...,...,...
79232,288967,action drama
79233,288971,action horror
79234,288975,documentary
79235,288977,crime thriller


## Dimensionality reduction

## TF-IDF

In [275]:
tfidf = TfidfVectorizer()
semantics = tfidf.fit_transform(movies_gentag["semantics"])
features = tfidf.get_feature_names_out()
movies_gentag_vec = pd.DataFrame(semantics.todense(), columns=features, index=movies_gentag["movie_id"])
movies_gentag_vec.head(2)

MemoryError: Unable to allocate 26.0 GiB for an array with shape (79237, 43982) and data type float64