In [34]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import string
import stopwords
import nltk
from nltk.corpus import stopwords
nltk.data.path.append("/home/albot/coding/repos/Machine-learning-AI24/data/movielens/nltk_data/")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [35]:
tags = pd.read_csv("../data/movielens/tags.csv")
movies = pd.read_csv("../data/movielens/movies.csv")
ratings = pd.read_csv("../data/movielens/ratings.csv")

In [36]:
# tags
print(f"There are {tags["movie_id"].nunique()} movies in tags")
print(f"There are {tags["user_id"].nunique()} users in tags")

There are 53452 movies in tags
There are 25280 users in tags


In [37]:
# movies
print(f"There are {movies["movie_id"].nunique()} movies in movies")

There are 86537 movies in movies


In [38]:
# ratings
print(f"There are {ratings["movie_id"].nunique()} movies in ratings")
print(f"There are {ratings["user_id"].nunique()} users in ratings")

There are 83239 movies in ratings
There are 330975 users in ratings


# EDA

In [39]:
tags["tag"] = tags["tag"].astype(str)
tags["char_length"] = tags["tag"].map(lambda x: len(x))
tags["word_length"] = tags["tag"].map(lambda x: len(x.split(" ")))
tags.head()

Unnamed: 0,user_id,movie_id,tag,timestamp,char_length,word_length
0,10,260,good vs evil,1430666558,12,3
1,10,260,Harrison Ford,1430666505,13,2
2,10,260,sci-fi,1430666538,6,1
3,14,1221,Al Pacino,1311600756,9,2
4,14,1221,mafia,1311600746,5,1


In [40]:
pd.set_option("display.float_format", "{:.2f}".format)
tags[["char_length", "word_length"]].describe()

Unnamed: 0,char_length,word_length
count,2328315.0,2328315.0
mean,11.09,1.66
std,5.69,0.92
min,1.0,1.0
25%,7.0,1.0
50%,10.0,1.0
75%,14.0,2.0
max,241.0,42.0


In [41]:
print(f"There are {tags["tag"].nunique()} unique words in tags.")

There are 153950 unique words in tags.


# Preprocessing

In [42]:
# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers."""
    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    return text

In [43]:
tags["tag"] = tags["tag"].apply(clean_text)
print(f"There are {tags["tag"].nunique()} unique words in tags.")

There are 141025 unique words in tags.


In [44]:
stop_words = stopwords.words("english")
more_stopwords = []  # add internet slang, etc.
# stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = " ".join(word for word in text.split(" ") if word not in stop_words)
    return text

tags["tag"] = tags["tag"].apply(remove_stopwords)
print(f"There are {tags["tag"].nunique()} unique words in tags.")

There are 137730 unique words in tags.


In [45]:
stemmer = nltk.SnowballStemmer("english")

def stem_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

tags["tag"] = tags["tag"].apply(stem_text)
print(f"There are {tags["tag"].nunique()} unique words in tags.")

There are 130114 unique words in tags.


In [None]:
# temp = tags.head(25)
# temp

Unnamed: 0,user_id,movie_id,tag,timestamp,char_length,word_length
0,10,260,good vs evil,1430666558,12,3
1,10,260,harrison ford,1430666505,13,2
2,10,260,scifi,1430666538,6,1
3,14,1221,al pacino,1311600756,9,2
4,14,1221,mafia,1311600746,5,1
5,14,58559,atmospher,1311530439,11,1
6,14,58559,batman,1311530391,6,1
7,14,58559,comic book,1311530398,10,2
8,14,58559,dark,1311530428,4,1
9,14,58559,heath ledger,1311530404,12,2


In [None]:
# temp_movie = temp.groupby("movie_id").agg({"tag": " ".join})
# temp_movie

Unnamed: 0_level_0,tag
movie_id,Unnamed: 1_level_1
47,kevin spacey morgan freeman power end
260,good vs evil harrison ford scifi
296,crime cult film quentin tarantino
1221,al pacino mafia
57183,famili
58559,atmospher batman comic book dark heath ledger ...


In [None]:
# tags_sentiment = tags.copy()
# sia = SentimentIntensityAnalyzer()

# # Function to get sentiment score for a tag
# def get_sentiment(tag):
#     if isinstance(tag, str):
#         return sia.polarity_scores(tag)['compound']
#     return 0  # Neutral for missing values

# # Apply sentiment analysis to each tag
# tags_sentiment['sentiment_score'] = tags_sentiment['tag'].apply(get_sentiment)

In [None]:
# tags_sentiment.head(25)

Unnamed: 0,user_id,movie_id,tag,timestamp,char_length,word_length,sentiment_score
0,10,260,good vs evil,1430666558,12,3,-0.36
1,10,260,harrison ford,1430666505,13,2,0.0
2,10,260,scifi,1430666538,6,1,0.0
3,14,1221,al pacino,1311600756,9,2,0.0
4,14,1221,mafia,1311600746,5,1,0.0
5,14,58559,atmospher,1311530439,11,1,0.0
6,14,58559,batman,1311530391,6,1,0.0
7,14,58559,comic book,1311530398,10,2,0.0
8,14,58559,dark,1311530428,4,1,0.0
9,14,58559,heath ledger,1311530404,12,2,0.0


In [None]:
# # Group by movieId and calculate average sentiment
# tags_sentiment_movies = tags_sentiment.groupby('movie_id')['sentiment_score'].agg(['mean', 'count']).reset_index()
# tags_sentiment_movies.rename(columns={'mean': 'avg_sentiment'}, inplace=True)

# # Filter out movies with too few tags (optional)
# tags_sentiment_movies = tags_sentiment_movies[tags_sentiment_movies['count'] >= 3]

# print(tags_sentiment_movies.head())
# tags_sentiment_movies[(tags_sentiment_movies["avg_sentiment"] >= -0.05) & tags_sentiment_movies["avg_sentiment"] <= 0.05].count()

   movie_id  avg_sentiment  count
0         1           0.04   1440
1         2          -0.03    653
2         3           0.05     36
3         4           0.02     13
4         5          -0.01     68


movie_id         23280
avg_sentiment    23280
count            23280
dtype: int64

# Ratings

In [None]:
user_movie_merge = tags.groupby(["user_id", "movie_id"])["tag"].apply(" ".join).reset_index()
user_movie_merge = user_movie_merge.merge(ratings, on=["user_id", "movie_id"], how="left")
user_movie_merge = user_movie_merge.drop(columns=["timestamp"]).dropna(how="any")
user_movie_merge.head()

Unnamed: 0,user_id,movie_id,tag,rating
0,10,260,good vs evil harrison ford scifi,4.5
2,14,58559,atmospher batman comic book dark heath ledger ...,5.0
3,16,57183,famili,4.5
4,26,296,crime cult film quentin tarantino,4.5
5,37,47,kevin spacey morgan freeman power end twist end,5.0


In [None]:
u37 = user_movie_merge[user_movie_merge["user_id"] == 37]
u37

Unnamed: 0,user_id,movie_id,tag,rating
5,37,47,kevin spacey morgan freeman power end twist end,5.00
6,37,165,action,4.00
7,37,293,gari oldman great act jean reno natali portman,5.00
8,37,480,classic steven spielberg,4.00
9,37,527,act john william move,5.00
...,...,...,...,...
77,37,202103,artifici intellig postapocalypt,4.00
78,37,202429,aimless bore plot long,0.50
79,37,204698,great act joaquin phoenix tragedi,5.00
80,37,206857,energi long,1.50
