# 1) Read in data

In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the reviews-with-metadata CSV from the Kaggle input directory.
reviews_csv_path = "/kaggle/input/add-metadata-to-user-reviews/user_reviews_with_metadata.csv"
user_reviews_with_metadata = pd.read_csv(reviews_csv_path)

# Report (rows, columns), then show the first rows as the cell's output.
print(user_reviews_with_metadata.shape)
user_reviews_with_metadata.head()

(301, 19)


Unnamed: 0,movie_title,movie_user_url,review_text,rating,review_date,movie_year,liked,user_url,username,movie_url,watches,likes,lists,synopsis,fans,average_rating,total_ratings,rating_distribution,genres
0,The Bikeriders,https://letterboxd.com/kurstboy/film/the-biker...,michael shannon knocks it out of the park as a...,0.7,Watched 01 Sep 2024,2023.0,True,https://letterboxd.com/kurstboy/,kurstboy,https://letterboxd.com/film/the-bikeriders/,219787.0,61821.0,58513.0,"After a chance encounter at a local bar, stron...",472.0,3.4,192864.0,"[0.0019391903102704497, 0.006714576074332172, ...","['Crime', 'Drama']"
1,The Heartbreak Kid,https://letterboxd.com/kurstboy/film/the-heart...,40 to 50 out of 5 stars,1.0,Rewatched 28 Aug 2024,1972.0,True,https://letterboxd.com/kurstboy/,kurstboy,https://letterboxd.com/film/the-heartbreak-kid...,25535.0,8856.0,19728.0,Three days into his Miami honeymoon with needy...,291.0,3.93,18023.0,"[0.0021639016811851523, 0.004716195971813794, ...","['Romance', 'Comedy']"
2,Design for Living,https://letterboxd.com/kurstboy/film/design-fo...,lubitsch just knows how to get it done,0.9,Watched 27 Aug 2024,1933.0,True,https://letterboxd.com/kurstboy/,kurstboy,https://letterboxd.com/film/design-for-living/,25090.0,8498.0,15777.0,An independent woman can’t choose between the ...,227.0,3.9,17118.0,"[0.0009346886318495151, 0.002511975698095572, ...","['Comedy', 'Romance']"
3,King Kong,https://letterboxd.com/kurstboy/film/king-kong...,monkey mondays #34\nkind of flabbergasted at h...,0.8,Watched 26 Aug 2024,1976.0,True,https://letterboxd.com/kurstboy/,kurstboy,https://letterboxd.com/film/king-kong-1976/,36763.0,4655.0,10915.0,An oil company expedition disturbs the peace o...,17.0,2.94,21139.0,"[0.007379724679502341, 0.026396707507450684, 0...","['Fantasy', 'Adventure']"
4,Bobo the Monkey,https://letterboxd.com/kurstboy/film/bobo-the-...,monkey mondays #33,0.8,Watched 19 Aug 2024,2021.0,True,https://letterboxd.com/kurstboy/,kurstboy,https://letterboxd.com/film/bobo-the-monkey/,8341.0,2233.0,1360.0,A little monkey fakes death to evade the law.,,3.63,3653.0,"[0.004106214070626882, 0.005748699698877635, 0...",['Animation']


In [2]:
# Drop the URL features; synopsis is also left out for now (too much trouble to handle).
unused_columns = ['movie_user_url', 'user_url', 'movie_url', 'synopsis']
user_reviews_with_metadata = user_reviews_with_metadata.drop(columns=unused_columns)

In [3]:
# List the columns that remain in the frame at this point.
user_reviews_with_metadata.columns

Index(['movie_title', 'review_text', 'rating', 'review_date', 'movie_year',
       'liked', 'username', 'watches', 'likes', 'lists', 'fans',
       'average_rating', 'total_ratings', 'rating_distribution', 'genres'],
      dtype='object')

# 1.5) Handle missing values

In [4]:
from sklearn.impute import SimpleImputer
import ast
# Handling missing values

# 1. Columns that receive a fixed placeholder when the value is missing:
#      review_text           -> "No review provided"
#      review_date           -> "Watched 01 Sep 2024" (a specific date in the same text layout as the data)
#      liked                 -> False
#      username, movie_title -> "Unknown"
constant_fill_values = {
    'review_text': "No review provided",
    'review_date': "Watched 01 Sep 2024",
    'liked': False,
    'username': "Unknown",
    'movie_title': "Unknown",
}
for col, fill_value in constant_fill_values.items():
    user_reviews_with_metadata[col] = user_reviews_with_metadata[col].fillna(fill_value)

# 2. rating: missing values become the column mean
rating_imputer = SimpleImputer(strategy='mean')
rating_filled = rating_imputer.fit_transform(user_reviews_with_metadata[['rating']])
user_reviews_with_metadata['rating'] = rating_filled

# 3. movie_year: missing values become the column median
movie_year_imputer = SimpleImputer(strategy='median')
movie_year_filled = movie_year_imputer.fit_transform(user_reviews_with_metadata[['movie_year']])
user_reviews_with_metadata['movie_year'] = movie_year_filled

# 4. watches, likes, lists, fans: a missing count is filled with zero
numerical_columns = ['watches', 'likes', 'lists', 'fans']
user_reviews_with_metadata[numerical_columns] = user_reviews_with_metadata[numerical_columns].fillna(0)

# 5. average_rating, total_ratings: median imputation, one freshly fitted imputer per column
rating_columns = ['average_rating', 'total_ratings']
for col in rating_columns:
    imputer = SimpleImputer(strategy='median')
    user_reviews_with_metadata[col] = imputer.fit_transform(user_reviews_with_metadata[[col]])

# 6. rating_distribution: a missing cell becomes the string form of a list of zeros,
#    then every cell (stored as text in the CSV) is parsed into a Python list
num_ratings = 10
empty_distribution = str([0.0] * num_ratings)
user_reviews_with_metadata['rating_distribution'] = (
    user_reviews_with_metadata['rating_distribution']
    .fillna(empty_distribution)
    .apply(ast.literal_eval)
)


# 7. genres: a missing cell becomes "[]"; string cells are parsed, anything else is kept as-is
def _parse_genres(cell):
    """Parse a string cell with ast.literal_eval; return non-string cells unchanged."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


user_reviews_with_metadata['genres'] = (
    user_reviews_with_metadata['genres']
    .fillna("[]")
    .apply(_parse_genres)
)


## 2) Prepare the rating label for classification: it is currently scaled from 0 to 1 and needs to be an integer class 0, 1, 2, ..., 9

In [5]:
# Map the 0-1 scaled rating onto integer class labels: 0.1 -> 0, 0.2 -> 1, ..., 1.0 -> 9.
# Round before casting: astype(int) truncates toward zero, so any value that is not
# exactly a whole number after scaling (floating-point error, or a mean-imputed rating)
# would always be pushed down a class instead of snapping to the nearest one.
user_reviews_with_metadata['rating'] = (user_reviews_with_metadata['rating'] * 10 - 1).round().astype(int)

# 3) Lowercase text data

In [6]:
# Lowercase the review text. The synopsis column is not handled here (its line stays disabled).
user_reviews_with_metadata = user_reviews_with_metadata.assign(
    review_text=lambda frame: frame['review_text'].str.lower()
)
#user_reviews_with_metadata['synopsis'] = user_reviews_with_metadata['synopsis'].str.lower()

# 4) Extract watched date, and whether it was a rewatch or not

In [7]:
def add_rewatched_col(review):
    """Return 1 if the first space-separated word of review.review_date is "Rewatched", else 0.

    `review` is any object exposing a string `review_date` attribute
    (in this notebook, a DataFrame row passed in by `apply(..., axis=1)`).
    """
    first_word, _, _ = review.review_date.partition(" ")
    return int(first_word == "Rewatched")
    
def review_date_to_timestamp(review):
    """Convert review.review_date (e.g. "Watched 01 Sep 2024") into a POSIX timestamp.

    The first space-separated word is discarded; the remaining pieces are joined
    without separators (e.g. "01Sep2024") and parsed with the "%d%b%Y" format.
    Returns the float produced by `Timestamp.timestamp()`.
    """
    _, *date_parts = review.review_date.split(" ")
    compact_date = "".join(date_parts)
    parsed = pd.to_datetime(compact_date, format='%d%b%Y')
    return parsed.timestamp()
        
    
# Build the rewatch flag first: it reads the raw review_date text, which the second
# statement then overwrites with a numeric timestamp.
user_reviews_with_metadata['rewatched'] = user_reviews_with_metadata.apply(
    add_rewatched_col, axis="columns"
)
user_reviews_with_metadata['review_date'] = user_reviews_with_metadata.apply(
    review_date_to_timestamp, axis="columns"
)

# 5) Normalize continuous variables

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled with MinMaxScaler (default settings); review_date is the timestamp computed earlier.
continuous_columns = ['watches', 'likes', 'lists', 'fans', 'movie_year', 'average_rating', 'total_ratings', 'review_date']

# Fit a single scaler on all of these columns and write the scaled values back in place of the originals.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(user_reviews_with_metadata[continuous_columns])
user_reviews_with_metadata[continuous_columns] = scaled_values


# 5.5) Handling rating_distribution

In [9]:
import ast
import pandas as pd

# Expand each rating_distribution list into one column per position:
# rating_dist_0 .. rating_dist_9 (assumes every list has num_ratings entries).
num_ratings = 10
rating_dist_column_names = [f'rating_dist_{i}' for i in range(num_ratings)]
rating_distribution_df = pd.DataFrame(
    user_reviews_with_metadata['rating_distribution'].to_list(),
    columns=rating_dist_column_names,
)

# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows by position.
user_reviews_with_metadata = user_reviews_with_metadata.reset_index(drop=True)
rating_distribution_df = rating_distribution_df.reset_index(drop=True)

# Append the expanded columns to the main frame.
user_reviews_with_metadata = pd.concat(
    [user_reviews_with_metadata, rating_distribution_df],
    axis=1,
)

# The list-valued rating_distribution column is still kept (the drop below is disabled).
#user_reviews_with_metadata = user_reviews_with_metadata.drop(columns=['rating_distribution'])

# Show the first rows of the widened frame.
user_reviews_with_metadata.head()

Unnamed: 0,movie_title,review_text,rating,review_date,movie_year,liked,username,watches,likes,lists,...,rating_dist_0,rating_dist_1,rating_dist_2,rating_dist_3,rating_dist_4,rating_dist_5,rating_dist_6,rating_dist_7,rating_dist_8,rating_dist_9
0,The Bikeriders,michael shannon knocks it out of the park as a...,6,1.0,0.993151,True,kurstboy,0.058091,0.035069,0.117485,...,0.001939,0.006715,0.008016,0.039593,0.07091,0.230681,0.293186,0.256704,0.051762,0.040495
1,The Heartbreak Kid,40 to 50 out of 5 stars,9,0.994638,0.643836,True,kurstboy,0.006749,0.005024,0.039611,...,0.002164,0.004716,0.004328,0.020696,0.026799,0.101648,0.164401,0.334184,0.180935,0.160129
2,Design for Living,lubitsch just knows how to get it done,8,0.993298,0.376712,True,kurstboy,0.006631,0.004821,0.031678,...,0.000935,0.002512,0.003622,0.016766,0.029034,0.12519,0.206099,0.345309,0.145403,0.125131
3,King Kong,monkey mondays #34\nkind of flabbergasted at h...,7,0.991957,0.671233,True,kurstboy,0.009717,0.002641,0.021916,...,0.00738,0.026397,0.041014,0.152562,0.179479,0.310753,0.151615,0.094943,0.014286,0.021572
4,Bobo the Monkey,monkey mondays #33,7,0.982574,0.979452,True,kurstboy,0.002205,0.001267,0.002731,...,0.004106,0.005749,0.007391,0.030933,0.054202,0.210512,0.234875,0.274295,0.050917,0.127019


# 6) One-Hot Encode Categorical Variables

For liked, genres, and username, one-hot encoding is appropriate. However, username might have a high cardinality, so you could consider alternatives like label encoding or entity embeddings if the model complexity allows.

In [10]:
# Preview the frame before the categorical columns are encoded.
user_reviews_with_metadata.head()

Unnamed: 0,movie_title,review_text,rating,review_date,movie_year,liked,username,watches,likes,lists,...,rating_dist_0,rating_dist_1,rating_dist_2,rating_dist_3,rating_dist_4,rating_dist_5,rating_dist_6,rating_dist_7,rating_dist_8,rating_dist_9
0,The Bikeriders,michael shannon knocks it out of the park as a...,6,1.0,0.993151,True,kurstboy,0.058091,0.035069,0.117485,...,0.001939,0.006715,0.008016,0.039593,0.07091,0.230681,0.293186,0.256704,0.051762,0.040495
1,The Heartbreak Kid,40 to 50 out of 5 stars,9,0.994638,0.643836,True,kurstboy,0.006749,0.005024,0.039611,...,0.002164,0.004716,0.004328,0.020696,0.026799,0.101648,0.164401,0.334184,0.180935,0.160129
2,Design for Living,lubitsch just knows how to get it done,8,0.993298,0.376712,True,kurstboy,0.006631,0.004821,0.031678,...,0.000935,0.002512,0.003622,0.016766,0.029034,0.12519,0.206099,0.345309,0.145403,0.125131
3,King Kong,monkey mondays #34\nkind of flabbergasted at h...,7,0.991957,0.671233,True,kurstboy,0.009717,0.002641,0.021916,...,0.00738,0.026397,0.041014,0.152562,0.179479,0.310753,0.151615,0.094943,0.014286,0.021572
4,Bobo the Monkey,monkey mondays #33,7,0.982574,0.979452,True,kurstboy,0.002205,0.001267,0.002731,...,0.004106,0.005749,0.007391,0.030933,0.054202,0.210512,0.234875,0.274295,0.050917,0.127019


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

# Multi-hot encode genres: each row's list of genres becomes one 0/1 indicator per genre label.
mlb = MultiLabelBinarizer()
genres_onehot = mlb.fit_transform(user_reviews_with_metadata['genres'])

# Wrap the indicator matrix in a frame whose columns are the fitted genre labels.
genres_onehot_df = pd.DataFrame(data=genres_onehot, columns=mlb.classes_)

# Append the indicator columns, then remove the original list-valued genres column.
user_reviews_with_metadata = pd.concat(
    [user_reviews_with_metadata, genres_onehot_df],
    axis=1,
).drop(columns=['genres'])

In [12]:
# Preview the frame now that genres has been replaced by per-genre indicator columns.
user_reviews_with_metadata.head()

Unnamed: 0,movie_title,review_text,rating,review_date,movie_year,liked,username,watches,likes,lists,...,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War
0,The Bikeriders,michael shannon knocks it out of the park as a...,6,1.0,0.993151,True,kurstboy,0.058091,0.035069,0.117485,...,0,0,0,0,0,0,0,0,0,0
1,The Heartbreak Kid,40 to 50 out of 5 stars,9,0.994638,0.643836,True,kurstboy,0.006749,0.005024,0.039611,...,0,0,0,0,0,1,0,0,0,0
2,Design for Living,lubitsch just knows how to get it done,8,0.993298,0.376712,True,kurstboy,0.006631,0.004821,0.031678,...,0,0,0,0,0,1,0,0,0,0
3,King Kong,monkey mondays #34\nkind of flabbergasted at h...,7,0.991957,0.671233,True,kurstboy,0.009717,0.002641,0.021916,...,1,0,0,0,0,0,0,0,0,0
4,Bobo the Monkey,monkey mondays #33,7,0.982574,0.979452,True,kurstboy,0.002205,0.001267,0.002731,...,0,0,0,0,0,0,0,0,0,0


# 7) Label encode username and movie_title

In [13]:
from sklearn.preprocessing import LabelEncoder

# Label encode 'username' and 'movie_title' as integer ids.
# Each column gets its own encoder so both fitted mappings (classes_) remain available,
# e.g. for inverse_transform; refitting one shared encoder on movie_title would
# overwrite the mapping learned for username.
username_encoder = LabelEncoder()
user_reviews_with_metadata['username_encoded'] = username_encoder.fit_transform(user_reviews_with_metadata['username'])

movie_title_encoder = LabelEncoder()
user_reviews_with_metadata['movie_title_encoded'] = movie_title_encoder.fit_transform(user_reviews_with_metadata['movie_title'])

# Backward-compatible alias: the previously shared `label_encoder` ended up fitted on movie_title.
label_encoder = movie_title_encoder

# Drop the original string columns now that their encoded versions exist
user_reviews_with_metadata = user_reviews_with_metadata.drop(columns=['username', 'movie_title'])

# 8) Remove non-English reviews

In [14]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l- done
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=cbdb4a57b4145c20d28908ff999041b3a283b98075820fe3f1e900146658390b
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [15]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from tqdm.notebook import tqdm

# Register tqdm with pandas so that progress_apply is available on Series/DataFrames.
tqdm.pandas()

# Fix langdetect's seed so repeated runs give consistent detection results
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code langdetect reports for `text`.

    Falls back to the string "unknown" when langdetect raises
    LangDetectException for the given text.
    """
    try:
        language = detect(text)
    except LangDetectException:
        language = "unknown"
    return language

# Add a language column (one langdetect call per review, with a progress bar)
user_reviews_with_metadata['language'] = user_reviews_with_metadata['review_text'].progress_apply(detect_language)

# Keep only the English reviews and discard the helper column in one chained expression.
# This avoids an in-place drop on a boolean-filtered slice of the frame, which can
# trigger pandas' SettingWithCopyWarning.
user_reviews_with_metadata = (
    user_reviews_with_metadata
    .loc[user_reviews_with_metadata['language'] == 'en']
    .drop(columns=['language'])
)
print(user_reviews_with_metadata.shape)

  0%|          | 0/301 [00:00<?, ?it/s]

(273, 43)


## 9) Save output data

In [16]:
# Column selection/reordering is left disabled for now, so every column is written out.
#reviews_df = reviews_df[['review_text', 'rating']]

# Write the preprocessed frame (without its index) to the Kaggle working directory.
output_csv_path = "/kaggle/working/user_reviews_with_metadata.csv"
user_reviews_with_metadata.to_csv(output_csv_path, index=False)

In [17]:
# Final preview of the processed frame.
user_reviews_with_metadata.head()

Unnamed: 0,review_text,rating,review_date,movie_year,liked,watches,likes,lists,fans,average_rating,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,username_encoded,movie_title_encoded
0,michael shannon knocks it out of the park as a...,6,1.0,0.993151,True,0.058091,0.035069,0.117485,0.002195,0.6,...,0,0,0,0,0,0,0,0,2,201
1,40 to 50 out of 5 stars,9,0.994638,0.643836,True,0.006749,0.005024,0.039611,0.001353,0.765625,...,0,0,0,1,0,0,0,0,2,220
2,lubitsch just knows how to get it done,8,0.993298,0.376712,True,0.006631,0.004821,0.031678,0.001056,0.75625,...,0,0,0,1,0,0,0,0,2,57
3,monkey mondays #34\nkind of flabbergasted at h...,7,0.991957,0.671233,True,0.009717,0.002641,0.021916,7.9e-05,0.45625,...,0,0,0,0,0,0,0,0,2,103
4,monkey mondays #33,7,0.982574,0.979452,True,0.002205,0.001267,0.002731,0.0,0.671875,...,0,0,0,0,0,0,0,0,2,27
