## 1) Read in data

In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Read the two scraped review tables; the first CSV column becomes the index.
reviews_from_movies_df = pd.read_csv(
    "/kaggle/input/movie-scraper/movie_reviews.csv",
    index_col=0,
)
reviews_from_users_df = pd.read_csv(
    "/kaggle/input/user-scraper/user_reviews.csv",
    index_col=0,
)

# Stack both sources into one frame, renumbering the rows from 0.
reviews_df = pd.concat(
    [reviews_from_movies_df, reviews_from_users_df],
    axis=0,
    ignore_index=True,
)

# Sanity check: combined shape first, then each source's shape.
print(reviews_df.shape, reviews_from_movies_df.shape, reviews_from_users_df.shape)
reviews_df.head()

(98834, 7) (50000, 7) (48834, 7)


Unnamed: 0,user_url,review_text,rating,review_date,movie_title,movie_year,movie_url
0,https://letterboxd.com/aykins/,feminism displayed at its worst.,0.4,28 Aug 2024,Barbie,2023.0,https://letterboxd.com/film/barbie/
1,https://letterboxd.com/maiimaoo/,Ryan Gosling... Get in my bed RN please.\nThis...,1.0,21 Jul 2023,Barbie,2023.0,https://letterboxd.com/film/barbie/
2,https://letterboxd.com/ambres29/,#decue avec tout ce que j'avais vu dessus j'pe...,0.4,28 Aug 2024,Barbie,2023.0,https://letterboxd.com/film/barbie/
3,https://letterboxd.com/whosaymaree/,im sorry to all the barbies hair that i’ve cut...,1.0,28 Aug 2024,Barbie,2023.0,https://letterboxd.com/film/barbie/
4,https://letterboxd.com/ronzcarm/,I'm just Ken,0.8,28 Aug 2024,Barbie,2023.0,https://letterboxd.com/film/barbie/


## 2) Preprocess dataset (mainly remove non-English reviews for now)

In [2]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l- done
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=6115f1e5e50b878a63ca937ac3c811f8a897ad7c8235444b5f4757d78aa9ed8e
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from tqdm.notebook import tqdm

# Fix langdetect's seed so repeated runs produce the same detections.
DetectorFactory.seed = 0

# Enable the tqdm-backed `progress_apply` method on pandas objects.
tqdm.pandas()

# Function to detect language
def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Parameters
    ----------
    text : object
        The review text. Expected to be a string; any other value (for
        example the float NaN pandas uses for an empty cell) is treated as
        undetectable instead of being passed to ``detect``.

    Returns
    -------
    str
        The value returned by ``detect(text)``, or ``"unknown"`` when the
        input is not a string, is empty/whitespace-only, or langdetect raises
        ``LangDetectException``.
    """
    # detect() expects a str. A non-string value would raise an error that the
    # LangDetectException handler below does not catch, aborting the whole
    # apply over the column. Blank strings are short-circuited as well.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Detect the language of every review (one langdetect call per row, with a
# tqdm progress bar). The result is kept in a standalone Series rather than a
# temporary column, so nothing has to be dropped afterwards.
review_languages = reviews_df['review_text'].progress_apply(detect_language)

# Keep only the reviews detected as English. Boolean indexing returns a new
# frame, which avoids the in-place drop on a filtered slice that can trigger
# pandas' SettingWithCopyWarning. Row labels of the kept rows are unchanged.
reviews_df = reviews_df[review_languages == 'en']

  0%|          | 0/98834 [00:00<?, ?it/s]

## 3) Save output data

In [4]:
# Keep just the review text and its rating, in that order (simplified for now).
reviews_df = reviews_df.loc[:, ['review_text', 'rating']]

# Checkpoint the processed reviews to disk; the row index is not written.
reviews_df.to_csv(
    "/kaggle/working/processed_reviews.csv",
    index=False,
)

In [5]:
# Preview the first five rows of the frame that was just saved.
reviews_df.head(n=5)

Unnamed: 0,review_text,rating
0,feminism displayed at its worst.,0.4
1,Ryan Gosling... Get in my bed RN please.\nThis...,1.0
3,im sorry to all the barbies hair that i’ve cut...,1.0
7,Micheal cera.,0.7
8,"Amazing, ending part made me cry 😢",0.8
