# 1- Fundamental libraries

In [95]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import scrapy
import requests
from bs4 import BeautifulSoup
import time
import bs4
import matplotlib
import xformers
import torch
import transformers

In [96]:
# Report the installed version of every core dependency in "name==version"
# form, framed by a 16-character rule above and below.
banner = '-'*16
version_lines = [
    f'numpy=={np.__version__}',
    f'pandas=={pd.__version__}',
    f'scipy=={scipy.__version__}',
    f'scrapy=={scrapy.__version__}',
    f'seaborn=={sns.__version__}',
    f'requests=={requests.__version__}',
    f'bs4=={bs4.__version__}',
    f'matplotlib=={matplotlib.__version__}',
    f'xformers=={xformers.__version__}',
    f'torch=={torch.__version__}',
    f'transformers=={transformers.__version__}',
]

print(banner)
print('*** Versions ***')
print(banner)
print('\n'.join(version_lines))
print(banner)

----------------
*** Versions ***
----------------
numpy==1.24.3
pandas==1.5.3
scipy==1.10.1
scrapy==2.9.0
seaborn==0.12.2
requests==2.29.0
bs4==4.12.2
matplotlib==3.7.1
xformers==0.0.20
torch==2.0.1+cu117
transformers==4.30.2
----------------


# 2- Explore Kaggle Dataset

In [6]:
# Source: https://www.kaggle.com/datasets/joyshil0599/movie-reviews-dataset-10k-scraped-data
# NOTE(review): the preview shows garbled characters in the Rating column
# (e.g. 'Â½'), so the file may not really be latin_1 — confirm the encoding.
kaggle_reviews_path = '../data/external/letterboxd-reviews.csv'
df = pd.read_csv(kaggle_reviews_path, encoding='latin_1')
df.head()

Unnamed: 0,Movie name,Release Year,Rating,Reviewer name,Review date,Review,Comment count,Like count
0,Aftersun (2022),2022,â??â??â??â??Â½,Tuomas,12-Jan-20,This review may contain spoilers.,130,"22,44 6 likes"
1,Joker (2019),2019,â??â??â??â??â??,Joao,20-Dec-22,if youâ??ve never swam in the ocean then of co...,1.8K,"22,032 likes"
2,Puss in Boots: The Last Wish (2022),2022,â??Â½,NicoPico,15-Sep-22,Puss in Boots: Into the Pussy-Verse,6 2,"21, 6 6 6 likes"
3,The Banshees of Inisherin (2022),2022,â??â??â??â??â??,Ella Kemp,8-Apr-22,I will NOT leave my donkey outside when Iâ??m sad,,"21, 6 09 likes"
4,Everything Everywhere All at Once (2022),2022,â??â??Â½,CosmonautMarkie,14-Aug-19,Watch it and have fun before film Twitter tell...,355,"20, 6 88 likes"


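**Poor data quality (for this application only): the star ratings are mis-encoded and the like counts contain stray spaces, so the reviews are scraped directly instead.**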

# 3- Web Scraping

In [8]:
# TODO: Put this in config files
# First page of the Letterboxd list that is scraped for film links.
TOP_250_ENDPOINT = 'https://letterboxd.com/dave/list/official-top-250-narrative-feature-films/'
# Every list page to visit: the bare list URL, followed by its 'page/2/' and 'page/3/' variants.
ALL_250_ENDOPOINTS = [TOP_250_ENDPOINT] + [f'{TOP_250_ENDPOINT}page/{page_number}/' for page_number in (2, 3)]

## 3.1 Functions

In [9]:
import requests
from bs4 import BeautifulSoup

def get_movies(endpoint, timeout=30):
    """
    Get a list of movie links from the specified endpoint.

    Parameters:
        endpoint (str): The URL endpoint to scrape movie links from.
        timeout (float, optional): Seconds to wait for the server before
            giving up on the request. Default is 30.

    Returns:
        list: A list of absolute movie links (URLs), in the order they
              appear on the page. List entries without a poster link are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.RequestException: If the request fails or times out.
    """
    # Local import: keeps this cell self-contained without touching the imports cell.
    from urllib.parse import urljoin

    BASE_ENDPOINT = 'https://letterboxd.com/'

    # Send a GET request to the endpoint; fail loudly on an HTTP error rather
    # than parsing an error page and silently returning an empty list.
    response = requests.get(endpoint, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    all_movies = []
    for element in soup.find_all(class_='poster-container numbered-list-item'):
        # Extract the 'data-target-link' attribute from the 'poster' div
        poster = element.find('div', class_='poster')
        site = poster.get('data-target-link') if poster is not None else None
        if not site:
            # Entry without a usable link: skip it instead of crashing.
            continue
        # urljoin avoids the double slash that plain concatenation produces
        # when the link already starts with '/'.
        all_movies.append(urljoin(BASE_ENDPOINT, site))

    return all_movies

In [22]:
def get_reviews(endpoint, max_reviews=20, timeout=30):
    """
    Scrape review data for a movie from the specified endpoint.

    Parameters:
        endpoint (str): The URL endpoint (film page, ending in '/') to scrape review data from.
        max_reviews (int, optional): The maximum number of reviews to scrape. Default is 20.
        timeout (float, optional): Seconds to wait for each HTTP request. Default is 30.

    Returns:
        dict: A dictionary containing movie review data with keys 'NAME', 'YEAR', 'DIRECTOR', 'SYNOPSYS',
              'RATINGS', and 'TEXT'. 'RATINGS' and 'TEXT' are parallel lists; they hold fewer than
              max_reviews entries when the film runs out of qualifying reviews.

    Raises:
        requests.HTTPError: If the film page answers with an error status code.
        requests.RequestException: If a request fails or times out.
    """

    response = requests.get(endpoint, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract relevant movie information from the webpage
    film_name = soup.find(class_='headline-1 js-widont prettify').text
    film_year = soup.find(class_='number').text
    film_director = soup.find('span', class_='prettify').text
    film_synopsys = soup.find(class_='truncate').text

    rating_review = []
    text_review = []

    page = 1  # Start with the first page of reviews

    # Loop until the desired number of reviews is reached or the reviews run out
    while len(text_review) < max_reviews:
        response = requests.get(f'{endpoint}reviews/by/activity/page/{page}/', timeout=timeout)
        if not response.ok:
            # No such page (or server error): treat it as the end of the reviews.
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all review elements on the page
        reviews = soup.find_all(class_='film-detail-content')
        if not reviews:
            # An empty page means there is nothing left to fetch; without this
            # check the loop would request further pages forever.
            break

        # Iterate over each review element
        for review in reviews:
            body = review.find(class_='-prose')
            if body is None:
                # Review block without a text body: nothing to keep.
                continue
            text = body.text
            stars = review.find(class_='rating')

            # Keep only reviews that are long enough and not truncated
            # (a truncated text has '…' in its 3rd or 4th character from the end).
            if len(text) <= 5 or text[-3] == '…' or text[-4] == '…':
                continue

            text_review.append(text+'***') # '***' is a separator

            # Use the rating when available, otherwise the ' None ' placeholder
            rating_review.append(' None ' if stars is None else stars.text)

            if len(text_review) == max_reviews:
                break

        # Move to the next reviews page
        page += 1

    # Create a dictionary with the movie review data
    data = {
        'NAME': film_name,
        'YEAR': film_year,
        'DIRECTOR': film_director,
        'SYNOPSYS': film_synopsys,
        'RATINGS': rating_review,
        'TEXT': text_review
    }

    return data

## 3.2 Get movie URLs from the Letterboxd list

In [23]:
# Time the crawl of the list pages.
start = time.time()

# Accumulate the film links found on each page of the list, in page order.
all_movies = []
for endpoint in ALL_250_ENDOPOINTS:
    all_movies.extend(get_movies(endpoint))

end = time.time()
total_time = round(end - start, 2)
print(f'Sucess!!!\nExcecution time: {total_time} seconds')

Sucess!!!
Excecution time: 1.01 seconds


## 3.3 Get Metadata and short reviews for every movie

In [25]:
start = time.time()

# Output column -> key of the dict returned by get_reviews() that feeds it.
column_sources = {
    'NAME': 'NAME',
    'YEAR': 'YEAR',
    'DIRECTOR': 'DIRECTOR',
    'SYNOPSYS': 'SYNOPSYS',
    'RATINGS': 'RATINGS',
    'REVIEWS': 'TEXT',
}

# One (initially empty) list per output column.
data = {column: [] for column in column_sources}

# Scrape each film and append its values to the matching column lists.
for movie in all_movies:
    row = get_reviews(movie, max_reviews=20)
    for column, source_key in column_sources.items():
        data[column].append(row[source_key])

# One row per film.
df = pd.DataFrame(data=data)

end = time.time()
total_time = round(end - start, 2)
print(f'Sucess!!!\nExcecution time: {total_time} seconds')

Sucess!!!
Excecution time: 519.93 seconds


## 3.4 Save raw dataset

In [27]:
# Persist the scraped frame as the raw dataset; index=False keeps the row index out of the CSV.
df.to_csv('../data/raw/LetterboxdTop250.csv',index=False)

# 4- Clean and transform raw data

In [33]:
# Reload the raw dataset from disk so cleaning can start without re-scraping.
# The list-valued columns (RATINGS, REVIEWS) come back as plain strings and
# are parsed further down with .str operations.
df = pd.read_csv('../data/raw/LetterboxdTop250.csv')
df.head()

Unnamed: 0,NAME,YEAR,DIRECTOR,SYNOPSYS,RATINGS,REVIEWS
0,Harakiri,1962,Masaki Kobayashi,\nDown-on-his-luck veteran Tsugumo Hanshirō en...,"[' ★★★★★ ', ' ★★★★½ ', ' ★★★★★ ', ' ★★★★★ ', '...",[' honor in the individual is virtue honor in ...
1,Come and See,1985,Elem Klimov,\nThe invasion of a village in Byelorussia by ...,"[' ★★★★★ ', ' ★★★★★ ', ' ★★★★½ ', ' ★★★★★ ', '...",[' as soon as this film ended i went online an...
2,12 Angry Men,1957,Sidney Lumet,\nThe defense and the prosecution have rested ...,"[' ★★★★★ ', ' ★★★★★ ', ' ★★★★ ', ' ★★★★½ ', ' ...","["" That was the best 1.5 hours of middle aged ..."
3,Seven Samurai,1954,Akira Kurosawa,\nA samurai answers a village's request for pr...,"[' ★★★★★ ', ' ★★★★★ ', ' ★★★★½ ', ' ★★★★★ ', '...","[' too many sweaty ass cheeks, 5 stars ***', '..."
4,The Godfather: Part II,1974,Francis Ford Coppola,\nIn the continuing saga of the Corleone crime...,"[' ★★★★★ ', ' ★★★★★ ', ' ★★★★★ ', ' None ', ' ...","["" young, totally fuckable al pacino and rober..."


## 4.1- Save the dataset for the Kaggle contribution

In [34]:
# Trim the whitespace wrapping each scraped synopsis (the raw text starts with '\n').
# str.strip() replaces the fixed [1:-1] slice so that re-running this cell is
# idempotent and can never cut real characters off the synopsis.
df['SYNOPSYS'] = df['SYNOPSYS'].str.strip()
# index=False keeps the positional row index out of the published CSV,
# consistent with how the raw dataset is written.
df.to_csv('../data/interim/LetterboxdTop250-5000reviews.csv', index=False)

## 4.2- Format for preprocessing

In [35]:
# RATINGS holds the text of a list: drop the first and last character (the
# brackets), then cut on ', ' to get one entry per review. Each entry keeps
# its own quotes and padding, e.g. "' ★★★★★ '".
ratings_without_brackets = df['RATINGS'].str[1:-1]
df['RATINGS'] = ratings_without_brackets.str.split(', ')
print(f'RATINGS format: {type(df["RATINGS"][0])}')

RATINGS format: <class 'list'>


In [36]:
# REVIEWS holds the text of a list whose items each end with the '***'
# separator added while scraping: drop the brackets, then cut on that separator.
# The separator is matched literally (regex=False); the previous '\*\*\*'
# pattern relied on an invalid string escape and on implicit regex mode.
df['REVIEWS'] = df['REVIEWS'].str[1:-1].str.split('***', regex=False)
print(f'REVIEWS format: {type(df["REVIEWS"][0])}')

REVIEWS format: <class 'list'>


## 4.3- Prepare for NLP

In [42]:
# Dictionary for a convenient workflow: one nested dict per column, keyed by film name.
data_dict = df[['NAME','RATINGS','REVIEWS']].set_index('NAME').to_dict()

# Structure:
# {'RATINGS': {NAME: stars},
#  'REVIEWS': {NAME: comments}}
#
# 'RATINGS' / 'REVIEWS': primary key (the column)
# NAME: secondary key (the film name)
# stars: list of raw rating strings, one per review
# comments: list of raw review fragments
# NOTE(review): films sharing the same NAME would presumably collapse into a
# single entry here — confirm that names are unique.

## 4.4 Cleaning

In [46]:
# Substrings stripped from every review fragment, applied in exactly this order.
unwanted_substrings = ("',", "' ", "'", 'AAAA')

rev_dict = {}
for key, value in data_dict['REVIEWS'].items():
    clean_rev = []

    for review in value:
        # Remove the unwanted substrings one after another
        string = review
        for unwanted in unwanted_substrings:
            string = string.replace(unwanted, "")
        clean_rev.append(string)

    # The final fragment is whatever follows the last '***' separator, not a
    # review, so it is left out.
    rev_dict[key] = clean_rev[:-1]

# Swap the cleaned texts in for the raw fragments
data_dict['REVIEWS'] = rev_dict

print('Clean review example:')
data_dict['REVIEWS']['Harakiri'][6]

Clean review example:


' wait can we normalize calling something you’ve only seen once, your favorite movie of all time? '

## 4.5- Transformation

In [50]:
# Show one rating before conversion; note the quotes and spaces around the stars.
print(f'Raw rating format example: {data_dict["RATINGS"]["Harakiri"][6]}')

Raw rating format example: ' ★★★★★ '


In [45]:
def get_num_rating(raw_string):
    """
    Extract the numerical rating from a raw rating string.

    The stars are counted wherever they appear in the string, so the result
    does not depend on how the rating is wrapped: "' ★★★★½ '", ' ★★★★½ ' and
    '★★★★½' all give 4.5.

    Parameters:
        raw_string (str): A string representing the raw rating data, which may contain '★' for full
            stars and '½' for a half star, optionally surrounded by quotes and/or spaces.

    Returns:
        float: The numerical rating extracted from raw_string. A string without any star
            (e.g. the "' None '" placeholder or an empty string) returns 0.0.
    """

    # Counting over the whole string replaces the former fixed [2:-2] slice,
    # which silently dropped stars whenever the input lacked the exact
    # quote-plus-space wrapping.
    full_stars = raw_string.count('★')
    half_stars = raw_string.count('½')

    return float(full_stars + 0.5 * half_stars)

In [51]:
# Convert every raw rating string of every film with get_num_rating(),
# keeping the per-film order of the ratings.
num_dict = {
    key: [get_num_rating(stars) for stars in value]
    for key, value in data_dict['RATINGS'].items()
}

# Swap the numerical ratings in for the raw strings
data_dict['RATINGS'] = num_dict

print(f'Transformed rating format example: {data_dict["RATINGS"]["Harakiri"][6]}')

Transformed rating format example: 5.0


# 5- Natural language processing

## 5.1 Useful functions

In [53]:
def get_labels(scores):
    """
    Attach a human-readable label to each score produced by the model.

    Parameters:
        scores (numpy.ndarray): Array with one score per class, ordered by class index.

    Returns:
        dict: Label ('Negative', 'Neutral', 'Positive') -> score found at the
              matching class index of `scores`.
    """

    # Class index -> label, as documented for the model:
    # https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
    index_to_label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

    n_classes = scores.shape[0]
    return {index_to_label[idx]: scores[idx] for idx in range(n_classes)}

## 5.2 CPU version

In [54]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax

# Identifier of the pretrained sentiment checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment'

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model; get_labels() maps its output indices to label names.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
#model.save_pretrained(MODEL_NAME)

The input data still needs a proper validation function. That will be written later, once the model is in production; for now, inputs the model cannot process are caught with a `try`/`except` block.

In [59]:
# Score every cleaned review of every film with the sentiment model.
# Result: {film name: [labels, ...]}, where each `labels` is either the dict
# returned by get_labels() or the string 'Wrong input' for a review the
# tokenizer/model could not process.
sentiment_dict = {}
for key, values in data_dict['REVIEWS'].items():

    sentiment_list = []
    for review in values:
        try:
            # Tokenize the review text and convert it to PyTorch tensors.
            # Done inside the try so a malformed input is handled like any
            # other rejected review.
            encoded_input = tokenizer(review, return_tensors='pt')
            # Inference only: no_grad avoids building the autograd graph.
            with torch.no_grad():
                output = model(**encoded_input)
        except Exception:
            # `except Exception` (not a bare except) so that Ctrl-C and
            # SystemExit still stop the run.
            labels = 'Wrong input'
            print(review)
        else:
            # Extract the model's output scores, apply softmax, and get human-readable labels
            scores = softmax(output[0][0].detach().numpy())
            labels = get_labels(scores)

        # Appended after the try block rather than in `finally`, so an
        # uncaught error can never store a stale `labels` from the previous review.
        sentiment_list.append(labels)

    # Assign the sentiment_list to the key in sentiment_dict
    sentiment_dict[key] = sentiment_list

 💼 • 🕰 • 🗄 • 📰 • 🔑 • 🚪 • 🍸 • 🏸 • 🥂{𝙵𝚛𝚊𝚗} 𝚆𝚑𝚢 𝚍𝚘 𝚙𝚎𝚘𝚙𝚕𝚎 𝚑𝚊𝚟𝚎 𝚝𝚘 𝚕𝚘𝚟𝚎 𝚙𝚎𝚘𝚙𝚕𝚎 𝚊𝚗𝚢𝚠𝚊𝚢?𝕄𝕆𝕍𝕀𝔼-𝕎𝕀𝕊𝔼, 𝕋ℍ𝔼ℝ𝔼 ℍ𝔸𝕊 ℕ𝔼𝕍𝔼ℝ 𝔹𝔼𝔼ℕ 𝔸ℕ𝕐𝕋ℍ𝕀ℕ𝔾 𝕃𝕀𝕂𝔼 𝕀𝕋 - 𝕃𝕆𝕍𝔼-𝕎𝕀𝕊𝔼, 𝕃𝔸𝕌𝔾ℍ-𝕎𝕀𝕊𝔼 𝕆ℝ 𝕆𝕋ℍ𝔼ℝ-𝕎𝕀𝕊𝔼!𝐓𝐡𝐢𝐬 𝐢𝐬 𝐨𝐧𝐞 𝐨𝐟 𝐭𝐡𝐨𝐬𝐞 𝐜𝐥𝐚𝐬𝐬𝐢𝐜 𝐦𝐨𝐯𝐢𝐞𝐬 𝐭𝐡𝐚𝐭 𝐢𝐬 𝐭𝐢𝐦𝐞𝐥𝐞𝐬𝐬. 𝐖𝐢𝐭𝐭𝐲 𝐝𝐢𝐚𝐥𝐨𝐠𝐮𝐞 𝐚𝐧𝐝 𝐚 𝐬𝐢𝐦𝐩𝐥𝐞 𝐲𝐞𝐭 𝐞𝐟𝐟𝐞𝐜𝐭𝐢𝐯𝐞 𝐧𝐚𝐫𝐫𝐚𝐭𝐢𝐯𝐞 𝐦𝐚𝐤𝐞 𝐟𝐨𝐫 𝐪𝐮𝐢𝐭𝐞 𝐚𝐧 𝐢𝐧𝐭𝐞𝐫𝐞𝐬𝐭𝐢𝐧𝐠 𝐰𝐚𝐭𝐜𝐡, 𝐡𝐨𝐰𝐞𝐯𝐞𝐫, 𝐈 𝐟𝐞𝐞𝐥 𝐢𝐭 𝐥𝐨𝐬𝐭 𝐬𝐨𝐦𝐞 𝐨𝐟 𝐢𝐭𝐬 𝐬𝐩𝐚𝐫𝐤 𝐛𝐲 𝐭𝐡𝐞 𝐡𝐚𝐥𝐟𝐰𝐚𝐲 𝐦𝐚𝐫𝐤. 𝐓𝐡𝐞 𝐭𝐨𝐧𝐞𝐬 𝐬𝐡𝐢𝐟𝐭𝐞𝐝 𝐟𝐫𝐨𝐦 𝐚 𝐥𝐢𝐠𝐡𝐭𝐡𝐞𝐚𝐫𝐭𝐞𝐝 𝐫𝐨𝐦-𝐜𝐨𝐦 𝐭𝐨 𝐚 𝐦𝐨𝐫𝐞 𝐩𝐫𝐞𝐝𝐢𝐜𝐭𝐚𝐛𝐥𝐞 𝐝𝐫𝐚𝐠𝐠𝐢𝐧𝐠 𝐝𝐫𝐚𝐦𝐚. 𝐍𝐨𝐧𝐞𝐭𝐡𝐞𝐥𝐞𝐬𝐬, 𝐈 𝐞𝐧𝐣𝐨𝐲𝐞𝐝 𝐭𝐡𝐞 𝐟𝐢𝐫𝐬𝐭 𝐡𝐚𝐥𝐟 𝐞𝐧𝐨𝐮𝐠𝐡 𝐭𝐨 𝐨𝐯𝐞𝐫𝐥𝐨𝐨𝐤 𝐦𝐲 𝐬𝐥𝐢𝐠𝐡𝐭 𝐝𝐢𝐬𝐚𝐩𝐩𝐨𝐢𝐧𝐭𝐦𝐞𝐧𝐭 𝐛𝐲 𝐭𝐡𝐞 𝐞𝐧𝐝. 
", بشاعريته الفريدة والأخاذة ممزوجة بموسيقى رائعة جدا.. صنع لنا ثيو أنجيلوبولوس تحفته هاته وعالج فيها بمنتهى الجمالية هواجس وذكريات رجل يدنو من الفناء.. يقترب من الارتحال إلى عالم آخر وهو في قمة الأسى والندم على عمر مديد لن يعود." رجل يحتضر، يومه الأخير. كيف تقضي يومك الأخير؟ ما الذي يمكن أن يحدث لنا؟ ماذا سنفعل بالساعات المتبقية لنا؟ هل تتأمل الحياة التي عشتها، أم أنك تسمح لنفسك بأن تنساق، تنكشف أمام كل المصادفات: تتعقب شخصا ما، تفتح نافذة،

In [66]:
# Inspect the scores computed for the first review of one film.
sentiment_dict['12 Angry Men'][0]

{'Negative': 0.16631296, 'Neutral': 0.32354698, 'Positive': 0.5101401}

### 5.2.1 Save results

In [60]:
import pickle

# Serialize the per-film sentiment results to the processed-data folder.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open('../data/processed/sentiment_analysis.pkl', 'wb') as file:
    pickle.dump(sentiment_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

## 5.3 GPU version
Future work