Install the required libraries

In [None]:
!pip install pandas
!pip install nltk
!pip install transformers

Import the required libraries

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK data used later in the notebook: the stop-word lists and the
# Punkt tokenizer data behind nltk.word_tokenize.
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested to work across versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

Main code: load the review and movie datasets, clean them, remove stopwords, and build the per-movie summary

In [None]:
# Read the reviews CSV into a dataframe and preview its first five rows
df_reviews_top_250 = pd.read_csv(filepath_or_buffer="250_top_movies_reviews.csv")
df_reviews_top_250.head()

In [None]:
# Drop column 'Unnamed: 0'
# NOTE(review): presumably a row index saved by an earlier to_csv call - confirm.
# errors='ignore' keeps this cell safe to re-run: without it a second run raises
# KeyError because the column has already been removed.
df_reviews_top_250 = df_reviews_top_250.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Read the movies CSV into a dataframe and preview its first five rows
df_top_250_movies = pd.read_csv(filepath_or_buffer="250_top_movies.csv")
df_top_250_movies.head()

In [None]:
# Drop column 'Unnamed: 0'
# NOTE(review): presumably a row index saved by an earlier to_csv call - confirm.
# errors='ignore' keeps this cell safe to re-run: without it a second run raises
# KeyError because the column has already been removed.
df_top_250_movies = df_top_250_movies.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Trim surrounding whitespace from the movie titles, then print every title
# that appears in the reviews but has no exact match in the movies dataset
df_top_250_movies['Title'] = df_top_250_movies['Title'].str.strip()
print(set(df_reviews_top_250['Movie Title']) - set(df_top_250_movies['Title']))

In [None]:
# Rewrite the titles in df_top_250_movies that carry a trailing ' I' so they
# match the spelling used in df_reviews_top_250
replacement_dict = {
    'Rush I': 'Rush',
    'Joker I': 'Joker',
    'Room I': 'Room',
    'Spotlight I': 'Spotlight',
    'Inside Out I': 'Inside Out',
    'Coco I': 'Coco',
    'The Father I': 'The Father',
}
df_top_250_movies['Title'] = df_top_250_movies['Title'].replace(to_replace=replacement_dict)

In [None]:
# Report, column by column, how many NaN values the reviews dataframe holds.
# Each column prints its name, its NaN count, and a blank separator line.
for column_name in ('Review Title', 'Review Text', 'Rating', 'Movie Title'):
    print(column_name)
    print(df_reviews_top_250[column_name].isna().sum())
    print("")

In [None]:
# Discard, in place, every review that lacks a title, a text, or a rating.
# 'Movie Title' is not part of the check, so a missing movie title alone
# does not remove a row.
df_reviews_top_250.dropna(
    subset=['Review Title', 'Review Text', 'Rating'],
    inplace=True,
)

In [None]:
# Display the reviews dataframe in its current state
df_reviews_top_250

In [None]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a column. Then display the result.
df_reviews_top_250 = df_reviews_top_250.reset_index(drop=True)
df_reviews_top_250

In [None]:
# Verify that every value in the 'Review Title' column is a string
if all(isinstance(title, str) for title in df_reviews_top_250['Review Title']):
    print("'Review Title' column contains only strings")
else:
    print("Non-string object found in 'Review Title' column!")

In [None]:
# Verify that every value in the 'Review Text' column is a string
if all(isinstance(review, str) for review in df_reviews_top_250['Review Text']):
    print("'Review Text' column contains only strings")
else:
    print("Non-string object found in 'Review Text' column!")

In [None]:
# Verify that every value in the 'Movie Title' column is a string
if all(isinstance(movie_title, str) for movie_title in df_reviews_top_250['Movie Title']):
    print("'Movie Title' column contains only strings")
else:
    print("Non-string object found in 'Movie Title' column!")

In [None]:
# Save the cleaned reviews (stopwords still included) to CSV.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column of the file.
df_reviews_top_250.to_csv('IMDB_Reviews_Top_250_preprocessed.csv', index=True)

In [None]:
# Define a function for removing stopwords

# English stop words, loaded on first use and then reused by every call.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with nltk.word_tokenize, tokens whose lowercase form is
    in NLTK's English stop-word list are dropped, and the remaining tokens are
    joined with single spaces. Because the result is rebuilt from tokens, the
    original whitespace is not preserved.

    Args:
        text: The string to filter.

    Returns:
        The kept tokens joined by single spaces ('' when nothing is kept).
    """
    global _ENGLISH_STOP_WORDS
    # Read the stop-word list only once: this function is applied to every
    # row of a dataframe, and reloading the list per call is wasted work.
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    return ' '.join(word for word in words if word.lower() not in _ENGLISH_STOP_WORDS)

In [None]:
# Strip stopwords from both free-text columns (overwriting them), then print
# the resulting dataframe
for text_column in ('Review Title', 'Review Text'):
    df_reviews_top_250[text_column] = df_reviews_top_250[text_column].apply(remove_stopwords)
print(df_reviews_top_250)

In [None]:
# Save the reviews with stopwords removed to CSV.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column of the file.
df_reviews_top_250.to_csv('IMDB_Reviews_Top_250_preprocessed_without_stopwords.csv', index=True)

In [None]:
# Count the reviews per movie; the counts land in a 'Reviews' column
grouped_data_reviews = (
    df_reviews_top_250
    .groupby('Movie Title')
    .size()
    .reset_index(name='Reviews')
)
grouped_data_reviews

In [None]:
# Average the reviewers' ratings per movie; the means land in a 'Rating' column
grouped_data_ratings = (
    df_reviews_top_250
    .groupby('Movie Title')['Rating']
    .mean()
    .reset_index(name='Rating')
)
grouped_data_ratings

In [None]:
# Rename the key column to 'Title', the column name used in df_top_250_movies
grouped_data_reviews = grouped_data_reviews.rename({'Movie Title': 'Title'}, axis='columns')
grouped_data_ratings = grouped_data_ratings.rename({'Movie Title': 'Title'}, axis='columns')

In [None]:
# Build one dataframe with all the info about each movie: join the review
# counts with the mean ratings on 'Title' (default inner join), then outer-join
# the movies dataframe so titles present on only one side are kept.
# NOTE(review): if df_top_250_movies has its own 'Rating' or 'Reviews' column,
# pandas will suffix the clashing names with _x/_y - confirm.
df_info_top_250_full = (
    grouped_data_reviews
    .merge(grouped_data_ratings, on='Title')
    .merge(df_top_250_movies, on='Title', how='outer')
)

In [None]:
# Display the merged per-movie dataframe
df_info_top_250_full

In [None]:
# Save the merged per-movie dataframe to CSV.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column of the file.
df_info_top_250_full.to_csv('df_info_top_250_full.csv', index=True)