# APAN5430 NLP Term Project: IMDb Movie Reviews Summary System

## Section 2 Building Flask

### 1.1 Data Collection

#### 1.1.1 Load Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
file_path = '/content/drive/My Drive/NLP Final Project/'

In [6]:
import pandas as pd

# Load the merged dataset
data = pd.read_csv(file_path + 'merged.csv')

# View the first 10 rows of data
data.head(10)

Unnamed: 0,username,rating,helpful,total,date,title,review,Movie_Name
0,diac228,9,78,83,11 July 2008,With enough energy to generate an entire city...,"The 90s was home to a slew of great raw, uncut...",The Rock 1996
1,F0RCE,10,287,342,21 March 2001,Definitely underrated!\n,"""The Rock"" combined action, adventure, comedy,...",The Rock 1996
2,mstomaso,8,73,89,27 March 2005,Adrenalized..... yikes\n,The Rock is one of my all-time favorite suspen...,The Rock 1996
3,charlie-303,10,140,184,31 January 2005,An absolute classic\n,What can I say other than Connery at his best ...,The Rock 1996
4,clanger1977,9,114,150,1 May 2004,Review\n,The Rock was an awesome film to see in the cin...,The Rock 1996
5,Dandaman6924,10,117,155,5 November 2004,Classic Action Movie\n,This movie is all a man could hope for in a mo...,The Rock 1996
6,Movie-12,8,72,104,20 March 2000,"An action packed, high octane thriller with g...",THE ROCK / (1996) ***1/2<br/><br/>Starring: Se...,The Rock 1996
7,ajohnwoofilm,Null,11,13,7 December 2001,Michael Bay's masterpiece\n,Although i have seen many of Bay's films and e...,The Rock 1996
8,RichT3,9,5,5,29 November 2001,One of the BEST action movies ever made\n,This was the first Michael Bay movie I had the...,The Rock 1996
9,popsucksrapswallows,Null,75,115,5 July 2004,"Too watchable, only the music gets tiresome\n",Fantastically made. Undoubtedly one of the bes...,The Rock 1996


In [7]:
# change the datatype
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
data['date'] = pd.to_datetime(data['date'], errors='coerce')

In [8]:
# Fill missing values in the 'rating' column with a default value (e.g., 0)
data['rating'].fillna(0, inplace=True)

# Drop rows where 'rating' is equal to 0
data = data[data['rating'] != 0]

# Check for missing values again
print(data.isnull().sum())

username      0
rating        0
helpful       0
total         0
date          0
title         0
review        0
Movie_Name    0
dtype: int64


In [9]:
# Reset the index of the DataFrame
data.reset_index(drop=True, inplace=True)

### 1.2 Selecting Top 100 Movies (Based on IMDb's Formula)

$$
\text{Weighted Rating (WR)} = \left( \frac{v}{v + m} \right) \times R + \left( \frac{m}{v + m} \right) \times C
$$

Where:
- \( v \) = number of votes for the movie (i.e., the number of reviews)
- \( m \) = minimum votes required to be listed in the Top 250 (or any other list)
- \( R \) = average rating of the movie
- \( C \) = the mean vote across the whole report (or the mean rating of all movies)


In [10]:
# Assuming your DataFrame is named df
# Calculate C (the mean rating across all movies)
C = data['rating'].mean()

# Set the minimum number of reviews required to be in the list
m = 50  # You can adjust this number

# Group by Movie_Name and calculate average rating and total number of reviews
movie_stats = data.groupby('Movie_Name').agg(
    avg_rating=('rating', 'mean'),
    total_reviews=('review', 'count')
)

# Function to compute weighted rating for each movie
def weighted_rating(x, m=m, C=C):
    v = x['total_reviews']  # total number of reviews
    R = x['avg_rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

# Apply the weighted rating formula
movie_stats['weighted_rating'] = movie_stats.apply(weighted_rating, axis=1)

# Filter out movies that have fewer than m reviews
filtered_movies = movie_stats[movie_stats['total_reviews'] >= m]

# Sort movies based on weighted rating
sorted_movies = filtered_movies.sort_values('weighted_rating', ascending=False)

# Select the top 100 movies
top_100_movies = sorted_movies.head(100)

In [11]:
# Filter the original dataframe to keep reviews of the top 100 movies
top_100 = data[data['Movie_Name'].isin(top_100_movies.index)]
print(top_100.shape)
top_100.head()

(117857, 8)


Unnamed: 0,username,rating,helpful,total,date,title,review,Movie_Name
964,g_cotterell,10.0,822,898,2019-11-16,Puts my faith in the movie business again\n,This is what cinema is supposed to be! Amazing...,Ford v Ferrari 2019
965,nickfarf-42699,10.0,518,590,2019-11-15,As close as possible to the truth but!\n,I was at the Philadelphia Film center premiere...,Ford v Ferrari 2019
966,jordanucsd,8.0,47,50,2020-02-14,"""I had no idea. If only my dad were alive to ...","I've seen a lot of car movies, as a self-profe...",Ford v Ferrari 2019
967,ymyuseda,10.0,323,382,2019-11-20,Oscar Winning Performance\n,Rating 10/10\nInspired movie of the year. It i...,Ford v Ferrari 2019
968,Kikisaurus,8.0,62,72,2020-02-12,"I know nothing of cars or racing, nor do I ca...","To start off, I have always hated racing. Yes,...",Ford v Ferrari 2019


### 1.3 Text Normalization

In [12]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download the necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters but keeping ,/./!/?
    text = re.sub(r'[^\w\s,\.!?]', '', text)
    # Tokenization
    tokens = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join words back to string
    text = ' '.join(tokens)
    return text

# Apply preprocessing to the review column
top_100['cleaned_review'] = top_100['review'].apply(preprocess_text)

# Display the first few rows of the processed data
top_100[['review', 'cleaned_review']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100['cleaned_review'] = top_100['review'].apply(preprocess_text)


Unnamed: 0,review,cleaned_review
964,This is what cinema is supposed to be! Amazing...,cinema supposed be! amazing chemistry acting l...
965,I was at the Philadelphia Film center premiere...,philadelphia film center premiere 111119 view ...
966,"I've seen a lot of car movies, as a self-profe...","ive seen lot car movies, selfprofessed car nut..."
967,Rating 10/10\nInspired movie of the year. It i...,rating 1010 inspired movie year. wonderfully m...
968,"To start off, I have always hated racing. Yes,...","start off, always hated racing. yes, hated. th..."


In [125]:
top_100['review'] = top_100['review'].astype(str)
top_100['cleaned_review'] = top_100['cleaned_review'].astype(str)

In [126]:
# Specify the file path and name for the new CSV file

# Save the DataFrame to a CSV file
top_100.to_csv(file_path + 'top_100.csv', index=False)

## 2.1 Text Summarization

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [85]:
import pandas as pd
file_path = '/content/drive/My Drive/NLP Final Project/'

df = pd.read_csv(file_path + 'top_100.csv')

In [9]:
# Sort the DataFrame based on 'Movie_Name' and 'helpful' votes
df_sorted = df.sort_values(by=['Movie_Name', 'helpful'], ascending=[True, False])

# Group by 'Movie_Name' and take the top 100 reviews for each movie
top_reviews = df_sorted.groupby('Movie_Name').head(100)

# Concatenate the top 100 reviews for each movie
#df_grouped = top_reviews.groupby('Movie_Name')['cleaned_review'].apply(' '.join).reset_index()
df_grouped = top_reviews.groupby('Movie_Name')['review'].apply(' '.join).reset_index()

In [2]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.3 MB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m1.0/1.3 MB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [18]:
import sentencepiece
from transformers import BartTokenizer, BartForConditionalGeneration

# Initialize the BART-large-cnn tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

def summarize_text(text):
    # Encode the text to tensor
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the summary
    summary_ids = model.generate(inputs['input_ids'], max_length=200, min_length=80, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Assuming your DataFrame is named df_grouped and it has a column 'cleaned_review'
# Apply text summarization to each movie's reviews
df_grouped['summary'] = df_grouped['review'].apply(summarize_text)

# The resulting DataFrame df_grouped will have two columns: 'Movie_Name' and 'summary'



In [19]:
df_grouped

Unnamed: 0,Movie_Name,review,summary
0,12 Angry Men 1957,An excellent courtroom drama with a unique twi...,The film is an excellent courtroom drama with ...
1,A Walk to Remember 2002,I'm a 17 year old male teenager who happened t...,AWTR contains a heartfelt conclusion to a stor...
2,Alien 1979,"Back in early 20th century, Lumière brothers d...",'Alien' is the only perfect movie in the histo...
3,Aliens 1986,THE sci-fi movie. It's nearly perfect in every...,The adrenaline pumps like in no other film and...
4,All Quiet on the Western Front 1930,Erich Maria Remarque's novel and the film made...,Erich Maria Remarque's novel and the film made...
...,...,...,...
95,Toy Story 1995,I am a big fan of the animated movies coming f...,Tom Hanks and Tim Allen provided excellent voi...
96,Toy Story 2 1999,I just saw Toy Story 2 an hour ago and I must ...,The first Toy Story was largely confined to li...
97,Toy Story 3 2010,The best magic tricks in the world are ones th...,"""Toy Story 3"" gets 10 of 10 blazing stars. The..."
98,Uri: The Surgical Strike 2019,If I have to say in one word then it'd be MAST...,Director Aditya Dhar has done an amazing job w...


In [20]:
df_grouped.summary[0]

'The film is an excellent courtroom drama with a unique twist. Instead of following the trial itself, the viewer has a unique chance to observe the events behind the closed doors of a jury room. The character of each of the jurors emerges through a wonderful mix of perfect casting, excellent dialogue and near-flawless acting. The plot of the film is excellent and it is fascinating to see what little things can influence which way a verdict goes.'

## 2.2 Obtaining Additional Information about the Movies

In [73]:
import pandas as pd
import glob


path = '/content/drive/My Drive/NLP Final Project/1_movies_per_genre/'
all_files = glob.glob(path + "/*.csv")

# List to hold all dataframes
dfs = []

# Loop through all files and read them into a dataframe
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    dfs.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dfs, axis=0, ignore_index=True)

In [74]:
combined_df['Movie_Name'] = combined_df['name'] + ' ' + combined_df['year'].astype(str)
combined_df = combined_df.rename(columns={'rating': 'imdb_average_rating'})
combined_df = combined_df.drop(columns = ['name', 'year'])
combined_df = combined_df.drop_duplicates(subset=['Movie_Name', 'movie_rated', 'run_length', 'genres', 'release_date', 'imdb_average_rating'])

In [75]:
# Specify the columns you want to append
columns_to_append = ['movie_rated', 'run_length', 'genres', 'release_date', 'imdb_average_rating', 'num_raters', 'num_reviews', 'review_url']

# Merge the DataFrames on 'Movie_Name'
df_merged = pd.merge(df_grouped, combined_df[columns_to_append + ['Movie_Name']], on='Movie_Name', how='left')

In [81]:
# Sort by 'Movie_Name' and length of 'genres' string in descending order
df_merged = df_merged.sort_values(by=['Movie_Name', 'genres'], key=lambda col: col.str.len(), ascending=False)

# Drop duplicates keeping the first (which is the one with the longest 'genres' string)
df_merged = df_merged.drop_duplicates(subset='Movie_Name', keep='first')

In [84]:
df_merged

Unnamed: 0,Movie_Name,review,summary,movie_rated,run_length,genres,release_date,imdb_average_rating,num_raters,num_reviews,review_url
25,Dr. Strangelove or: How I Learned to Stop Worr...,What makes this film so powerful is the messag...,"Peter Sellers, George C. Scott, Sterling Hayde...",PG,1h 35min,Comedy;,29 January 1964 (USA),8.4,438092.0,933.0,https://www.imdb.com/title/tt0057012/reviews/_...
61,Pirates of the Caribbean: The Curse of the Bla...,I am nearly fifty years old. A sober grown man...,"Bob Greene saw ""Pirates of the Caribbean: Dead...",PG-13,2h 23min,Action; Adventure; Fantasy;,9 July 2003 (USA),8.0,997264.0,2186.0,https://www.imdb.com/title/tt0325980/reviews/_...
91,The Lord of the Rings: The Fellowship of the R...,...but oh was I thankful for it!!! All through...,"""I was blown away. Never before have I felt so...",PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165.0,5365.0,https://www.imdb.com/title/tt0120737/reviews/_...
77,Star Wars: Episode V - The Empire Strikes Back...,`It avoids having the standard shoot-'em-up en...,Star Wars: Episode V - The Empire Strikes Back...,PG,2h 4min,Action; Adventure; Fantasy;,20 June 1980 (USA),8.7,1124834.0,1214.0,https://www.imdb.com/title/tt0080684/reviews/_...
92,The Lord of the Rings: The Return of the King ...,Peter Jackson has done it. He has created an ...,Peter Jackson has done it. He has created an ...,PG-13,3h 21min,Adventure; Drama; Fantasy;,17 December 2003 (USA),8.9,1593859.0,3681.0,https://www.imdb.com/title/tt0167260/reviews/_...
...,...,...,...,...,...,...,...,...,...,...,...
69,Rocky 1976,It's hard to explain what this movie means to ...,Rocky is about a poor boxer living his life in...,PG,2h,Drama; Sport;,3 December 1976 (USA),8.1,502752.0,694.0,https://www.imdb.com/title/tt0075148/reviews/_...
42,Heat 1995,Sound like a bold statement? Devotees of class...,Michael Mann's Heat is one of the best cops n'...,R,2h 50min,Crime; Drama; Thriller;,15 December 1995 (USA),8.2,557731.0,1074.0,https://www.imdb.com/title/tt0113277/reviews/_...
20,Coco 2017,Im Mexican and all i can say is Thanks you Piz...,"Coco is a heart-warming experience, filled wit...",PG,1h 45min,Animation; Adventure;,22 November 2017 (USA),8.4,345200.0,1087.0,https://www.imdb.com/title/tt2380307/reviews/_...
46,Jaws 1975,"Jaws is a movie the I grew up with, it's like ...",Jaws is based on the best seller book by Peter...,PG,2h 4min,Adventure; Thriller;,20 June 1975 (USA),8.0,531984.0,1203.0,https://www.imdb.com/title/tt0073195/reviews/_...


In [83]:
# Save the DataFrame to a CSV file
df_merged.to_csv(file_path + 'flask_dataset.csv', index=False)

## 2.3 Keyword Extraction

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [127]:
import pandas as pd
file_path = '/content/drive/My Drive/NLP Final Project/'

df = pd.read_csv(file_path + 'top_100.csv')

In [133]:
# Sort the DataFrame based on 'Movie_Name' and 'helpful' votes
df_sorted = df.sort_values(by=['Movie_Name', 'helpful'], ascending=[True, False])

# Group by 'Movie_Name' and take the top 100 reviews for each movie
top_reviews = df_sorted.groupby('Movie_Name').head(1000)

top_reviews['cleaned_review'] = top_reviews['cleaned_review'].astype('str')

# Concatenate the top 100 reviews for each movie
#df_grouped = top_reviews.groupby('Movie_Name')['cleaned_review'].apply(' '.join).reset_index()
df_grouped = top_reviews.groupby('Movie_Name')['cleaned_review'].apply(' '.join).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_reviews['cleaned_review'] = top_reviews['cleaned_review'].astype('str')


In [154]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK stop words
nltk.download('stopwords')

# Load NLTK's English stop words
nltk_stop_words = set(stopwords.words('english'))

# Add your custom stop words
custom_stop_words = ['movie', 'film', 'good', 'bad', 'actor', 'play', 'one', 'like']
stop_words = list(nltk_stop_words.union(custom_stop_words))


# Function to extract top keywords or key phrases
def extract_top_terms(text):
    # Initialize TF-IDF Vectorizer with updated stop words and bigrams and trigrams
    vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(2, 2))
    tfidf_matrix = vectorizer.fit_transform([text])

    # Sort terms by their TF-IDF scores
    sorted_items = sorted(zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()), key=lambda x: x[1], reverse=True)

    # Extract top terms - adjust the number as needed
    top_terms = [item[0] for item in sorted_items[:10]]  # Top 10 terms
    return top_terms

# Apply the function to the grouped DataFrame
df_grouped['top_terms'] = df_grouped['cleaned_review'].apply(extract_top_terms)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [155]:
df_grouped

Unnamed: 0,Movie_Name,cleaned_review,topics,top_terms
0,12 Angry Men 1957,excellent courtroom drama unique twist. instea...,[Topic 1: ın imposing implanted implement impl...,"[angry men, 12 angry, henry fonda, lee cobb, j..."
1,A Walk to Remember 2002,im 17 year old male teenager happened stumble ...,"[Topic 1: movie love film jamie moore, Topic 2...","[mandy moore, shane west, walk remember, love ..."
2,Alien 1979,"back early 20th century, lumière brother didnt...",[Topic 1: época mentionable meor mercenaries m...,"[ridley scott, sigourney weaver, science ficti..."
3,Aliens 1986,scifi movie. nearly perfect every way storylin...,"[Topic 1: âdeal pagan dorky dork dopey, Topic ...","[james cameron, sigourney weaver, bill paxton,..."
4,All Quiet on the Western Front 1930,erich maria remarques novel film made may poss...,[Topic 1: über humane httpswww howlingly hover...,"[western front, quiet western, world war, lew ..."
...,...,...,...,...
95,Toy Story 1995,big fan animated movie coming pixar studios. a...,"[Topic 1: toy movie story film buzz, Topic 2: ...","[toy story, buzz lightyear, tim allen, woody b..."
96,Toy Story 2 1999,"saw toy story 2 hour ago must say that, despit...",[Topic 1: â½ icon iloved illustrating illustra...,"[toy story, buzz lightyear, stinky pete, toy c..."
97,Toy Story 3 2010,"best magic trick world one cannot unraveled, r...",[Topic 1: ðÿ harmonica documenting documentary...,"[toy story, first two, woody buzz, year old, d..."
98,Uri: The Surgical Strike 2019,say one word itd masterpiece. well classic...a...,"[Topic 1: ðÿž humanness hum hue hrithik, Topic...","[indian army, vicky kaushal, surgical strike, ..."


In [156]:
columns_to_append = ['top_terms']

# Merge the DataFrames on 'Movie_Name'
df_merged = pd.merge(df_merged, df_grouped[columns_to_append + ['Movie_Name']], on='Movie_Name', how='left')

In [158]:
# Save the DataFrame to a CSV file
df_merged.to_csv(file_path + 'flask_dataset.csv', index=False)

## 2.4 Sentiment Analysis


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
file_path = '/content/drive/My Drive/NLP Final Project/'

df = pd.read_csv(file_path + 'top_100.csv')
df_merged = pd.read_csv(file_path + 'flask_dataset.csv')

In [7]:
df

Unnamed: 0,username,rating,helpful,total,date,title,review,Movie_Name,cleaned_review
0,g_cotterell,10.0,822,898,2019-11-16,Puts my faith in the movie business again\n,This is what cinema is supposed to be! Amazing...,Ford v Ferrari 2019,cinema supposed be! amazing chemistry acting l...
1,nickfarf-42699,10.0,518,590,2019-11-15,As close as possible to the truth but!\n,I was at the Philadelphia Film center premiere...,Ford v Ferrari 2019,philadelphia film center premiere 111119 view ...
2,jordanucsd,8.0,47,50,2020-02-14,"""I had no idea. If only my dad were alive to ...","I've seen a lot of car movies, as a self-profe...",Ford v Ferrari 2019,"ive seen lot car movies, selfprofessed car nut..."
3,ymyuseda,10.0,323,382,2019-11-20,Oscar Winning Performance\n,Rating 10/10\nInspired movie of the year. It i...,Ford v Ferrari 2019,rating 1010 inspired movie year. wonderfully m...
4,Kikisaurus,8.0,62,72,2020-02-12,"I know nothing of cars or racing, nor do I ca...","To start off, I have always hated racing. Yes,...",Ford v Ferrari 2019,"start off, always hated racing. yes, hated. th..."
...,...,...,...,...,...,...,...,...,...
117852,dean keaton,10.0,0,1,2000-02-18,a short comment\n,It will not take very long: The Godfather is j...,The Godfather 1972,take long godfather just...the movie. favorite...
117853,rejoefrankel,10.0,0,1,2000-02-16,The greatest film ever made next to the seque...,Much of my male wisdom and philosophy on life ...,The Godfather 1972,much male wisdom philosophy life derived godfa...
117854,jetrock,10.0,0,1,2000-02-11,One of the Greatest stories ever told!\n,What could I possibly say that has not been s...,The Godfather 1972,could possibly say said godfather already said...
117855,Kilroy-17,10.0,0,1,2000-01-28,This movie set the standard for mafia movies....,"Simply put, this movie is the oracle, answer k...",The Godfather 1972,"simply put, movie oracle, answer key, blue pri..."


In [17]:
import pandas as pd
from textblob import TextBlob

# Assuming your DataFrame is named df and has columns 'Movie_Name' and 'Review'

def sentiment_analysis(review):
    # Analyze the sentiment of the review
    return TextBlob(review).sentiment.polarity

df['cleaned_review'] = df['cleaned_review'].astype('str')
# Apply sentiment analysis to each review
df['Sentiment'] = df['cleaned_review'].apply(sentiment_analysis)

# Classify reviews as positive or negative based on sentiment polarity
df['Sentiment_Class'] = df['Sentiment'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral'))

# Group by Movie_Name and calculate the percentage of positive and negative reviews
sentiment_distribution = df.groupby('Movie_Name')['Sentiment_Class'].value_counts(normalize=True).unstack().fillna(0) * 100

print(sentiment_distribution)

Sentiment_Class                       Negative   Neutral   Positive
Movie_Name                                                         
12 Angry Men 1957                     9.350464  1.998572  88.650964
A Walk to Remember 2002               6.533333  0.133333  93.333333
Alien 1979                           15.993266  0.505051  83.501684
Aliens 1986                           9.487666  0.474383  90.037951
All Quiet on the Western Front 1930   7.731959  0.000000  92.268041
...                                        ...       ...        ...
Toy Story 1995                        2.718447  0.582524  96.699029
Toy Story 2 1999                      1.898734  0.632911  97.468354
Toy Story 3 2010                      3.414634  0.975610  95.609756
Uri: The Surgical Strike 2019         5.781585  5.567452  88.650964
Your Name. 2016                       5.789474  1.228070  92.982456

[100 rows x 3 columns]


In [18]:
sentiment_distribution = pd.DataFrame(sentiment_distribution)
sentiment_distribution = sentiment_distribution.reset_index(drop=False)

In [21]:
columns_to_append = ['Negative', 'Neutral', 'Positive']

# Merge the DataFrames on 'Movie_Name'
df_merged = pd.merge(df_merged, sentiment_distribution[columns_to_append + ['Movie_Name']], on='Movie_Name', how='left')

In [23]:
# Save the DataFrame to a CSV file
df_merged.to_csv(file_path + 'flask_dataset.csv', index=False)

## 2.5 Subjectivity Analysis

In TextBlob, subjectivity is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.





In [24]:
df

Unnamed: 0,username,rating,helpful,total,date,title,review,Movie_Name,cleaned_review,Sentiment,Sentiment_Class
0,g_cotterell,10.0,822,898,2019-11-16,Puts my faith in the movie business again\n,This is what cinema is supposed to be! Amazing...,Ford v Ferrari 2019,cinema supposed be! amazing chemistry acting l...,0.443750,Positive
1,nickfarf-42699,10.0,518,590,2019-11-15,As close as possible to the truth but!\n,I was at the Philadelphia Film center premiere...,Ford v Ferrari 2019,philadelphia film center premiere 111119 view ...,0.144500,Positive
2,jordanucsd,8.0,47,50,2020-02-14,"""I had no idea. If only my dad were alive to ...","I've seen a lot of car movies, as a self-profe...",Ford v Ferrari 2019,"ive seen lot car movies, selfprofessed car nut...",0.203148,Positive
3,ymyuseda,10.0,323,382,2019-11-20,Oscar Winning Performance\n,Rating 10/10\nInspired movie of the year. It i...,Ford v Ferrari 2019,rating 1010 inspired movie year. wonderfully m...,0.493155,Positive
4,Kikisaurus,8.0,62,72,2020-02-12,"I know nothing of cars or racing, nor do I ca...","To start off, I have always hated racing. Yes,...",Ford v Ferrari 2019,"start off, always hated racing. yes, hated. th...",-0.077778,Negative
...,...,...,...,...,...,...,...,...,...,...,...
117852,dean keaton,10.0,0,1,2000-02-18,a short comment\n,It will not take very long: The Godfather is j...,The Godfather 1972,take long godfather just...the movie. favorite...,0.119643,Positive
117853,rejoefrankel,10.0,0,1,2000-02-16,The greatest film ever made next to the seque...,Much of my male wisdom and philosophy on life ...,The Godfather 1972,much male wisdom philosophy life derived godfa...,0.176944,Positive
117854,jetrock,10.0,0,1,2000-02-11,One of the Greatest stories ever told!\n,What could I possibly say that has not been s...,The Godfather 1972,could possibly say said godfather already said...,0.400000,Positive
117855,Kilroy-17,10.0,0,1,2000-01-28,This movie set the standard for mafia movies....,"Simply put, this movie is the oracle, answer k...",The Godfather 1972,"simply put, movie oracle, answer key, blue pri...",0.108409,Positive


In [25]:
def calculate_subjectivity(review):
    # Create a TextBlob object
    blob = TextBlob(review)
    # Return the subjectivity score
    return blob.sentiment.subjectivity

# Apply the function to calculate subjectivity for each review
df['Subjectivity'] = df['cleaned_review'].apply(calculate_subjectivity)

In [30]:
# Function to classify subjectivity into three groups
def classify_subjectivity(score):
    if score < 0.4:
        return 'Objective'
    elif score < 0.6:
        return 'Somewhat Subjective'
    else:
        return 'Subjective'

# Apply the classification function to the subjectivity scores
df['Subjectivity_Class'] = df['Subjectivity'].apply(classify_subjectivity)

In [31]:
# Count the occurrences of each subjectivity class and normalize to get proportions
subjectivity_proportions = df['Subjectivity_Class'].value_counts(normalize=True)

# Convert proportions to percentages
subjectivity_percentages = subjectivity_proportions * 100

print(subjectivity_percentages)


Somewhat Subjective    59.980315
Subjective             30.606583
Objective               9.413102
Name: Subjectivity_Class, dtype: float64


In [33]:
df

Unnamed: 0,username,rating,helpful,total,date,title,review,Movie_Name,cleaned_review,Sentiment,Sentiment_Class,Subjectivity,Subjectivity_Class
0,g_cotterell,10.0,822,898,2019-11-16,Puts my faith in the movie business again\n,This is what cinema is supposed to be! Amazing...,Ford v Ferrari 2019,cinema supposed be! amazing chemistry acting l...,0.443750,Positive,0.531250,Somewhat Subjective
1,nickfarf-42699,10.0,518,590,2019-11-15,As close as possible to the truth but!\n,I was at the Philadelphia Film center premiere...,Ford v Ferrari 2019,philadelphia film center premiere 111119 view ...,0.144500,Positive,0.426000,Somewhat Subjective
2,jordanucsd,8.0,47,50,2020-02-14,"""I had no idea. If only my dad were alive to ...","I've seen a lot of car movies, as a self-profe...",Ford v Ferrari 2019,"ive seen lot car movies, selfprofessed car nut...",0.203148,Positive,0.526728,Somewhat Subjective
3,ymyuseda,10.0,323,382,2019-11-20,Oscar Winning Performance\n,Rating 10/10\nInspired movie of the year. It i...,Ford v Ferrari 2019,rating 1010 inspired movie year. wonderfully m...,0.493155,Positive,0.589286,Somewhat Subjective
4,Kikisaurus,8.0,62,72,2020-02-12,"I know nothing of cars or racing, nor do I ca...","To start off, I have always hated racing. Yes,...",Ford v Ferrari 2019,"start off, always hated racing. yes, hated. th...",-0.077778,Negative,0.610185,Subjective
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117852,dean keaton,10.0,0,1,2000-02-18,a short comment\n,It will not take very long: The Godfather is j...,The Godfather 1972,take long godfather just...the movie. favorite...,0.119643,Positive,0.503571,Somewhat Subjective
117853,rejoefrankel,10.0,0,1,2000-02-16,The greatest film ever made next to the seque...,Much of my male wisdom and philosophy on life ...,The Godfather 1972,much male wisdom philosophy life derived godfa...,0.176944,Positive,0.602222,Subjective
117854,jetrock,10.0,0,1,2000-02-11,One of the Greatest stories ever told!\n,What could I possibly say that has not been s...,The Godfather 1972,could possibly say said godfather already said...,0.400000,Positive,0.770833,Subjective
117855,Kilroy-17,10.0,0,1,2000-01-28,This movie set the standard for mafia movies....,"Simply put, this movie is the oracle, answer k...",The Godfather 1972,"simply put, movie oracle, answer key, blue pri...",0.108409,Positive,0.479037,Somewhat Subjective


In [34]:
# Save the DataFrame to a CSV file
df.to_csv(file_path + 'top_100.csv', index=False)

## 3.1 Add Top 10 Reviews in Flask Dataset

In [35]:
import pandas as pd
file_path = '/content/drive/My Drive/NLP Final Project/'

df = pd.read_csv(file_path + 'top_100.csv')

In [42]:
import pandas as pd

# Assuming your original DataFrame is named df
# and it has columns 'Movie_Name', 'Review', 'Rating', 'Helpful', 'Total', 'Date', 'Username', 'Title', 'Subjectivity_Class'

# Sort the DataFrame by 'Movie_Name' and 'Helpful' in descending order
df_sorted = df.sort_values(by=['Movie_Name', 'helpful'], ascending=[True, False])

# Function to get the top 10 items of a column for each movie
def top_10_items(series):
    return series.head(10).tolist()

# Group by 'Movie_Name' and aggregate the top 10 reviews' attributes
df_grouped = df_sorted.groupby('Movie_Name').agg({
    'review': top_10_items,
    'rating': top_10_items,
    'helpful': top_10_items,
    'total': top_10_items,
    'date': top_10_items,
    'username': top_10_items,
    'title': top_10_items,
    'Subjectivity_Class': top_10_items,
    'Sentiment_Class': top_10_items
}).reset_index()

# Rename columns to reflect they contain top 10 items
df_grouped.columns = ['Movie_Name', 'Top_10_Reviews', 'Top_10_Reviews_Rating', 'Top_10_Reviews_Helpful', 'Top_10_Reviews_Total', 'Top_10_Reviews_Date', 'Top_10_Reviews_Username', 'Top_10_Reviews_Title', 'Top_10_Reviews_Subjectivity', 'Top_10_Reviews_Sentiment']


In [43]:
df_grouped.head(3)

Unnamed: 0,Movie_Name,Top_10_Reviews,Top_10_Reviews_Rating,Top_10_Reviews_Helpful,Top_10_Reviews_Total,Top_10_Reviews_Date,Top_10_Reviews_Username,Top_10_Reviews_Title,Top_10_Reviews_Subjectivity,Top_10_Reviews_Sentiment
0,12 Angry Men 1957,[An excellent courtroom drama with a unique tw...,"[10.0, 10.0, 9.0, 10.0, 10.0, 9.0, 8.0, 7.0, 1...","[764, 590, 508, 418, 363, 252, 240, 118, 101, 66]","[829, 649, 565, 487, 405, 311, 313, 222, 124, ...","[2000-07-01, 2002-09-18, 2004-10-23, 2005-01-1...","[vukodlak, Andrew Devonshire, juho69, Freddy_L...","[ Excellent\n, No bombs, no car chases but ed...","[Somewhat Subjective, Subjective, Somewhat Sub...","[Positive, Positive, Positive, Positive, Posit..."
1,A Walk to Remember 2002,[I'm a 17 year old male teenager who happened ...,"[10.0, 10.0, 10.0, 7.0, 10.0, 10.0, 1.0, 1.0, ...","[387, 216, 181, 158, 142, 90, 48, 41, 38, 33]","[436, 271, 248, 186, 173, 104, 88, 80, 72, 57]","[2004-11-26, 2002-01-25, 2004-12-21, 2005-05-1...","[cvxfreak, Sambarb337, Makeitcount_meetmeatthe...","[ Succeeds where it counts\n, A moving story ...","[Somewhat Subjective, Somewhat Subjective, Som...","[Positive, Positive, Positive, Positive, Posit..."
2,Alien 1979,"[Back in early 20th century, Lumière brothers ...","[10.0, 9.0, 10.0, 10.0, 9.0, 10.0, 10.0, 10.0,...","[724, 353, 250, 193, 190, 172, 156, 143, 140, ...","[954, 449, 281, 271, 268, 245, 213, 168, 196, ...","[2005-01-31, 2005-02-06, 2018-09-21, 2005-03-3...","[Patuquitos, chrishn, Aaron_Kyle, silsworld, P...","[ The mother of all movies\n, ""Alien"" is not ...","[Somewhat Subjective, Subjective, Subjective, ...","[Positive, Positive, Positive, Positive, Posit..."


In [44]:
df_merged = pd.read_csv(file_path + 'flask_dataset.csv')

columns_to_append = ['Top_10_Reviews', 'Top_10_Reviews_Rating', 'Top_10_Reviews_Helpful', 'Top_10_Reviews_Total', 'Top_10_Reviews_Date', 'Top_10_Reviews_Username', 'Top_10_Reviews_Title', 'Top_10_Reviews_Subjectivity', 'Top_10_Reviews_Sentiment']

# Merge the DataFrames on 'Movie_Name'
df_merged = pd.merge(df_merged, df_grouped[columns_to_append + ['Movie_Name']], on='Movie_Name', how='left')

In [46]:
df_merged.columns

Index(['Movie_Name', 'review', 'summary', 'movie_rated', 'run_length',
       'genres', 'release_date', 'imdb_average_rating', 'num_raters',
       'num_reviews', 'review_url', 'top_terms', 'Negative', 'Neutral',
       'Positive', 'Top_10_Reviews', 'Top_10_Reviews_Rating',
       'Top_10_Reviews_Helpful', 'Top_10_Reviews_Total', 'Top_10_Reviews_Date',
       'Top_10_Reviews_Username', 'Top_10_Reviews_Title',
       'Top_10_Reviews_Subjectivity', 'Top_10_Reviews_Sentiment'],
      dtype='object')

In [47]:
# Save the DataFrame to a CSV file
df_merged.to_csv(file_path + 'flask_dataset.csv', index=False)