# Import Required Libraries

In [20]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler

# Load Dataset

In [21]:
# Read the Rotten Tomatoes critic reviews CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='rotten_tomatoes_critic_reviews.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


# Preprocess Dataset

In [23]:
# Review metadata columns that the rest of this notebook does not use
columns_to_remove = [
    'critic_name',
    'top_critic',
    'publisher_name',
    'review_type',
    'review_score',
]

# Drop them along the column axis and preview what is left
df = df.drop(labels=columns_to_remove, axis='columns')
df.head()

Unnamed: 0,rotten_tomatoes_link,review_date,review_content
0,m/0814255,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,2010-02-10,What's really lacking in The Lightning Thief i...


In [24]:
# Derive a 'year' column from the 'review_date' column
df['year'] = pd.to_datetime(df['review_date']).dt.year

# The raw date is not needed once the year has been extracted
df = df.drop(columns=['review_date'])

# Rename the 'review_content' column to 'text'
df = df.rename(columns={'review_content': 'text'})

# Remove rows that have a NaN in any of the remaining columns
df = df.dropna()

# A missing 'review_date' parses to NaT, and `.dt.year` then returns the whole
# column as float (2010.0, NaN, ...). The NaN rows were dropped just above, so
# restore an integer dtype here; otherwise later cells and the exported file
# would carry float years whenever a single date was missing in the input.
df = df.astype({'year': int})

# Display the first few rows of the modified DataFrame
df.head()

Unnamed: 0,rotten_tomatoes_link,text,year
0,m/0814255,A fantasy adventure that fuses Greek mythology...,2010
1,m/0814255,"Uma Thurman as Medusa, the gorgon with a coiff...",2010
2,m/0814255,With a top-notch cast and dazzling special eff...,2010
3,m/0814255,Whether audiences will get behind The Lightnin...,2010
4,m/0814255,What's really lacking in The Lightning Thief i...,2010


In [25]:
# Create the VADER analyzer that scores every review below
analyzer = SentimentIntensityAnalyzer()

# Score each entry of 'text' and keep only the 'compound' value of the result
df['sentiment'] = df['text'].map(
    lambda review: analyzer.polarity_scores(review)['compound']
)
df.head()

Unnamed: 0,rotten_tomatoes_link,text,year,sentiment
0,m/0814255,A fantasy adventure that fuses Greek mythology...,2010,0.7579
1,m/0814255,"Uma Thurman as Medusa, the gorgon with a coiff...",2010,0.0
2,m/0814255,With a top-notch cast and dazzling special eff...,2010,0.4019
3,m/0814255,Whether audiences will get behind The Lightnin...,2010,0.705
4,m/0814255,What's really lacking in The Lightning Thief i...,2010,-0.5267


In [26]:
# Rescale 'sentiment' into the interval 0 to 1: fit the scaler on the column,
# then apply the fitted scaling to that same column.
# NOTE(review): MinMaxScaler stretches between the column's own observed
# minimum and maximum, so the mapping is data-dependent (in the displayed
# output a raw score of 0.0 becomes 0.499874, not 0.5). Confirm this is the
# intended normalization if results are compared with other datasets.
scaler = MinMaxScaler().fit(df[['sentiment']])
df['sentiment'] = scaler.transform(df[['sentiment']])
df.head()

Unnamed: 0,rotten_tomatoes_link,text,year,sentiment
0,m/0814255,A fantasy adventure that fuses Greek mythology...,2010,0.881707
1,m/0814255,"Uma Thurman as Medusa, the gorgon with a coiff...",2010,0.499874
2,m/0814255,With a top-notch cast and dazzling special eff...,2010,0.702353
3,m/0814255,Whether audiences will get behind The Lightnin...,2010,0.855056
4,m/0814255,What's really lacking in The Lightning Thief i...,2010,0.234521


In [27]:
# Read rotten_tomatoes_movies.csv and keep only the two columns used later:
# the 'rotten_tomatoes_link' join key and the 'movie_title'
movies_df = pd.read_csv('rotten_tomatoes_movies.csv')[
    ['rotten_tomatoes_link', 'movie_title']
]
movies_df.head()

Unnamed: 0,rotten_tomatoes_link,movie_title
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...
1,m/0878835,Please Give
2,m/10,10
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men)
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea"


In [28]:
# Attach the movie title to every review via an inner join on
# 'rotten_tomatoes_link'; reviews with no matching movie row are dropped.
# NOTE(review): if a link occurred more than once in movies_df, its reviews
# would be duplicated by this join - confirm the links are unique.
merged_df = df.merge(movies_df, how='inner', on='rotten_tomatoes_link')
merged_df.head()

Unnamed: 0,rotten_tomatoes_link,text,year,sentiment,movie_title
0,m/0814255,A fantasy adventure that fuses Greek mythology...,2010,0.881707,Percy Jackson & the Olympians: The Lightning T...
1,m/0814255,"Uma Thurman as Medusa, the gorgon with a coiff...",2010,0.499874,Percy Jackson & the Olympians: The Lightning T...
2,m/0814255,With a top-notch cast and dazzling special eff...,2010,0.702353,Percy Jackson & the Olympians: The Lightning T...
3,m/0814255,Whether audiences will get behind The Lightnin...,2010,0.855056,Percy Jackson & the Olympians: The Lightning T...
4,m/0814255,What's really lacking in The Lightning Thief i...,2010,0.234521,Percy Jackson & the Olympians: The Lightning T...


In [29]:
# Reshape the joined frame in a single chain:
#   1. drop the join key and the review text,
#   2. rename 'movie_title' so the title becomes the new 'text' column,
#   3. add a constant 'source' column set to 'rottentomatoes',
#   4. order the columns as source, year, text, sentiment.
merged_df = (
    merged_df
    .drop(columns=['rotten_tomatoes_link', 'text'])
    .rename(columns={'movie_title': 'text'})
    .assign(source='rottentomatoes')
    .loc[:, ['source', 'year', 'text', 'sentiment']]
)
merged_df.head()

Unnamed: 0,source,year,text,sentiment
0,rottentomatoes,2010,Percy Jackson & the Olympians: The Lightning T...,0.881707
1,rottentomatoes,2010,Percy Jackson & the Olympians: The Lightning T...,0.499874
2,rottentomatoes,2010,Percy Jackson & the Olympians: The Lightning T...,0.702353
3,rottentomatoes,2010,Percy Jackson & the Olympians: The Lightning T...,0.855056
4,rottentomatoes,2010,Percy Jackson & the Olympians: The Lightning T...,0.234521


In [30]:
# Collapse the rows to one per (text, year) pair, where 'text' holds the movie
# title: 'frequency' is the number of rows in the group and 'sentiment' is the
# mean of the group's sentiment values. The constant 'source' column is then
# added back and the columns are ordered as
# source, year, text, sentiment, frequency.
# NOTE(review): groups are keyed on the title string, so different movies that
# share a title would be pooled together - confirm this is acceptable.
aggregated_df = (
    merged_df
    .groupby(['text', 'year'])
    .agg(
        frequency=('text', 'size'),
        sentiment=('sentiment', 'mean'),
    )
    .reset_index()
    .assign(source='rottentomatoes')
    .loc[:, ['source', 'year', 'text', 'sentiment', 'frequency']]
)

# Preview the first few aggregated rows
aggregated_df.head()

Unnamed: 0,source,year,text,sentiment,frequency
0,rottentomatoes,2010,!Women Art Revolution,0.817774,1
1,rottentomatoes,2011,!Women Art Revolution,0.66076,21
2,rottentomatoes,2012,!Women Art Revolution,0.857373,1
3,rottentomatoes,2013,!Women Art Revolution,0.748098,1
4,rottentomatoes,2018,!Women Art Revolution,0.852688,1


# Save Preprocessed Dataset

In [31]:
# Write the aggregated DataFrame to a tab-separated .tsv file, without the index
aggregated_df.to_csv(
    path_or_buf='RottenTomatoes_sentiment.tsv',
    sep='\t',
    index=False,
)