# Detection of Movie Spoilers in Reviews

### ***Import essential libraries***

In [16]:
import pandas as pd
import numpy as np
import re, string, nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import itertools
from collections import Counter
from nltk.stem import WordNetLemmatizer


nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\evich\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\evich\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\evich\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\evich\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\evich\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Understanding the data

Apories:
1. what is rating_x, rating_y IMDB???
2. maybe convert duration to minutes would be better to compare

In [17]:
# Load the dataset
original_data = pd.read_csv('movie_spoiler_sample.csv')

In [18]:
# First view of the dataset
original_data.head(5)

Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,"['Action', 'Adventure', 'Comedy']",6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,7,Splendid adventure film with mesmerizing deser...
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,"['Action', 'Adventure', 'Fantasy']",6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,10,Epic movie for fans and non fans
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,"['Action', 'Comedy']",7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",8,once again a funny British film
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,"['Crime', 'Drama', 'Mystery']",8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,1,For he made a 'plot twist' out of it
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,"['Fantasy', 'Mystery', 'Romance']",6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",10,As brilliant as I recalled it!


The dataset is very large and run time error occurs therefore we decided to use undersampling to obtain a balanced sample of the data.

In [19]:
# Check the count per movie id
original_data['movie_id'].value_counts()

tt0468569    643
tt0111161    565
tt0167260    328
tt0137523    272
tt0110912    265
            ... 
tt0383534      3
tt0110989      2
tt5580036      2
tt0107719      1
tt0374887      1
Name: movie_id, Length: 1339, dtype: int64

In [20]:
# Check how many unique movies we have
len(original_data['movie_id'].value_counts())

1339

For the above reason we decided to take a sample of at most 4 reviews per movie (or less if they don't have enough reviews).

In [21]:
# Sample each movie's reviews based on the given criteria
sampled_data = original_data.groupby('movie_id', group_keys=False, as_index=False).apply(
    lambda movie_df: movie_df if len(movie_df) <= 4 else pd.concat([
        movie_df[movie_df['is_spoiler'] == True].sample(n=min(2, len(movie_df[movie_df['is_spoiler'] == True])), random_state=42),
        movie_df[movie_df['is_spoiler'] == False].sample(n=min(2, len(movie_df[movie_df['is_spoiler'] == False])), random_state=42)
    ])
).reset_index(drop=True)

sampled_data.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary
0,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,1 February 2006,ur5945598,True,Oh where can I start on why alleged comedians ...,10,Chaplin strikes comic gold!
1,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,11 August 2005,ur5805910,True,"We follow ""the little fellow"" (Chaplin), in hi...",10,Chaplin's delightful bonhomie & innocence
2,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,22 June 2000,ur0773000,False,I recently saw this movie with a live orchestr...,9,"As fresh, funny, and moving as the day it was ..."
3,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,9 March 2009,ur0361658,False,The Gold Rush (1925) was a big undertaking for...,10,The Cinema of Charles Chaplin: The Gold Rush
4,tt0017136,"Sometime in the future, the city of Metropolis...",2h 33min,"['Drama', 'Sci-Fi']",8.3,1927-03-13,"The film is set in the year 2026, in the extra...",7 March 2015,ur13977076,True,This sci-fi classic is set in a future where t...,10,A science fiction classic


In [22]:
# Check the number of records
sampled_data.shape

(5335, 13)

In [23]:
# Check if the sample is balanced
sampled_data['is_spoiler'].value_counts()

False    2677
True     2658
Name: is_spoiler, dtype: int64

In [24]:
sampled_data['whole_review'] = sampled_data['review_text'] + sampled_data['review_summary']

In [25]:
!pip install spacy
!python3 -m spacy download en_core_web_sm




ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\cli\base_command.py", line 106, in _run_wrapper
    status = _inner_run()
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\cli\base_command.py", line 97, in _inner_run
    return self.run(options, args)
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\cli\req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\commands\install.py", line 484, in run
    installed_versions[distribution.canonical_name] = distribution.version
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\metadata\pkg_resources.py", line 192, in version
    return parse_version(self._dist.version)
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_vendor\packaging\version.py", line 56, in parse
    return Version(version)
  File "C:\Users\evich\anaconda3\lib\site-packages

In [26]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\cli\base_command.py", line 106, in _run_wrapper
    status = _inner_run()
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\cli\base_command.py", line 97, in _inner_run
    return self.run(options, args)
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\cli\req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\commands\install.py", line 484, in run
    installed_versions[distribution.canonical_name] = distribution.version
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_internal\metadata\pkg_resources.py", line 192, in version
    return parse_version(self._dist.version)
  File "C:\Users\evich\anaconda3\lib\site-packages\pip\_vendor\packaging\version.py", line 56, in parse
    return Version(version)
  File "C:\Users\evich\anaconda3\lib\site-packages


  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 4.4 MB/s eta 0:00:00


In [27]:


import spacy
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Ensure NLTK tokenizer is available
nltk.download('punkt')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to tokenize text using nltk
def tokenize_text(text):
    return " ".join(word_tokenize(text))  # Convert tokens back to a string for spaCy

# Group reviews by movie_id
grouped_reviews = sampled_data.groupby("movie_id")["review_text"].apply(lambda x: " ".join(x))

# Dictionary to store PERSON entity counts per movie
person_entities_by_movie = {}

# Process each movie's reviews separately
for movie_id, text in grouped_reviews.items():
    tokenized_text = tokenize_text(text)  # Tokenize using NLTK
    doc = nlp(tokenized_text)  # Apply spaCy NER
    person_entities = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]  # Extract only PERSON entities
    
    # Count occurrences of each PERSON entity
    person_counts = Counter(person_entities)
    
    # Store results
    person_entities_by_movie[movie_id] = person_counts

# Print the most common PERSON entities per movie
for movie_id, person_counts in person_entities_by_movie.items():
    print(f"Movie ID: {movie_id} - Most Common PERSON Entities:")
    for entity, count in person_counts.most_common(5):  # Top 5 most common PERSON names
        print(f"{entity}: {count} times")
    print("\n" + "="*40 + "\n")  # Separator for readability


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\evich\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Movie ID: tt0015864 - Most Common PERSON Entities:
Chaplin: 14 times
Tchaykovsky: 1 times
Charley Chaplin: 1 times
Charles Chaplin: 1 times
Charlie Chaplin: 1 times


Movie ID: tt0017136 - Most Common PERSON Entities:
Metropolis: 9 times
Freder: 7 times
Maria: 5 times
Lang: 5 times
Brigitte Helm: 2 times


Movie ID: tt0017925 - Most Common PERSON Entities:
Johnny: 9 times
Buster Keaton: 4 times
Johnny Gray: 1 times
Annabelle Lee: 1 times
Marion Mack: 1 times


Movie ID: tt0018455 - Most Common PERSON Entities:
Murnau: 2 times
O'Brien: 1 times
Sunrise: 1 times
Margaret Livingston: 1 times
Stark: 1 times


Movie ID: tt0019254 - Most Common PERSON Entities:
Jeanne: 5 times
Maria Falconetti: 3 times
Joan: 2 times
Dreyer: 2 times
Carl Dreyer: 1 times


Movie ID: tt0021749 - Most Common PERSON Entities:
Charlie Chaplin: 5 times
Chaplin: 4 times
Virginia Cherrill: 1 times
Charles Chaplin: 1 times
Adam Sandler: 1 times


Movie ID: tt0022100 - Most Common PERSON Entities:
Peter Lorre: 4 times
L

Movie ID: tt0277296 - Most Common PERSON Entities:
Johnson: 2 times
Michael Clark Duncan: 1 times
Cassandra: 1 times
Dwayne Johnson: 1 times
Russell: 1 times


Movie ID: tt0277371 - Most Common PERSON Entities:
Janey: 1 times
MUST: 1 times
Chris Evans: 1 times
Ferris Beuler 's: 1 times


Movie ID: tt0277434 - Most Common PERSON Entities:
Moore: 7 times
Mel Gibson: 5 times
Gibson: 3 times
Mel: 2 times
LTC Hal Moore: 1 times


Movie ID: tt0281358 - Most Common PERSON Entities:
Jamie: 4 times
Mandy Moore: 2 times
Mandy: 1 times
Arthur: 1 times
Dudley Moore: 1 times


Movie ID: tt0285742 - Most Common PERSON Entities:
Ball: 4 times
Peter Boyle: 3 times
Boyle: 3 times
Leticia: 3 times
Halle Berry: 2 times


Movie ID: tt0286106 - Most Common PERSON Entities:
Shyamalan: 2 times
Signs: 2 times
Mel Gibson: 2 times
Hess: 2 times
Achilles Heel: 1 times


Movie ID: tt0286499 - Most Common PERSON Entities:
Marlins: 2 times
Beckham: 1 times
Beckett: 1 times
Kiera Knightley: 1 times
Gurinder Chadha: 

James Gunn: 2 times
Gunn: 2 times
Chris Pratt: 2 times
Lee Pace: 2 times


Movie ID: tt2024544 - Most Common PERSON Entities:
Oscar: 3 times
Solomon: 3 times
Phillips: 1 times
Solomon Northup: 1 times
Pokemon: 1 times


Movie ID: tt2034800 - Most Common PERSON Entities:
Andy Lau: 1 times
Zhang Yimou: 1 times
Matt Damon: 1 times
Pedro Pascal: 1 times
Tian Jing: 1 times


Movie ID: tt2042568 - Most Common PERSON Entities:
Coen: 11 times
Davis: 8 times
Carey Mulligan: 3 times
Llewyn Davis: 3 times
Llewyn: 2 times


Movie ID: tt2084970 - Most Common PERSON Entities:
Alan Turing: 3 times
Benedict Cumberbatch: 1 times
Marjan Rejewski: 1 times
Jerzy Rozycki: 1 times
Henryk Zygalski: 1 times


Movie ID: tt2088003 - Most Common PERSON Entities:
Oskari: 3 times
Sam L. Jackson: 2 times
Samuel L. Jackson: 2 times
Ranger: 2 times
Oscar: 1 times


Movie ID: tt2094766 - Most Common PERSON Entities:
Derek: 2 times
Cal: 2 times
Ezio: 1 times
Michael Fassbender: 1 times
Marion Coltiard: 1 times


Movie 

In [28]:
from collections import Counter

person_entities = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

# Count occurrences of each PERSON entity
person_counts = Counter(person_entities)

# Print most common PERSON entities
print(person_counts.most_common())

[('Oliver', 5), ('Timothee Chalamet', 1), ('Armie Hammer', 1), ('Chalamet', 1), ('Michael Stuhlbarg', 1), ('Elio', 1)]


In [29]:
# View the columns of the dataset
original_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'genre', 'rating_x',
       'release_date', 'plot_synopsis', 'review_date', 'user_id', 'is_spoiler',
       'review_text', 'rating_y', 'review_summary'],
      dtype='object')

In [30]:
# Number of records and features for the dataset
original_data.shape

(72054, 13)

In [31]:
# Check whether the response is balanced
original_data['is_spoiler'].value_counts()

False    36277
True     35777
Name: is_spoiler, dtype: int64

The target variable is balanced.

In [32]:
# Check for duplicated records
original_data.duplicated().any()

False

There are no duplicated records.

In [8]:
# Check the data types
original_data.dtypes

movie_id           object
plot_summary       object
duration           object
genre              object
rating_x          float64
release_date       object
plot_synopsis      object
review_date        object
user_id            object
is_spoiler           bool
review_text        object
rating_y            int64
review_summary     object
dtype: object

In [9]:
#remove starting and trailing white spaces from strings
columns_to_strip = ['movie_id','plot_summary','duration','genre','release_date','plot_synopsis',
                            'review_date','user_id','review_text','review_summary']
original_data[columns_to_strip] = original_data[columns_to_strip].apply(lambda col: col.str.strip())

In [10]:
# Check for missing values
original_data.isna().any()

movie_id          False
plot_summary      False
duration          False
genre             False
rating_x          False
release_date      False
plot_synopsis     False
review_date       False
user_id           False
is_spoiler        False
review_text       False
rating_y          False
review_summary    False
dtype: bool

There are no missing values.

In [11]:
# Check for empty strings
(original_data == '').sum()

movie_id          0
plot_summary      0
duration          0
genre             0
rating_x          0
release_date      0
plot_synopsis     0
review_date       0
user_id           0
is_spoiler        0
review_text       0
rating_y          0
review_summary    0
dtype: int64

There are no empty strings.

In [12]:
#replace empty strings with NA
# original_data[columns_to_strip] = original_data[columns_to_strip].replace('', np.nan)

In [13]:
genre = original_data['genre']
genre

0         ['Action', 'Adventure', 'Comedy']
1        ['Action', 'Adventure', 'Fantasy']
2                      ['Action', 'Comedy']
3             ['Crime', 'Drama', 'Mystery']
4         ['Fantasy', 'Mystery', 'Romance']
                        ...                
72049                            ['Comedy']
72050                 ['Comedy', 'Fantasy']
72051     ['Biography', 'Drama', 'History']
72052     ['Adventure', 'Comedy', 'Family']
72053     ['Action', 'Adventure', 'Sci-Fi']
Name: genre, Length: 72054, dtype: object

In [14]:
# Check if every element of genre feature is a string in order to make them lists
original_data['genre'].apply(lambda x: isinstance(x, str)).all()

True

In [15]:
# split the genres and remove the "[" "]" characters
original_data["genre"] = original_data["genre"].str.split(",")

original_data_exploded = original_data.explode('genre')
original_data_exploded['genre'] = original_data_exploded['genre'].str.replace(r"[\[\]' ]", "", regex=True)

# Apply one-hot encoding using get_dummies
original_data_encoded = pd.get_dummies(original_data_exploded['genre'])

# merge with the original dataframe
original_data_final = original_data.merge(original_data_encoded.groupby(original_data_exploded.index).sum(), left_index= True, right_index = True)

#  Drop the original 'genre' column if it's no longer needed
original_data_final = original_data_final.drop(columns=['genre'])
original_data =original_data_final

original_data

Unnamed: 0,movie_id,plot_summary,duration,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,...,0,0,0,0,0,0,0,0,0,0
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,...,0,0,0,0,0,0,0,0,0,0
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",...,0,0,0,0,0,0,0,0,0,0
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,...,0,0,0,1,0,0,0,0,0,0
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72049,tt0829482,"Seth and Evan are best friends, inseparable, n...",1h 53min,7.6,2007-08-17,Seth (Jonah Hill) and Evan (Michael Cera) are ...,18 February 2012,ur31632708,False,I had a very unpleasant experience watching Su...,...,0,0,0,0,0,0,0,0,0,0
72050,tt0101272,The Addams step out of Charles Addams' cartoon...,1h 39min,6.8,1991-11-22,Gomez Addams (Raúl Juliá) laments the 25-year ...,18 August 2007,ur13887584,True,"""The Addams Family"", while being entertaining,...",...,0,0,0,0,0,0,0,0,0,0
72051,tt0338751,"Biopic of billionaire Howard Hughes, starting ...",2h 50min,7.5,2004-12-25,The Aviator has no opening credits other than ...,27 January 2005,ur0180277,False,The story was interesting and the cinematograp...,...,0,0,0,0,0,0,0,0,0,0
72052,tt0367594,When Willy Wonka decides to let five children ...,1h 55min,6.7,2005-07-15,Willy Wonka (Johnny Depp) has built the greate...,31 August 2010,ur2532491,True,"To my way of thinking, if you're going to rema...",...,0,0,0,0,0,0,0,0,0,0


In [16]:
original_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'rating_x', 'release_date',
       'plot_synopsis', 'review_date', 'user_id', 'is_spoiler', 'review_text',
       'rating_y', 'review_summary', 'Action', 'Adventure', 'Animation',
       'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')

The number of records does not change.

In [17]:
# Convert duration to minutes

def convert_to_minutes(duration):
    parts = duration.replace("h", "").replace("min", "").strip().split()
    
    hours = int(parts[0]) if "h" in duration else 0
    minutes = int(parts[1]) if len(parts) > 1 else 0
    
    return hours * 60 + minutes

original_data['duration_minutes'] = original_data['duration'].apply(convert_to_minutes)
original_data

Unnamed: 0,movie_id,plot_summary,duration,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,duration_minutes
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,...,0,0,0,0,0,0,0,0,0,124
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,...,0,0,0,0,0,0,0,0,0,123
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",...,0,0,0,0,0,0,0,0,0,121
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,...,0,0,1,0,0,0,0,0,0,138
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",...,0,0,1,1,0,0,0,0,0,136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72049,tt0829482,"Seth and Evan are best friends, inseparable, n...",1h 53min,7.6,2007-08-17,Seth (Jonah Hill) and Evan (Michael Cera) are ...,18 February 2012,ur31632708,False,I had a very unpleasant experience watching Su...,...,0,0,0,0,0,0,0,0,0,113
72050,tt0101272,The Addams step out of Charles Addams' cartoon...,1h 39min,6.8,1991-11-22,Gomez Addams (Raúl Juliá) laments the 25-year ...,18 August 2007,ur13887584,True,"""The Addams Family"", while being entertaining,...",...,0,0,0,0,0,0,0,0,0,99
72051,tt0338751,"Biopic of billionaire Howard Hughes, starting ...",2h 50min,7.5,2004-12-25,The Aviator has no opening credits other than ...,27 January 2005,ur0180277,False,The story was interesting and the cinematograp...,...,0,0,0,0,0,0,0,0,0,170
72052,tt0367594,When Willy Wonka decides to let five children ...,1h 55min,6.7,2005-07-15,Willy Wonka (Johnny Depp) has built the greate...,31 August 2010,ur2532491,True,"To my way of thinking, if you're going to rema...",...,0,0,0,0,0,0,0,0,0,115


In [18]:
original_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'rating_x', 'release_date',
       'plot_synopsis', 'review_date', 'user_id', 'is_spoiler', 'review_text',
       'rating_y', 'review_summary', 'Action', 'Adventure', 'Animation',
       'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western',
       'duration_minutes'],
      dtype='object')

In [19]:
# Rename rating_x, rating_y
original_data.rename(columns={'rating_x': 'IMDB_rating', 'rating_y': 'user_rating'}, inplace=True)

In [20]:
# Create new features that only have the year of release and the year of the review

original_data['release_year'] = original_data['release_date'].str[:4]
original_data['review_year'] = original_data['review_date'].str[-4:]

original_data

Unnamed: 0,movie_id,plot_summary,duration,IMDB_rating,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,...,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,duration_minutes,release_year,review_year
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,...,0,0,0,0,0,0,0,124,2005,2005
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,...,0,0,0,0,0,0,0,123,2016,2016
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",...,0,0,0,0,0,0,0,121,2007,2007
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,...,1,0,0,0,0,0,0,138,2003,2015
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",...,1,1,0,0,0,0,0,136,2001,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72049,tt0829482,"Seth and Evan are best friends, inseparable, n...",1h 53min,7.6,2007-08-17,Seth (Jonah Hill) and Evan (Michael Cera) are ...,18 February 2012,ur31632708,False,I had a very unpleasant experience watching Su...,...,0,0,0,0,0,0,0,113,2007,2012
72050,tt0101272,The Addams step out of Charles Addams' cartoon...,1h 39min,6.8,1991-11-22,Gomez Addams (Raúl Juliá) laments the 25-year ...,18 August 2007,ur13887584,True,"""The Addams Family"", while being entertaining,...",...,0,0,0,0,0,0,0,99,1991,2007
72051,tt0338751,"Biopic of billionaire Howard Hughes, starting ...",2h 50min,7.5,2004-12-25,The Aviator has no opening credits other than ...,27 January 2005,ur0180277,False,The story was interesting and the cinematograp...,...,0,0,0,0,0,0,0,170,2004,2005
72052,tt0367594,When Willy Wonka decides to let five children ...,1h 55min,6.7,2005-07-15,Willy Wonka (Johnny Depp) has built the greate...,31 August 2010,ur2532491,True,"To my way of thinking, if you're going to rema...",...,0,0,0,0,0,0,0,115,2005,2010


In [21]:
# Convert True to 1 and False to 0 in the response variable
original_data['is_spoiler'] = original_data['is_spoiler'].astype(int)
original_data['is_spoiler']

0        0
1        1
2        1
3        0
4        0
        ..
72049    0
72050    1
72051    0
72052    1
72053    1
Name: is_spoiler, Length: 72054, dtype: int32

### Text Pre-processing

In [22]:
# Create new dataframe for spoiler and not spoiler class
df_spoiler = original_data[original_data['is_spoiler'] == 1]
df_not_spoiler = original_data[original_data['is_spoiler'] == 0]

#### Uncontract

In [23]:
# Define the uncontract function which converts short forms to the full word. For example: 'isn't' to 'is not'.
def uncontract(text):
    text = re.sub(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not", text)
    text = re.sub(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will", text)
    text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
    text = re.sub(r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have", text)

    text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text)
    text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text)
    text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text)
    text = re.sub(r"(\b)([Tt]here)'s", r"\1\2 is", text)
    text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text)
    text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text)
    text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)

    return text

In [24]:
original_data['review_text'] = [uncontract(t) for t in original_data['review_text']]
original_data['review_summary'] = [uncontract(t) for t in original_data['review_summary']]

In [25]:
# Create new dataframe for spoiler and not spoiler class
df_spoiler = original_data[original_data['is_spoiler'] == 1]
df_not_spoiler = original_data[original_data['is_spoiler'] == 0]

#### Tokenization

In [26]:
spoiler_tokens_text = [word_tokenize(t) for t in df_spoiler['review_text']]
spoiler_tokens_summary = [word_tokenize(t) for t in df_spoiler['review_summary']]

not_spoiler_tokens_text = [word_tokenize(t) for t in df_not_spoiler['review_text']]
not_spoiler_tokens_summary = [word_tokenize(t) for t in df_not_spoiler['review_summary']]

In [27]:
# Convert to a single list
spoiler_tokens_text = list(itertools.chain.from_iterable(spoiler_tokens_text))
spoiler_tokens_summary = list(itertools.chain.from_iterable(spoiler_tokens_summary))

not_spoiler_tokens_text = list(itertools.chain.from_iterable(not_spoiler_tokens_text))
not_spoiler_tokens_summary = list(itertools.chain.from_iterable(not_spoiler_tokens_summary))

spoiler_tokens = spoiler_tokens_text + spoiler_tokens_summary
not_spoiler_tokens = not_spoiler_tokens_text + not_spoiler_tokens_summary

In [28]:
# Check the number of unique tokens
print('Number of Unique Spoiler Tokens:', len(set(spoiler_tokens)))
print('Number of Unique Non-spoiler Tokens:', len(set(not_spoiler_tokens)))

Number of Unique Spoiler Tokens: 215891
Number of Unique Non-spoiler Tokens: 171314


In [29]:
# Check the most frequent tokens for Spoilers and Non-spoilers
print('Spoiler Tokens (full text):\n')

for t, f in Counter(spoiler_tokens_text).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Spoiler Tokens (summary):\n')

for t, f in Counter(spoiler_tokens_summary).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Non Spoiler Tokens (full text):\n')

for t, f in Counter(not_spoiler_tokens_text).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Non Spoiler Tokens (summary):\n')

for t, f in Counter(not_spoiler_tokens_summary).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()

Spoiler Tokens (full text):

the                       640924
,                         572117
.                         488514
and                       334458
a                         305482
of                        291265
to                        289788
is                        230213
in                        177901
I                         162801

Spoiler Tokens (summary):

,                         7585
the                       7166
!                         7162
.                         6725
of                        5517
a                         4871
and                       4112
movie                     3711
A                         3605
to                        3048

Non Spoiler Tokens (full text):

the                       429904
,                         401464
.                         359175
and                       243554
a                         220162
of                        210827
to                        192991
is                        172367
I    

In [30]:

def dimensionality_info(spoilertext ,spoilersummary ,nonspoilertext , nonspoilersummary):

    print('Number of Spoiler Text Tokens:    ', len(spoilertext),     '/', len(spoiler_tokens_text))
    print('Number of Spoiler Summary Tokens:', len(spoilersummary), '/', len(spoiler_tokens_summary))
    print('Number of Spoiler Summary Tokens:', len(nonspoilertext), '/', len(not_spoiler_tokens_text))
    print('Number of Spoiler Summary Tokens:', len(nonspoilersummary), '/', len(not_spoiler_tokens_summary))

    print()

    print('Number of Unique Spoiler Text Tokens:    ', len(set(spoilertext)),     '/', len(spoiler_tokens_text))
    print('Number of Unique Spoiler Summary Tokens:', len(set(spoilersummary)), '/', len(spoiler_tokens_summary))
    print('Number of Unique Spoiler Summary Tokens:', len(set(nonspoilertext)), '/', len(not_spoiler_tokens_text))
    print('Number of Unique Spoiler Summary Tokens:', len(set(nonspoilersummary)), '/', len(not_spoiler_tokens_summary))

    print()
    print('|V|/|Tokens|:            ', len(set(spoilertext)) / len(set(spoiler_tokens_text)))
    print('|V|/|Tokens|:            ', len(set(spoilersummary)) / len(set(spoiler_tokens_summary)))
    print('|V|/|Tokens|:            ', len(set(nonspoilertext)) / len(set(not_spoiler_tokens_text)))
    print('|V|/|Tokens|:            ', len(set(nonspoilersummary)) / len(set(not_spoiler_tokens_summary)))

We observe that the most frequent tokens are stopwords. For that reason we will remvove them to get more informative result.

#### Remove Punctuation and Stopwords

##### Remove Punctuation

In [31]:
spoiler_punctuations_text = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        spoiler_tokens_text
    )
)
#--------------------------------------
spoiler_punctuations_summary = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        spoiler_tokens_summary
    )
)
#--------------------------------------
non_spoiler_punctuations_text = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        not_spoiler_tokens_text
    )
)
#--------------------------------------
non_spoiler_punctuations_summary = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        not_spoiler_tokens_summary
    )
)

In [32]:
spoiler_punctuations_text = set(spoiler_punctuations_text)
spoiler_punctuations_summary = set(spoiler_punctuations_summary)

non_spoiler_punctuations_text = set(non_spoiler_punctuations_text)
non_spoiler_punctuations_summary = set(non_spoiler_punctuations_summary)

In [33]:
# Remove punctuation
spoiler_text_clean_tokens = [t for t in spoiler_tokens_text if t not in spoiler_punctuations_text]
spoiler_summary_clean_tokens = [t for t in spoiler_tokens_summary if t not in spoiler_punctuations_summary]

non_spoiler_text_clean_tokens = [t for t in not_spoiler_tokens_text if t not in non_spoiler_punctuations_text]
non_spoiler_summary_clean_tokens = [t for t in not_spoiler_tokens_summary if t not in non_spoiler_punctuations_summary]

##### Remove Stopwords

In [34]:
stop_word_regex = '|'.join(['^{}$'.format(s) for s in stopwords.words('english')])

stop_word_regex = re.compile(stop_word_regex)

In [35]:
spoiler_stop_words_text = list(filter(stop_word_regex.match, spoiler_text_clean_tokens))
spoiler_stop_words_summary = list(filter(stop_word_regex.match, spoiler_summary_clean_tokens))

non_spoiler_stop_words_text = list(filter(stop_word_regex.match, non_spoiler_text_clean_tokens))
non_spoiler_stop_words_summary = list(filter(stop_word_regex.match, non_spoiler_summary_clean_tokens))
#----------------------------------------------------
spoiler_stop_words_text = set(spoiler_stop_words_text)
spoiler_stop_words_summary = set(spoiler_stop_words_summary)

non_spoiler_stop_words_text = set(non_spoiler_stop_words_text)
non_spoiler_stop_words_text = set(non_spoiler_stop_words_text)

In [36]:
# Remove stopwords
spoiler_text_clean_tokens = [t for t in spoiler_text_clean_tokens if t not in spoiler_stop_words_text]
spoiler_summary_clean_tokens = [t for t in spoiler_summary_clean_tokens if t not in spoiler_stop_words_summary]

non_spoiler_text_clean_tokens = [t for t in non_spoiler_text_clean_tokens if t not in non_spoiler_stop_words_text]
non_spoiler_summary_clean_tokens = [t for t in non_spoiler_summary_clean_tokens if t not in non_spoiler_stop_words_text]

Check the most frequent tokens after removing punctuation and stopwords.

In [37]:
# Check again the most frequent tokens for Spoilers and Non-spoilers
print('Clean Spoiler Tokens (full text):\n')

for t, f in Counter(spoiler_text_clean_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Spoiler Tokens (summary):\n')

for t, f in Counter(spoiler_summary_clean_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Non Spoiler Tokens (full text):\n')

for t, f in Counter(non_spoiler_text_clean_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Non Spoiler Tokens (summary):\n')

for t, f in Counter(non_spoiler_summary_clean_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()

Clean Spoiler Tokens (full text):

I                         162801
's                        116815
movie                     98547
The                       79859
film                      74521
one                       44210
like                      40385
It                        33041
would                     28817
good                      28745

Clean Spoiler Tokens (summary):

movie                     3711
A                         3605
The                       2870
I                         2403
's                        2132
film                      1832
good                      1288
best                      999
Movie                     968
Not                       951

Clean Non Spoiler Tokens (full text):

I                         129352
movie                     83875
's                        80788
film                      62501
The                       59081
one                       34133
like                      29159
It                        27425
good 

We can observe that the possesive s does not give much information. Therefore we decided to consider it as a stopword and remove it.

In [38]:
spoiler_text_clean_tokens = [t for t in spoiler_text_clean_tokens if t != "'s"]
spoiler_summary_clean_tokens = [t for t in spoiler_summary_clean_tokens if t != "'s"]

non_spoiler_text_clean_tokens = [t for t in non_spoiler_text_clean_tokens if t != "'s"]
non_spoiler_summary_clean_tokens = [t for t in non_spoiler_summary_clean_tokens if t != "'s"]

In [39]:
######eminame sto lemmatization stemming lab 2

Lowercasing

In [40]:
#lowercasing

spoiler_text_clean_tokens     = [t.lower() for t in spoiler_text_clean_tokens]
spoiler_summary_clean_tokens = [t.lower() for t in spoiler_summary_clean_tokens]

non_spoiler_text_clean_tokens     = [t.lower() for t in non_spoiler_text_clean_tokens]
non_spoiler_summary_clean_tokens = [t.lower() for t in non_spoiler_summary_clean_tokens]

Stemming

In [41]:
ps = PorterStemmer()
stems_spoiler_text_clean_tokens    = [ps.stem(t) for t in spoiler_text_clean_tokens]
stems_spoiler_summary_clean_tokens    = [ps.stem(t) for t in spoiler_summary_clean_tokens]
stems_non_spoiler_text_clean_tokens    = [ps.stem(t) for t in non_spoiler_text_clean_tokens]
stems_non_spoiler_summary_clean_tokens    = [ps.stem(t) for t in non_spoiler_summary_clean_tokens]


In [42]:
print('Spoiler Text Stems:\n')

for t, f in Counter(stems_spoiler_text_clean_tokens).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()
#-----------------------------
print('Spoiler Summary Stems:\n')

for t, f in Counter(stems_spoiler_summary_clean_tokens).most_common(10): 
    print('{0:25} {1}'.format(t, f))
    
print()
#-----------------------------
print('Non Spoiler Text Stems:\n')

for t, f in Counter(stems_non_spoiler_text_clean_tokens).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()
#-----------------------------
print('Non Spoiler Summary Stems:\n')

for t, f in Counter(stems_non_spoiler_summary_clean_tokens).most_common(10): 
    print('{0:25} {1}'.format(t, f))
    
print()

Spoiler Text Stems:

i                         162813
movi                      115564
film                      88309
the                       81729
one                       50033
like                      47986
charact                   38686
it                        34810
time                      31561
good                      30349

Spoiler Summary Stems:

movi                      5225
a                         3605
the                       2922
film                      2794
i                         2403
good                      2166
great                     1789
best                      1599
one                       1462
not                       1024

Non Spoiler Text Stems:

i                         129355
movi                      99690
film                      75037
the                       60283
one                       38227
like                      34683
it                        28995
charact                   27152
good                      25166
time   

In [43]:
dimensionality_info(stems_spoiler_text_clean_tokens, stems_spoiler_summary_clean_tokens,
                    stems_non_spoiler_text_clean_tokens,stems_non_spoiler_summary_clean_tokens)

Number of Spoiler Text Tokens:     6674350 / 13769910
Number of Spoiler Summary Tokens: 156035 / 260104
Number of Spoiler Summary Tokens: 4712969 / 9677049
Number of Spoiler Summary Tokens: 150627 / 250158

Number of Unique Spoiler Text Tokens:     152683 / 13769910
Number of Unique Spoiler Summary Tokens: 12375 / 260104
Number of Unique Spoiler Summary Tokens: 118423 / 9677049
Number of Unique Spoiler Summary Tokens: 11613 / 250158

|V|/|Tokens|:             0.7193138668249616
|V|/|Tokens|:             0.5638840791032534
|V|/|Tokens|:             0.7057263576932474
|V|/|Tokens|:             0.5630272471637738


Lemmatising

In [44]:
lemmatizer = WordNetLemmatizer()
lemmas_spoiler_text_clean_tokens    = [lemmatizer.lemmatize(t) for t in spoiler_text_clean_tokens]
lemmas_spoiler_summary_clean_tokens    = [lemmatizer.lemmatize(t) for t in spoiler_summary_clean_tokens]
lemmas_non_spoiler_text_clean_tokens    = [lemmatizer.lemmatize(t) for t in non_spoiler_text_clean_tokens]
lemmas_non_spoiler_summary_clean_tokens    = [lemmatizer.lemmatize(t) for t in non_spoiler_summary_clean_tokens]


In [45]:
print('Spoiler Text Lemmas:\n')

for t, f in Counter(lemmas_spoiler_text_clean_tokens).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()
#-----------------------------
print('Spoiler Summary Lemmas:\n')

for t, f in Counter(lemmas_spoiler_summary_clean_tokens).most_common(10): 
    print('{0:25} {1}'.format(t, f))
    
print()
#-----------------------------
print('Non Spoiler Text Lemmas:\n')

for t, f in Counter(lemmas_non_spoiler_text_clean_tokens).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()
#-----------------------------
print('Non Spoiler Summary Lemmas:\n')

for t, f in Counter(lemmas_non_spoiler_summary_clean_tokens).most_common(10): 
    print('{0:25} {1}'.format(t, f))
    
print()

Spoiler Text Lemmas:

i                         162820
movie                     115562
film                      87090
the                       81728
one                       50031
like                      42819
character                 38684
it                        34810
time                      31225
good                      30092

Spoiler Summary Lemmas:

movie                     5225
a                         3763
the                       2922
film                      2784
i                         2403
good                      2159
great                     1768
best                      1599
one                       1462
not                       1024

Non Spoiler Text Lemmas:

i                         129359
movie                     99689
film                      73940
the                       60283
one                       38226
like                      30920
it                        28995
character                 27150
good                      24989
time

In [46]:
dimensionality_info(lemmas_spoiler_text_clean_tokens, lemmas_spoiler_summary_clean_tokens,
                    lemmas_non_spoiler_text_clean_tokens,lemmas_non_spoiler_summary_clean_tokens)

Number of Spoiler Text Tokens:     6674350 / 13769910
Number of Spoiler Summary Tokens: 156035 / 260104
Number of Spoiler Summary Tokens: 4712969 / 9677049
Number of Spoiler Summary Tokens: 150627 / 250158

Number of Unique Spoiler Text Tokens:     174773 / 13769910
Number of Unique Spoiler Summary Tokens: 14867 / 260104
Number of Unique Spoiler Summary Tokens: 136946 / 9677049
Number of Unique Spoiler Summary Tokens: 13894 / 250158

|V|/|Tokens|:             0.8233833658403295
|V|/|Tokens|:             0.6774355235578238
|V|/|Tokens|:             0.8161117500879007
|V|/|Tokens|:             0.6736158246872879


In [None]:
#Many of the spoiler and non-spoiler most common words are the same - therefore we can try to isolate the 
#most common words that appear *only* in spoiler reviews and *only* in negative reviews

In [55]:
##############################################
# Define an informative function to view the #
# information of frequencies between the two #
# sets of tokens passed as input.            #
##############################################

def comparison_info(spoiler, nonspoiler, desc=None, n=10):

    print(f'{desc.capitalize()} only in Spoiler reviews:')
    print()

    spoilerfreq = dict(Counter(spoiler))
    nonspoilerfreq = dict(Counter(nonspoiler))

    onlyspoiler = set(spoilerfreq).difference(set(nonspoilerfreq))
    onlynonspoiler = set(nonspoilerfreq).difference(set(spoilerfreq))
    common   = set(spoilerfreq).intersection(set(nonspoilerfreq))

    for u in sorted(onlyspoiler, key=lambda t: spoilerfreq[t], reverse=True)[:n]: print('- {0:35} {1}'.format(u, spoilerfreq[u]))

    print()
    print(f'{desc.capitalize()} only in Non-Spoiler Reviews:')
    print()

    for u in sorted(onlynonspoiler, key=lambda t: nonspoilerfreq[t], reverse=True)[:n]: print('- {0:35} {1}'.format(u, nonspoilerfreq[u]))

    print()
    print(f'{desc.capitalize()} common in Spoiler and Non spoiler reviews:')
    print()

    print('{0:37} {1:10} {2:10}'.format('Token', 'Spoiler', 'Non Spoiler'))
    print('------------------------------------------------------------')

    for u in sorted(common, key=lambda t: spoilerfreq[t] + nonspoilerfreq[t], reverse=True)[:n]:

        print('- {0:35} {1:<10} {2:<10}'.format(u, spoilerfreq[u], nonspoilerfreq[u]))

In [56]:
comparison_info(
  lemmas_spoiler_text_clean_tokens,
  lemmas_non_spoiler_text_clean_tokens,
  desc = 'Lemmas',
  n    = 20
)
comparison_info(
  lemmas_spoiler_summary_clean_tokens,
  lemmas_non_spoiler_summary_clean_tokens,
  desc = 'Lemmas',
  n    = 20
)

Lemmas only in Spoiler reviews:

- sorbonne                            32
- poffy                               25
- mink                                24
- dauphine                            23
- 'vertigo                            21
- incriminating                       21
- arendelle                           20
- yvelines                            20
- sylvie                              20
- ludlow                              20
- sunnyside                           19
- sulaco                              19
- danni                               19
- orbiting                            18
- direction-set                       18
- vivan                               18
- payload                             17
- dashwood                            17
- facehugger                          17
- murron                              17

Lemmas only in Non-Spoiler Reviews:

- cthanks                             17
- lithuania                           16
- -chris                   

Comment: mallon ta results pano ennen toso useful : perhaps removing ALL common words in both spoiler and non spoiler is a bit harsh and removes way too many words/ much info. 

Another approach we can follow is:, we remove the top 10 common in both lemmas (ie movie, a, film, the, i, good, great etc. since they dont give much information on whether a review contains a spoiler or not. 

After doing so we can check the results and filter more words out if needed.