# Detection of Movie Spoilers in Reviews

### ***Import essential libraries***

In [None]:
import pandas as pd
import numpy as np
import re, string, nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import itertools
import spacy
from collections import Counter
from nltk.stem import WordNetLemmatizer


nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('punkt_tab')


### Understanding the data

Apories:
1. what is rating_x, rating_y IMDB???
2. maybe convert duration to minutes would be better to compare

In [5]:
# Load the dataset
original_data = pd.read_csv('movie_spoiler_sample.csv')

In [6]:
# First view of the dataset
original_data.head(5)

Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,"['Action', 'Adventure', 'Comedy']",6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,7,Splendid adventure film with mesmerizing deser...
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,"['Action', 'Adventure', 'Fantasy']",6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,10,Epic movie for fans and non fans
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,"['Action', 'Comedy']",7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",8,once again a funny British film
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,"['Crime', 'Drama', 'Mystery']",8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,1,For he made a 'plot twist' out of it
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,"['Fantasy', 'Mystery', 'Romance']",6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",10,As brilliant as I recalled it!


The dataset is very large and run time error occurs therefore we decided to use undersampling to obtain a balanced sample of the data.

In [8]:
# Check the count per movie id
original_data['movie_id'].value_counts()

movie_id
tt0468569    643
tt0111161    565
tt0167260    328
tt0137523    272
tt0110912    265
            ... 
tt0383534      3
tt0110989      2
tt5580036      2
tt0107719      1
tt0374887      1
Name: count, Length: 1339, dtype: int64

In [9]:
# Check how many unique movies we have
len(original_data['movie_id'].value_counts())

1339

For the above reason we decided to take a sample of at most 4 reviews per movie (or less if they don't have enough reviews).

In [11]:
# Sample each movie's reviews based on the given criteria
sampled_data = original_data.groupby('movie_id', group_keys=False, as_index=False).apply(
    lambda movie_df: movie_df if len(movie_df) <= 4 else pd.concat([
        movie_df[movie_df['is_spoiler'] == True].sample(n=min(2, len(movie_df[movie_df['is_spoiler'] == True])), random_state=42),
        movie_df[movie_df['is_spoiler'] == False].sample(n=min(2, len(movie_df[movie_df['is_spoiler'] == False])), random_state=42)
    ])
).reset_index(drop=True)

sampled_data.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary
0,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,1 February 2006,ur5945598,True,Oh where can I start on why alleged comedians ...,10,Chaplin strikes comic gold!
1,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,11 August 2005,ur5805910,True,"We follow ""the little fellow"" (Chaplin), in hi...",10,Chaplin's delightful bonhomie & innocence
2,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,22 June 2000,ur0773000,False,I recently saw this movie with a live orchestr...,9,"As fresh, funny, and moving as the day it was ..."
3,tt0015864,A lone prospector ventures into Alaska looking...,1h 35min,"['Adventure', 'Comedy', 'Drama']",8.2,1925,It is in the middle of the Gold Rush. A Lone P...,9 March 2009,ur0361658,False,The Gold Rush (1925) was a big undertaking for...,10,The Cinema of Charles Chaplin: The Gold Rush
4,tt0017136,"Sometime in the future, the city of Metropolis...",2h 33min,"['Drama', 'Sci-Fi']",8.3,1927-03-13,"The film is set in the year 2026, in the extra...",7 March 2015,ur13977076,True,This sci-fi classic is set in a future where t...,10,A science fiction classic


In [12]:
# Number of records and features for the dataset
sampled_data.shape

(5335, 13)

In [13]:
# Check if the sample is balanced
sampled_data['is_spoiler'].value_counts()

is_spoiler
False    2677
True     2658
Name: count, dtype: int64

The target variable is balanced.

In [16]:
sampled_data['whole_review'] = sampled_data['review_text'] + sampled_data['review_summary']

In [17]:
# View the columns of the dataset
sampled_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'genre', 'rating_x',
       'release_date', 'plot_synopsis', 'review_date', 'user_id', 'is_spoiler',
       'review_text', 'rating_y', 'review_summary', 'whole_review'],
      dtype='object')

In [18]:
sampled_data['whole_plot'] = sampled_data['plot_synopsis'] + sampled_data['plot_summary']

The target variable is balanced.

In [20]:
# Check for duplicated records
sampled_data.duplicated().any()

False

There are no duplicated records.

In [22]:
# Check the data types
sampled_data.dtypes

movie_id           object
plot_summary       object
duration           object
genre              object
rating_x          float64
release_date       object
plot_synopsis      object
review_date        object
user_id            object
is_spoiler           bool
review_text        object
rating_y            int64
review_summary     object
whole_review       object
whole_plot         object
dtype: object

In [23]:
# Save the sampled data as a csv
sampled_data.to_csv("sampled_data.csv", index=False)

### Named entity Recognition on the plot synopsis and review to find the main characters

In [32]:
#This must be run every time before NER, we may put this in the beginning of the notebook
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 2.8 MB/s eta 0:00:05
     ---- ----------------------------------- 1.6/12.8 MB 3.7 MB/s eta 0:00:04
     ------- -------------------------------- 2.4/12.8 MB 3.8 MB/s eta 0:00:03
     --------- ------------------------------ 2.9/12.8 MB 3.5 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 3.1 MB/s eta 0:00:03
     ------------ --------------------------- 3.9/12.8 MB 3.1 MB/s eta 0:00:03
     -------------- ------------------------- 4.7/12.8 MB 3.1 MB/s eta 0:00:03
     ---------------- ----------------------- 5.2/12.8 MB 3.1 MB/s eta 0:00:03
     ----------------- ---------------------- 5.

In [34]:

# Load spaCy model
nlp = spacy.load("en_core_web_sm")  

# Group reviews by movie_id
grouped_reviews = sampled_data.groupby("movie_id")["plot_summary"].apply(lambda x: " ".join(x))

# Dictionary to store PERSON entity counts per movie
person_entities_by_movie = {}

# Process each movie's reviews separately
for movie_id, text in grouped_reviews.items():
    doc = nlp(text)  # Apply spaCy NER directly without tokenization
    person_entities = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]  # Extract only PERSON entities
    
    # Count occurrences of each PERSON entity
    person_counts = Counter(person_entities)
    
    # Store results
    person_entities_by_movie[movie_id] = person_counts

# Print the most common PERSON entities per movie
for movie_id, person_counts in person_entities_by_movie.items():
    print(f"Movie ID: {movie_id} - Most Common PERSON Entities:")
    for entity, count in person_counts.most_common(5):  # Top 5 most common PERSON names
        print(f"{entity}: {count} times")
    print("\n" + "="*40 + "\n")  # Separator for readability


Movie ID: tt0015864 - Most Common PERSON Entities:
John J. Magee: 4 times


Movie ID: tt0017136 - Most Common PERSON Entities:
Metropolis: 8 times
Maria: 8 times
Freder Fredersen: 4 times
Freder: 4 times
John Fredersen: 4 times


Movie ID: tt0017925 - Most Common PERSON Entities:
Johnnie: 8 times
Annabelle Lee: 4 times
Ed Stephan: 4 times


Movie ID: tt0018455 - Most Common PERSON Entities:
Indre: 4 times


Movie ID: tt0019254 - Most Common PERSON Entities:
Jeanne D'Arc: 4 times
Jeanne: 4 times


Movie ID: tt0021749 - Most Common PERSON Entities:
John J. Magee: 4 times


Movie ID: tt0022100 - Most Common PERSON Entities:
Hans Beckert: 4 times
Hans: 4 times
Claudio Carvalho: 4 times


Movie ID: tt0025316 - Most Common PERSON Entities:
King Westley: 4 times
King: 4 times
Peter Warne: 4 times
Warne: 4 times
Peter: 4 times


Movie ID: tt0031381 - Most Common PERSON Entities:
Scarlett: 16 times
Ashley: 8 times
Melanie: 8 times
Tara: 4 times
Mammy: 4 times


Movie ID: tt0031679 - Most Common

##### The small model gives accurate results but we may also use the large one in a similar manner - for runtime purposes I will not run the following cells.

In [None]:
#!python -m spacy download en_core_web_lg


In [None]:
'''
# Load spaCy model
nlp = spacy.load("en_core_web_lg")  # Changed to a larger model for better accuracy

# Group reviews by movie_id
grouped_reviews = sampled_data.groupby("movie_id")["plot_summary"].apply(lambda x: " ".join(x))

# Dictionary to store PERSON entity counts per movie
person_entities_by_movie = {}

# Process each movie's reviews separately
for movie_id, text in grouped_reviews.items():
    doc = nlp(text)  # Apply spaCy NER directly without tokenization
    person_entities = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]  # Extract only PERSON entities
    
    # Count occurrences of each PERSON entity
    person_counts = Counter(person_entities)
    
    # Store results
    person_entities_by_movie[movie_id] = person_counts

# Print the most common PERSON entities per movie
for movie_id, person_counts in person_entities_by_movie.items():
    print(f"Movie ID: {movie_id} - Most Common PERSON Entities:")
    for entity, count in person_counts.most_common(5):  # Top 5 most common PERSON names
        print(f"{entity}: {count} times")
    print("\n" + "="*40 + "\n")  # Separator for readability
'''

In [None]:
#remove starting and trailing white spaces from strings
columns_to_strip = ['movie_id','plot_summary','duration','genre','release_date','plot_synopsis',
                            'review_date','user_id','review_text','review_summary']
sampled_data[columns_to_strip] = original_data[columns_to_strip].apply(lambda col: col.str.strip())

In [None]:
# Check for missing values
sampled_data.isna().any()

There are no missing values.

In [None]:
# Check for empty strings
(sampled_data == '').sum()

There are no empty strings.

In [None]:
genre = sampled_data['genre']
genre

In [None]:
# Check if every element of genre feature is a string in order to make them lists
sampled_data['genre'].apply(lambda x: isinstance(x, str)).all()

In [None]:
# split the genres and remove the "[" "]" characters
sampled_data["genre"] = sampled_data["genre"].str.split(",")

sampled_data_exploded = sampled_data.explode('genre')
sampled_data_exploded['genre'] = sampled_data_exploded['genre'].str.replace(r"[\[\]' ]", "", regex=True)

# Apply one-hot encoding using get_dummies
sampled_data_encoded = pd.get_dummies(sampled_data_exploded['genre'])

# merge with the original dataframe
sampled_data_final = sampled_data.merge(sampled_data_encoded.groupby(sampled_data_exploded.index).sum(), left_index= True, right_index = True)

#  Drop the original 'genre' column if it's no longer needed
sampled_data_final = sampled_data_final.drop(columns=['genre'])
sampled_data = sampled_data_final

sampled_data

In [None]:
sampled_data.columns

The number of records does not change.

In [None]:
# Convert duration to minutes

def convert_to_minutes(duration):
    parts = duration.replace("h", "").replace("min", "").strip().split()
    
    hours = int(parts[0]) if "h" in duration else 0
    minutes = int(parts[1]) if len(parts) > 1 else 0
    
    return hours * 60 + minutes

sampled_data['duration_minutes'] = sampled_data['duration'].apply(convert_to_minutes)
sampled_data

In [None]:
sampled_data.columns

In [None]:
# Rename rating_x, rating_y
sampled_data.rename(columns={'rating_x': 'IMDB_rating', 'rating_y': 'user_rating'}, inplace=True)

In [None]:
# Create new features that only have the year of release and the year of the review

sampled_data['release_year'] = sampled_data['release_date'].str[:4]
sampled_data['review_year'] = sampled_data['review_date'].str[-4:]

sampled_data

In [None]:
# Convert True to 1 and False to 0 in the response variable
sampled_data['is_spoiler'] = sampled_data['is_spoiler'].astype(int)
sampled_data['is_spoiler']

### Text Pre-processing

In [None]:
# Create new dataframe for spoiler and not spoiler class
df_spoiler = sampled_data[sampled_data['is_spoiler'] == 1]
df_not_spoiler = sampled_data[sampled_data['is_spoiler'] == 0]

#### Uncontract

In [27]:
# Define the uncontract function which converts short forms to the full word. For example: 'isn't' to 'is not'.
def uncontract(text):
    text = re.sub(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not", text)
    text = re.sub(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will", text)
    text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
    text = re.sub(r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have", text)

    text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text)
    text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text)
    text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text)
    text = re.sub(r"(\b)([Tt]here)'s", r"\1\2 is", text)
    text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text)
    text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text)
    text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)

    return text

In [28]:
sampled_data['review_text'] = [uncontract(t) for t in sampled_data['review_text']]
sampled_data['review_summary'] = [uncontract(t) for t in sampled_data['review_summary']]

In [29]:
# Create new dataframe for spoiler and not spoiler class
df_spoiler = sampled_data[sampled_data['is_spoiler'] == 1]
df_not_spoiler = sampled_data[sampled_data['is_spoiler'] == 0]

#### Tokenization

In [30]:
spoiler_tokens = [word_tokenize(t) for t in df_spoiler['whole_review']]

not_spoiler_tokens = [word_tokenize(t) for t in df_not_spoiler['whole_review']]

In [31]:
# Convert to a single list
spoiler_tokens = list(itertools.chain.from_iterable(spoiler_tokens))

not_spoiler_tokens = list(itertools.chain.from_iterable(not_spoiler_tokens))

In [32]:
# Check the number of unique tokens
print('Number of Unique Spoiler Tokens:', len(set(spoiler_tokens)))
print('Number of Unique Non-spoiler Tokens:', len(set(not_spoiler_tokens)))

Number of Unique Spoiler Tokens: 50505
Number of Unique Non-spoiler Tokens: 40262


In [33]:
# Check the most frequent tokens for Spoilers and Non-spoilers
print('Spoiler Tokens:\n')

for t, f in Counter(spoiler_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Non Spoiler Tokens:\n')

for t, f in Counter(not_spoiler_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()

Spoiler Tokens:

the                       46382
,                         42739
.                         34238
and                       25402
a                         23422
to                        21460
of                        21416
is                        17486
in                        13383
that                      11254

Non Spoiler Tokens:

the                       32025
,                         30217
.                         25088
and                       18750
a                         16877
of                        15589
to                        14373
is                        13083
I                         9328
in                        9128



In [34]:
def dimensionality_info(spoilertext ,nonspoilertext):

    print('Number of Spoiler Tokens:    ', len(spoilertext),     '/', len(spoiler_tokens))
    print('Number of Non Spoiler Tokens:', len(nonspoilertext), '/', len(not_spoiler_tokens))

    print()

    print('Number of Unique Spoiler Tokens:    ', len(set(spoilertext)),     '/', len(spoiler_tokens))
    print('Number of Unique Non Spoiler Tokens:', len(set(nonspoilertext)), '/', len(not_spoiler_tokens))

    print()
    print('|V|/|Tokens|:            ', len(set(spoilertext)) / len(set(spoiler_tokens)))
    print('|V|/|Tokens|:            ', len(set(nonspoilertext)) / len(set(not_spoiler_tokens)))

We observe that the most frequent tokens are stopwords. For that reason we will remvove them to get more informative result.

#### Remove Punctuation and Stopwords

##### Remove Punctuation

In [35]:
spoiler_punctuations = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        spoiler_tokens
    )
)
#--------------------------------------
non_spoiler_punctuations = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        not_spoiler_tokens
    )
)

In [36]:
spoiler_punctuations = set(spoiler_punctuations)

non_spoiler_punctuations = set(non_spoiler_punctuations)

In [37]:
# Remove punctuation
spoiler_clean_tokens = [t for t in spoiler_tokens if t not in spoiler_punctuations]

non_spoiler_clean_tokens = [t for t in not_spoiler_tokens if t not in non_spoiler_punctuations]

##### Remove Stopwords

In [38]:
stop_word_regex = '|'.join(['^{}$'.format(s) for s in stopwords.words('english')])

stop_word_regex = re.compile(stop_word_regex)

In [39]:
spoiler_stop_words = list(filter(stop_word_regex.match, spoiler_clean_tokens))

non_spoiler_stop_words = list(filter(stop_word_regex.match, non_spoiler_clean_tokens))
#----------------------------------------------------
spoiler_stop_words = set(spoiler_stop_words)

non_spoiler_stop_words = set(non_spoiler_stop_words)

In [40]:
# Remove stopwords
spoiler_clean_tokens = [t for t in spoiler_clean_tokens if t not in spoiler_stop_words]

non_spoiler_clean_tokens = [t for t in non_spoiler_clean_tokens if t not in non_spoiler_stop_words]

Check the most frequent tokens after removing punctuation and stopwords.

In [41]:
# Check again the most frequent tokens for Spoilers and Non-spoilers
print('Clean Spoiler Tokens:\n')

for t, f in Counter(spoiler_clean_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Non Spoiler Tokens:\n')

for t, f in Counter(non_spoiler_clean_tokens).most_common(10):

    print('{0:25} {1}'.format(t, f))

print()

Clean Spoiler Tokens:

I                         10988
's                        9686
movie                     6808
The                       5916
film                      5624
n't                       4986
one                       3203
like                      2896
It                        2405
good                      2063

Clean Non Spoiler Tokens:

I                         9328
's                        6743
movie                     6358
film                      4835
The                       4344
n't                       3787
one                       2588
like                      2148
It                        2076
good                      1843



We can observe that the possesive s does not give much information. Therefore we decided to consider it as a stopword and remove it.

In [42]:
spoiler_clean_tokens = [t for t in spoiler_clean_tokens if t != "'s"]

non_spoiler_clean_tokens = [t for t in non_spoiler_clean_tokens if t != "'s"]

#### Normalization

##### Lowercasing

In [43]:
spoiler_clean_tokens     = [t.lower() for t in spoiler_clean_tokens]

non_spoiler_clean_tokens     = [t.lower() for t in non_spoiler_clean_tokens]

##### Stemming

In [44]:
ps = PorterStemmer()

stems_spoiler = [ps.stem(t) for t in spoiler_clean_tokens]
stems_non_spoiler = [ps.stem(t) for t in non_spoiler_clean_tokens]

In [45]:
print('Spoiler Stems:\n')

for t, f in Counter(stems_spoiler).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()
#-----------------------------
print('Non Spoiler Stems:\n')

for t, f in Counter(stems_non_spoiler).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()

Spoiler Stems:

i                         10989
movi                      8033
film                      6683
the                       6046
n't                       5004
one                       3643
like                      3454
charact                   2712
it                        2548
time                      2453

Non Spoiler Stems:

i                         9329
movi                      7570
film                      5814
the                       4448
n't                       3799
one                       2917
like                      2581
it                        2175
charact                   2047
good                      1966



In [46]:
dimensionality_info(stems_spoiler, stems_non_spoiler)

Number of Spoiler Tokens:     506174 / 1021153
Number of Non Spoiler Tokens: 364236 / 730968

Number of Unique Spoiler Tokens:     33314 / 1021153
Number of Unique Non Spoiler Tokens: 26725 / 730968

|V|/|Tokens|:             0.6596178596178596
|V|/|Tokens|:             0.6637772589538523


##### Lemmatizing

In [47]:
lemmatizer = WordNetLemmatizer()

lemmas_spoiler = [lemmatizer.lemmatize(t) for t in spoiler_clean_tokens]
lemmas_non_spoiler = [lemmatizer.lemmatize(t) for t in non_spoiler_clean_tokens]

In [49]:
print('Spoiler Lemmas:\n')

for t, f in Counter(lemmas_spoiler).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()
#-----------------------------
print('Non Spoiler Lemmas:\n')

for t, f in Counter(lemmas_non_spoiler).most_common(10):
    print('{0:25} {1}'.format(t, f))

print()

Spoiler Lemmas:

i                         10988
movie                     8033
film                      6599
the                       6046
n't                       5004
one                       3643
like                      3082
character                 2712
it                        2548
time                      2428

Non Spoiler Lemmas:

i                         9328
movie                     7570
film                      5718
the                       4448
n't                       3799
one                       2917
like                      2290
it                        2175
character                 2047
good                      1952



In [51]:
dimensionality_info(lemmas_spoiler, lemmas_non_spoiler)

Number of Spoiler Tokens:     506174 / 1021153
Number of Non Spoiler Tokens: 364236 / 730968

Number of Unique Spoiler Tokens:     40430 / 1021153
Number of Unique Non Spoiler Tokens: 32250 / 730968

|V|/|Tokens|:             0.8005148005148005
|V|/|Tokens|:             0.8010034275495505


In [None]:
#Many of the spoiler and non-spoiler most common words are the same - therefore we can try to isolate the 
#most common words that appear *only* in spoiler reviews and *only* in negative reviews

In [52]:
##############################################
# Define an informative function to view the #
# information of frequencies between the two #
# sets of tokens passed as input.            #
##############################################

def comparison_info(spoiler, nonspoiler, desc=None, n=10):

    print(f'{desc.capitalize()} only in Spoiler reviews:')
    print()

    spoilerfreq = dict(Counter(spoiler))
    nonspoilerfreq = dict(Counter(nonspoiler))

    onlyspoiler = set(spoilerfreq).difference(set(nonspoilerfreq))
    onlynonspoiler = set(nonspoilerfreq).difference(set(spoilerfreq))
    common   = set(spoilerfreq).intersection(set(nonspoilerfreq))

    for u in sorted(onlyspoiler, key=lambda t: spoilerfreq[t], reverse=True)[:n]: print('- {0:35} {1}'.format(u, spoilerfreq[u]))

    print()
    print(f'{desc.capitalize()} only in Non-Spoiler Reviews:')
    print()

    for u in sorted(onlynonspoiler, key=lambda t: nonspoilerfreq[t], reverse=True)[:n]: print('- {0:35} {1}'.format(u, nonspoilerfreq[u]))

    print()
    print(f'{desc.capitalize()} common in Spoiler and Non spoiler reviews:')
    print()

    print('{0:37} {1:10} {2:10}'.format('Token', 'Spoiler', 'Non Spoiler'))
    print('------------------------------------------------------------')

    for u in sorted(common, key=lambda t: spoilerfreq[t] + nonspoilerfreq[t], reverse=True)[:n]:

        print('- {0:35} {1:<10} {2:<10}'.format(u, spoilerfreq[u], nonspoilerfreq[u]))

In [53]:
comparison_info(
  lemmas_spoiler,
  lemmas_non_spoiler,
  desc = 'Lemmas',
  n    = 10
)

Lemmas only in Spoiler reviews:

- dir                                 30
- edmond                              26
- ant                                 25
- sheeta                              21
- pazu                                19
- backdraft                           19
- atticus                             18
- susanna                             17
- nzt                                 16
- alvy                                16

Lemmas only in Non-Spoiler Reviews:

- algren                              15
- spiderwick                          12
- harding                             11
- ivy                                 10
- shu                                 10
- percy                               10
- feig                                9
- drillbit                            9
- katsumoto                           9
- tadashi                             9

Lemmas common in Spoiler and Non spoiler reviews:

Token                                 Spoiler    Non Spoiler


Comment: mallon ta results pano ennen toso useful : perhaps removing ALL common words in both spoiler and non spoiler is a bit harsh and removes way too many words/ much info. 

Another approach we can follow is:, we remove the top 10 common in both lemmas (ie movie, a, film, the, i, good, great etc. since they dont give much information on whether a review contains a spoiler or not. 

After doing so we can check the results and filter more words out if needed.