# Detection of Movie Spoilers in Reviews

### ***Import essential libraries***

In [1]:
import pandas as pd
import numpy as np
import re, string, nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import itertools
from collections import Counter

nltk.download("stopwords")
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Understanding the data

Apories:
1. what is rating_x, rating_y IMDB???
2. maybe convert duration to minutes would be better to compare

In [2]:
# Load the dataset
original_data = pd.read_csv('movie_spoiler_sample.csv')

In [3]:
# First view of the dataset
original_data.head(5)

Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,"['Action', 'Adventure', 'Comedy']",6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,7,Splendid adventure film with mesmerizing deser...
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,"['Action', 'Adventure', 'Fantasy']",6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,10,Epic movie for fans and non fans
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,"['Action', 'Comedy']",7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",8,once again a funny British film
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,"['Crime', 'Drama', 'Mystery']",8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,1,For he made a 'plot twist' out of it
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,"['Fantasy', 'Mystery', 'Romance']",6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",10,As brilliant as I recalled it!


In [4]:
# View the columns of the dataset
original_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'genre', 'rating_x',
       'release_date', 'plot_synopsis', 'review_date', 'user_id', 'is_spoiler',
       'review_text', 'rating_y', 'review_summary'],
      dtype='object')

In [5]:
# Number of records and features for the dataset
original_data.shape

(72054, 13)

In [6]:
# Check whether the response is balanced
original_data['is_spoiler'].value_counts()

is_spoiler
False    36277
True     35777
Name: count, dtype: int64

The target variable is balanced.

In [7]:
# Check for duplicated records
original_data.duplicated().any()

False

There are no duplicated records.

In [8]:
# Check the data types
original_data.dtypes

movie_id           object
plot_summary       object
duration           object
genre              object
rating_x          float64
release_date       object
plot_synopsis      object
review_date        object
user_id            object
is_spoiler           bool
review_text        object
rating_y            int64
review_summary     object
dtype: object

In [9]:
#remove starting and trailing white spaces from strings
columns_to_strip = ['movie_id','plot_summary','duration','genre','release_date','plot_synopsis',
                            'review_date','user_id','review_text','review_summary']
original_data[columns_to_strip] = original_data[columns_to_strip].apply(lambda col: col.str.strip())

In [10]:
# Check for missing values
original_data.isna().any()

movie_id          False
plot_summary      False
duration          False
genre             False
rating_x          False
release_date      False
plot_synopsis     False
review_date       False
user_id           False
is_spoiler        False
review_text       False
rating_y          False
review_summary    False
dtype: bool

There are no missing values.

In [11]:
# Check for empty strings
(original_data == '').sum()

movie_id          0
plot_summary      0
duration          0
genre             0
rating_x          0
release_date      0
plot_synopsis     0
review_date       0
user_id           0
is_spoiler        0
review_text       0
rating_y          0
review_summary    0
dtype: int64

There are no empty strings.

In [12]:
#replace empty strings with NA
# original_data[columns_to_strip] = original_data[columns_to_strip].replace('', np.nan)

In [13]:
genre = original_data['genre']
genre

0         ['Action', 'Adventure', 'Comedy']
1        ['Action', 'Adventure', 'Fantasy']
2                      ['Action', 'Comedy']
3             ['Crime', 'Drama', 'Mystery']
4         ['Fantasy', 'Mystery', 'Romance']
                        ...                
72049                            ['Comedy']
72050                 ['Comedy', 'Fantasy']
72051     ['Biography', 'Drama', 'History']
72052     ['Adventure', 'Comedy', 'Family']
72053     ['Action', 'Adventure', 'Sci-Fi']
Name: genre, Length: 72054, dtype: object

In [14]:
# Check if every element of genre feature is a string in order to make them lists
original_data['genre'].apply(lambda x: isinstance(x, str)).all()

True

In [15]:
# split the genres and remove the "[" "]" characters
original_data["genre"] = original_data["genre"].str.split(",")

original_data_exploded = original_data.explode('genre')
original_data_exploded['genre'] = original_data_exploded['genre'].str.replace(r"[\[\]' ]", "", regex=True)

# Apply one-hot encoding using get_dummies
original_data_encoded = pd.get_dummies(original_data_exploded['genre'])

# merge with the original dataframe
original_data_final = original_data.merge(original_data_encoded.groupby(original_data_exploded.index).sum(), left_index= True, right_index = True)

#  Drop the original 'genre' column if it's no longer needed
original_data_final = original_data_final.drop(columns=['genre'])
original_data =original_data_final

original_data

Unnamed: 0,movie_id,plot_summary,duration,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,...,0,0,0,0,0,0,0,0,0,0
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,...,0,0,0,0,0,0,0,0,0,0
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",...,0,0,0,0,0,0,0,0,0,0
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,...,0,0,0,1,0,0,0,0,0,0
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72049,tt0829482,"Seth and Evan are best friends, inseparable, n...",1h 53min,7.6,2007-08-17,Seth (Jonah Hill) and Evan (Michael Cera) are ...,18 February 2012,ur31632708,False,I had a very unpleasant experience watching Su...,...,0,0,0,0,0,0,0,0,0,0
72050,tt0101272,The Addams step out of Charles Addams' cartoon...,1h 39min,6.8,1991-11-22,Gomez Addams (Raúl Juliá) laments the 25-year ...,18 August 2007,ur13887584,True,"""The Addams Family"", while being entertaining,...",...,0,0,0,0,0,0,0,0,0,0
72051,tt0338751,"Biopic of billionaire Howard Hughes, starting ...",2h 50min,7.5,2004-12-25,The Aviator has no opening credits other than ...,27 January 2005,ur0180277,False,The story was interesting and the cinematograp...,...,0,0,0,0,0,0,0,0,0,0
72052,tt0367594,When Willy Wonka decides to let five children ...,1h 55min,6.7,2005-07-15,Willy Wonka (Johnny Depp) has built the greate...,31 August 2010,ur2532491,True,"To my way of thinking, if you're going to rema...",...,0,0,0,0,0,0,0,0,0,0


In [16]:
original_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'rating_x', 'release_date',
       'plot_synopsis', 'review_date', 'user_id', 'is_spoiler', 'review_text',
       'rating_y', 'review_summary', 'Action', 'Adventure', 'Animation',
       'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')

The number of records does not change.

In [17]:
# Convert duration to minutes

def convert_to_minutes(duration):
    parts = duration.replace("h", "").replace("min", "").strip().split()
    
    hours = int(parts[0]) if "h" in duration else 0
    minutes = int(parts[1]) if len(parts) > 1 else 0
    
    return hours * 60 + minutes

original_data['duration_minutes'] = original_data['duration'].apply(convert_to_minutes)
original_data

Unnamed: 0,movie_id,plot_summary,duration,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,duration_minutes
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,...,0,0,0,0,0,0,0,0,0,124
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,...,0,0,0,0,0,0,0,0,0,123
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",...,0,0,0,0,0,0,0,0,0,121
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,...,0,0,1,0,0,0,0,0,0,138
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",...,0,0,1,1,0,0,0,0,0,136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72049,tt0829482,"Seth and Evan are best friends, inseparable, n...",1h 53min,7.6,2007-08-17,Seth (Jonah Hill) and Evan (Michael Cera) are ...,18 February 2012,ur31632708,False,I had a very unpleasant experience watching Su...,...,0,0,0,0,0,0,0,0,0,113
72050,tt0101272,The Addams step out of Charles Addams' cartoon...,1h 39min,6.8,1991-11-22,Gomez Addams (Raúl Juliá) laments the 25-year ...,18 August 2007,ur13887584,True,"""The Addams Family"", while being entertaining,...",...,0,0,0,0,0,0,0,0,0,99
72051,tt0338751,"Biopic of billionaire Howard Hughes, starting ...",2h 50min,7.5,2004-12-25,The Aviator has no opening credits other than ...,27 January 2005,ur0180277,False,The story was interesting and the cinematograp...,...,0,0,0,0,0,0,0,0,0,170
72052,tt0367594,When Willy Wonka decides to let five children ...,1h 55min,6.7,2005-07-15,Willy Wonka (Johnny Depp) has built the greate...,31 August 2010,ur2532491,True,"To my way of thinking, if you're going to rema...",...,0,0,0,0,0,0,0,0,0,115


In [18]:
original_data.columns

Index(['movie_id', 'plot_summary', 'duration', 'rating_x', 'release_date',
       'plot_synopsis', 'review_date', 'user_id', 'is_spoiler', 'review_text',
       'rating_y', 'review_summary', 'Action', 'Adventure', 'Animation',
       'Biography', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western',
       'duration_minutes'],
      dtype='object')

In [19]:
# Rename rating_x, rating_y
original_data.rename(columns={'rating_x': 'IMDB_rating', 'rating_y': 'user_rating'}, inplace=True)

In [20]:
# Create new features that only have the year of release and the year of the review

original_data['release_year'] = original_data['release_date'].str[:4]
original_data['review_year'] = original_data['review_date'].str[-4:]

original_data

Unnamed: 0,movie_id,plot_summary,duration,IMDB_rating,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,...,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,duration_minutes,release_year,review_year
0,tt0318649,Master explorer and former US Navy Seal Dirk P...,2h 4min,6.0,2005-04-08,The film begins with a prologue set in Richmon...,27 May 2005,ur3270789,False,The film starts in the Richmond battle (1865) ...,...,0,0,0,0,0,0,0,124,2005,2005
1,tt0803096,When the world of the Orcs of Draenor is being...,2h 3min,6.9,2016-06-10,"For ages in the region of Middle Earth, humans...",20 June 2016,ur47788388,True,I saw the movie with six friends and we all lo...,...,0,0,0,0,0,0,0,123,2016,2016
2,tt0425112,"Top London cop, PC Nicholas Angel is good. Too...",2h 1min,7.9,2007-04-20,Nicholas Angel (Simon Pegg) is undoubtedly Lon...,11 June 2007,ur14440242,True,"I enjoyed this movie very much, but it being l...",...,0,0,0,0,0,0,0,121,2007,2007
3,tt0327056,In the summer of 1975 in a neighborhood in Bos...,2h 18min,8.0,2003-10-15,"Three young boys, all the best of friends, are...",5 November 2015,ur63623011,False,What you get here is no more than Clint eastwo...,...,1,0,0,0,0,0,0,138,2003,2015
4,tt0259711,"Incarcerated and charged with murder, David Aa...",2h 16min,6.9,2001-12-14,"David Aames (Tom Cruise) drives to work, he fi...",12 August 2016,ur2781970,False,"I think the first time I saw this movie, I did...",...,1,1,0,0,0,0,0,136,2001,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72049,tt0829482,"Seth and Evan are best friends, inseparable, n...",1h 53min,7.6,2007-08-17,Seth (Jonah Hill) and Evan (Michael Cera) are ...,18 February 2012,ur31632708,False,I had a very unpleasant experience watching Su...,...,0,0,0,0,0,0,0,113,2007,2012
72050,tt0101272,The Addams step out of Charles Addams' cartoon...,1h 39min,6.8,1991-11-22,Gomez Addams (Raúl Juliá) laments the 25-year ...,18 August 2007,ur13887584,True,"""The Addams Family"", while being entertaining,...",...,0,0,0,0,0,0,0,99,1991,2007
72051,tt0338751,"Biopic of billionaire Howard Hughes, starting ...",2h 50min,7.5,2004-12-25,The Aviator has no opening credits other than ...,27 January 2005,ur0180277,False,The story was interesting and the cinematograp...,...,0,0,0,0,0,0,0,170,2004,2005
72052,tt0367594,When Willy Wonka decides to let five children ...,1h 55min,6.7,2005-07-15,Willy Wonka (Johnny Depp) has built the greate...,31 August 2010,ur2532491,True,"To my way of thinking, if you're going to rema...",...,0,0,0,0,0,0,0,115,2005,2010


In [21]:
# Convert True to 1 and False to 0 in the response variable
original_data['is_spoiler'] = original_data['is_spoiler'].astype(int)
original_data['is_spoiler']

0        0
1        1
2        1
3        0
4        0
        ..
72049    0
72050    1
72051    0
72052    1
72053    1
Name: is_spoiler, Length: 72054, dtype: int32

### Text Pre-processing

In [22]:
# Create new dataframe for spoiler and not spoiler class
df_spoiler = original_data[original_data['is_spoiler'] == 1]
df_not_spoiler = original_data[original_data['is_spoiler'] == 0]

#### Uncontract

In [23]:
# Define the uncontract function which converts short forms to the full word. For example: 'isn't' to 'is not'.
def uncontract(text):
    text = re.sub(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not", text)
    text = re.sub(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will", text)
    text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
    text = re.sub(r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have", text)

    text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text)
    text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text)
    text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text)
    text = re.sub(r"(\b)([Tt]here)'s", r"\1\2 is", text)
    text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text)
    text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text)
    text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)

    return text

In [24]:
original_data['review_text'] = [uncontract(t) for t in original_data['review_text']]
original_data['review_summary'] = [uncontract(t) for t in original_data['review_summary']]

In [25]:
# Create new dataframe for spoiler and not spoiler class
df_spoiler = original_data[original_data['is_spoiler'] == 1]
df_not_spoiler = original_data[original_data['is_spoiler'] == 0]

#### Tokenization

In [26]:
spoiler_tokens_text = [word_tokenize(t) for t in df_spoiler['review_text']]
spoiler_tokens_summary = [word_tokenize(t) for t in df_spoiler['review_summary']]

not_spoiler_tokens_text = [word_tokenize(t) for t in df_not_spoiler['review_text']]
not_spoiler_tokens_summary = [word_tokenize(t) for t in df_not_spoiler['review_summary']]

In [27]:
# Convert to a single list
spoiler_tokens_text = list(itertools.chain.from_iterable(spoiler_tokens_text))
spoiler_tokens_summary = list(itertools.chain.from_iterable(spoiler_tokens_summary))

not_spoiler_tokens_text = list(itertools.chain.from_iterable(not_spoiler_tokens_text))
not_spoiler_tokens_summary = list(itertools.chain.from_iterable(not_spoiler_tokens_summary))

spoiler_tokens = spoiler_tokens_text + spoiler_tokens_summary
not_spoiler_tokens = not_spoiler_tokens_text + not_spoiler_tokens_summary

In [28]:
# Check the number of unique tokens
print('Number of Unique Spoiler Tokens:', len(set(spoiler_tokens)))
print('Number of Unique Non-spoiler Tokens:', len(set(not_spoiler_tokens)))

Number of Unique Spoiler Tokens: 215849
Number of Unique Non-spoiler Tokens: 171285


In [29]:
# Check the most frequent tokens for Spoilers and Non-spoilers
print('Spoiler Tokens (full text):\n')

for t, f in Counter(spoiler_tokens_text).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Spoiler Tokens (summary):\n')

for t, f in Counter(spoiler_tokens_summary).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Non Spoiler Tokens (full text):\n')

for t, f in Counter(not_spoiler_tokens_text).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Non Spoiler Tokens (summary):\n')

for t, f in Counter(not_spoiler_tokens_summary).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()

Spoiler Tokens (full text):

the                       640924
,                         572117
.                         488513
and                       334458
a                         305482
of                        291265
to                        289788
is                        230213
in                        177901
I                         162801

Spoiler Tokens (summary):

,                         7585
the                       7166
!                         7162
.                         6725
of                        5517
a                         4871
and                       4112
movie                     3711
A                         3605
to                        3048

Non Spoiler Tokens (full text):

the                       429904
,                         401464
.                         359176
and                       243554
a                         220162
of                        210827
to                        192991
is                        172367
I    

We observe that the most frequent tokens are stopwords. For that reason we will remvove them to get more informative result.

#### Remove Punctuation and Stopwords

##### Remove Punctuation

In [30]:
spoiler_punctuations_text = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        spoiler_tokens_text
    )
)
#--------------------------------------
spoiler_punctuations_summary = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        spoiler_tokens_summary
    )
)
#--------------------------------------
non_spoiler_punctuations_text = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        not_spoiler_tokens_text
    )
)
#--------------------------------------
non_spoiler_punctuations_summary = list(
    filter(
        lambda t: all(c.isdigit() or c in string.punctuation for c in t),
        not_spoiler_tokens_summary
    )
)

In [31]:
spoiler_punctuations_text = set(spoiler_punctuations_text)
spoiler_punctuations_summary = set(spoiler_punctuations_summary)

non_spoiler_punctuations_text = set(non_spoiler_punctuations_text)
non_spoiler_punctuations_summary = set(non_spoiler_punctuations_summary)

In [32]:
# Remove punctuation
spoiler_text_clean_tokens = [t for t in spoiler_tokens_text if t not in spoiler_punctuations_text]
spoiler_summary_clean_tokens = [t for t in spoiler_tokens_summary if t not in spoiler_punctuations_summary]

non_spoiler_text_clean_tokens = [t for t in not_spoiler_tokens_text if t not in non_spoiler_punctuations_text]
non_spoiler_summary_clean_tokens = [t for t in not_spoiler_tokens_summary if t not in non_spoiler_punctuations_summary]

##### Remove Stopwords

In [33]:
stop_word_regex = '|'.join(['^{}$'.format(s) for s in stopwords.words('english')])

stop_word_regex = re.compile(stop_word_regex)

In [35]:
spoiler_stop_words_text = list(filter(stop_word_regex.match, spoiler_text_clean_tokens))
spoiler_stop_words_summary = list(filter(stop_word_regex.match, spoiler_summary_clean_tokens))

non_spoiler_stop_words_text = list(filter(stop_word_regex.match, non_spoiler_text_clean_tokens))
non_spoiler_stop_words_summary = list(filter(stop_word_regex.match, non_spoiler_summary_clean_tokens))
#----------------------------------------------------
spoiler_stop_words_text = set(spoiler_stop_words_text)
spoiler_stop_words_summary = set(spoiler_stop_words_summary)

non_spoiler_stop_words_text = set(non_spoiler_stop_words_text)
non_spoiler_stop_words_text = set(non_spoiler_stop_words_text)

In [36]:
# Remove stopwords
spoiler_text_clean_tokens = [t for t in spoiler_text_clean_tokens if t not in spoiler_stop_words_text]
spoiler_summary_clean_tokens = [t for t in spoiler_summary_clean_tokens if t not in spoiler_stop_words_summary]

non_spoiler_text_clean_tokens = [t for t in non_spoiler_text_clean_tokens if t not in non_spoiler_stop_words_text]
non_spoiler_summary_clean_tokens = [t for t in non_spoiler_summary_clean_tokens if t not in non_spoiler_stop_words_text]

Check the most frequent tokens after removing punctuation and stopwords.

In [38]:
# Check again the most frequent tokens for Spoilers and Non-spoilers
print('Clean Spoiler Tokens (full text):\n')

for t, f in Counter(spoiler_text_clean_tokens).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Spoiler Tokens (summary):\n')

for t, f in Counter(spoiler_summary_clean_tokens).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Non Spoiler Tokens (full text):\n')

for t, f in Counter(non_spoiler_text_clean_tokens).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()
#------------------------------------
print('Clean Non Spoiler Tokens (summary):\n')

for t, f in Counter(non_spoiler_summary_clean_tokens).most_common(10):

  print('{0:25} {1}'.format(t, f))

print()

Clean Spoiler Tokens (full text):

I                         162801
's                        116823
movie                     98547
The                       79862
film                      74521
one                       44210
like                      40385
It                        33042
would                     28817
good                      28745

Clean Spoiler Tokens (summary):

movie                     3711
A                         3605
The                       2870
I                         2403
's                        2132
film                      1832
good                      1288
best                      999
Movie                     968
Not                       951

Clean Non Spoiler Tokens (full text):

I                         129352
movie                     83875
's                        80792
film                      62501
The                       59084
one                       34133
like                      29159
It                        27427
good 

We can observe that the possesive s does not give much information. Therefore we decided to consider it as a stopword and remove it.

In [39]:
spoiler_text_clean_tokens = [t for t in spoiler_text_clean_tokens if t != "'s"]
spoiler_summary_clean_tokens = [t for t in spoiler_summary_clean_tokens if t != "'s"]

non_spoiler_text_clean_tokens = [t for t in non_spoiler_text_clean_tokens if t != "'s"]
non_spoiler_summary_clean_tokens = [t for t in non_spoiler_summary_clean_tokens if t != "'s"]

In [40]:
######eminame sto lemmatization stemming lab 2