In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from numpy.linalg import norm

from datetime import datetime
import re
from wordcloud import WordCloud, STOPWORDS

from collections import Counter

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import plotly.express as px

In [2]:
from six.moves import urllib
import os

S3_details = 'https://ready2bool-spoileralert-hackgt.s3.amazonaws.com/IMDB_movie_details.json'
S3_reviews = 'https://ready2bool-spoileralert-hackgt.s3.amazonaws.com/IMDB_reviews.json'
details_path = os.path.join("datasets", "moviedetails")
reviews_path = os.path.join("datasets", "reviews")


def load_path(path, src, type_f):
    if not os.path.isdir(path):
      os.makedirs(path)
    json_path = os.path.join(path, type_f)
    urllib.request.urlretrieve(src, json_path)
    return pd.read_json(json_path, lines=True)

In [3]:
df_reviews = load_path(path=reviews_path, src=S3_reviews, type_f="reviews.json")
df_details = load_path(path=details_path, src=S3_details, type_f="details.json")

In [4]:
df_details

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...
2,tt0243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"[Comedy, Romance]",6.7,2002-04-11,
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...
...,...,...,...,...,...,...,...
1567,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,"In the year 1998, Evan Treborn (Ashton Kutcher..."
1568,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,"Brandon (Michael Fassbender) is a successful, ..."
1569,tt5013056,Evacuation of Allied soldiers from the British...,1h 46min,"[Action, Drama, History]",8.1,2017-07-21,The film alternates between three different pe...
1570,tt0104014/,"For a while now, beautiful 24-year-old Diana B...",1h 33min,"[Comedy, Drama]",5.3,1992-02-21,


In [5]:
df_reviews

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"
...,...,...,...,...,...,...,...
573908,8 August 1999,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties
573909,31 July 1999,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie
573910,20 July 1999,tt0139239,ur0392750,False,"Go is the best movie I have ever seen, and I'v...",10,It's the best movie I've ever seen
573911,11 June 1999,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?


In [6]:
try : 
  df_reviews = df_reviews.drop(labels=['review_date', 'movie_id', 'user_id', 'rating', 'review_summary'], axis=1)
except :
  print("Columns Removed")

In [7]:
df_reviews = df_reviews.sample(40000)

In [8]:
df_reviews.loc[df_reviews.is_spoiler==True, 'is_spoiler'] = 1
df_reviews.loc[df_reviews.is_spoiler==False, 'is_spoiler'] = 0
df_reviews.is_spoiler.value_counts()

0    29450
1    10550
Name: is_spoiler, dtype: int64

In [9]:
import string
df_reviews['review_text'] = df_reviews['review_text'].apply(lambda x: x.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))))
df_reviews

Unnamed: 0,is_spoiler,review_text
206179,1,Goodness gravy gracious I don t even know whe...
267157,0,After the epic and vaunted Dogtooth Yorgos ...
44725,0,I ve now seen BR 2049 two times First in IMAX...
404660,1,I loved the first movie it was the highlight o...
244356,0,Lets just be honest the only thing good about...
...,...,...
9118,0,There is no part of The Dark Knight that can...
565480,1,Again a funny Bond spoof with more spoofing th...
25574,0,Kurosawa is one of my favorite directors And T...
113095,0,Get Out is easily the best movie of 2017 mana...


In [10]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
df_reviews['review_text'] = df_reviews['review_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arvind_anand1123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df_reviews

pandas.core.frame.DataFrame

In [12]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
df_reviews['review_text'] = df_reviews['review_text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

In [13]:
df_reviews

Unnamed: 0,is_spoiler,review_text
206179,1,good gravi graciou I even know begin berat tra...
267157,0,after epic vaunt dogtooth yorgo lanthimo recol...
44725,0,I seen BR 2049 two time first imax soni 4k the...
404660,1,I love first movi highlight summer So natur I ...
244356,0,let honest thing good otherwis terribl movi ha...
...,...,...
9118,0,there part the dark knight go unappreci from h...
565480,1,again funni bond spoof spoof ever it funni lad...
25574,0,kurosawa one favorit director and toshiro mifu...
113095,0,get out easili best movi 2017 manag get highli...


In [14]:
import re
regex = r'\d+'
df_reviews['review_text'] = df_reviews['review_text'].apply(lambda x:" ".join(re.sub(regex, 'numbers', word) for word in x.split()))

In [15]:
df_reviews

Unnamed: 0,is_spoiler,review_text
206179,1,good gravi graciou I even know begin berat tra...
267157,0,after epic vaunt dogtooth yorgo lanthimo recol...
44725,0,I seen BR numbers two time first imax soni num...
404660,1,I love first movi highlight summer So natur I ...
244356,0,let honest thing good otherwis terribl movi ha...
...,...,...
9118,0,there part the dark knight go unappreci from h...
565480,1,again funni bond spoof spoof ever it funni lad...
25574,0,kurosawa one favorit director and toshiro mifu...
113095,0,get out easili best movi numbers manag get hig...


In [16]:
# freq = pd.Series(' '.join(df_reviews['review_text']).split()).value_counts()
# low_freq = list(freq.loc[freq<5].index)
# df_reviews['review_text'] = df_reviews['review_text'].apply(lambda x: " ".join(x for x in x.split() if x not in low_freq))

In [17]:
# df_reviews

In [18]:
# words = pd.Series(' '.join(df_reviews['review_text']).split()).unique()
# words

In [29]:
spoilers = df_reviews.loc[df_reviews['is_spoiler'] == 1]
not_spoilers = df_reviews.loc[df_reviews['is_spoiler'] == 0]

X = list(df_reviews['review_text'])
y = list(df_reviews['is_spoiler'])

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.8, random_state=42)
X_train_t = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [32]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

clf = RandomForestClassifier()
clf.fit(X_train_t, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
(y_true, y_pred) = (y_test, clf.predict(X_test))
print (classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85     23569
           1       0.74      0.02      0.03      8431

    accuracy                           0.74     32000
   macro avg       0.74      0.51      0.44     32000
weighted avg       0.74      0.74      0.63     32000



In [34]:
import pickle

pickle.dump(clf, open('model.pkl', 'wb'))
pickle.dump(X_train, open('train.pkl', 'wb'))

In [None]:
# df_reviews = pd.merge(df_reviews,df_details.drop('rating',axis=1),on='movie_id')[df_reviews.columns.values]

In [None]:
# df_reviews.is_spoiler.value_counts()

In [None]:
# le = preprocessing.LabelEncoder()
# df_reviews.is_spoiler = le.fit_transform(df_reviews.is_spoiler)

In [None]:
# # stratify
# spoilers = df_reviews[df_reviews.is_spoiler == 1].sample(frac=1).reset_index(drop=True)
# others = df_reviews[df_reviews.is_spoiler == 0].sample(frac=1).reset_index(drop=True)

# # train set
# df_reviews = pd.concat([spoilers.head(35000), others.head(100000)]).sample(frac=1).reset_index(drop=True)

# # test set
# test = pd.concat([spoilers.tail(1500), others.tail(3000)]).sample(frac=1).reset_index(drop=True)

In [None]:
# test

In [None]:
# movie_spoiler_counts = df_reviews.groupby(by='movie_id').is_spoiler.agg(['sum']).reset_index().rename(columns={'sum':'movie_spoiler_count'})
# movie_total_counts = df_reviews.groupby(by='movie_id').is_spoiler.agg(['count']).reset_index().rename(columns={'count':'movie_total_count'})
# df_ratio = pd.merge(movie_spoiler_counts, movie_total_counts, on='movie_id')
# df_ratio['movie_spoiler_ratio'] = df_ratio.movie_spoiler_count / df_ratio.movie_total_count
# df_ratio.drop(['movie_spoiler_count','movie_total_count'], axis=1, inplace=True)
# display(df_ratio)
# df_reviews = pd.merge(df_reviews, df_ratio, on='movie_id')

In [None]:
# df_ratio.shape

In [None]:
# df_temp = pd.DataFrame({"label":["spoiler", "non spoiler"],
#                       "movie_spoiler_ratio":[df_reviews[df_reviews.is_spoiler==1].movie_spoiler_ratio.mean(), 
#                                             df_reviews[df_reviews.is_spoiler==0].movie_spoiler_ratio.mean()]})
# sns.barplot(x = "label", y = "movie_spoiler_ratio", data = df_temp);

In [None]:
# user_spoiler_counts = df_reviews.groupby(by='user_id').is_spoiler.agg(['sum']).reset_index().rename(columns={'sum':'user_spoiler_count'})
# user_total_counts = df_reviews.groupby(by='user_id').is_spoiler.agg(['count']).reset_index().rename(columns={'count':'user_total_count'})
# user_spoiler_ratio = pd.merge(user_spoiler_counts, user_total_counts, on='user_id')
# user_spoiler_ratio['user_spoiler_ratio'] = user_spoiler_ratio.user_spoiler_count / user_spoiler_ratio.user_total_count
# user_spoiler_ratio.drop(['user_spoiler_count','user_total_count'], axis=1, inplace=True)
# display(user_spoiler_ratio.sample(frac=1))
# df_reviews = pd.merge(df_reviews, user_spoiler_ratio, on='user_id')

In [None]:
# df_temp = pd.DataFrame({"label":["spoiler", "non spoiler"], 
#                         "user_spoiler_ratio":[df_reviews[df_reviews.is_spoiler==1].user_spoiler_ratio.mean(), 
#                                             df_reviews[df_reviews.is_spoiler==0].user_spoiler_ratio.mean()]})
# sns.barplot(x = "label", y = "user_spoiler_ratio", data = df_temp);

In [None]:
# df_reviews.user_spoiler_ratio = (df_reviews.user_spoiler_ratio >= 0.1) + 0

In [None]:
# def formatReviewDate(review_date):
#     return datetime.strptime(review_date, '%d %B %Y').date()

# def formatReleaseDate(release_date):
#     date = None
#     try:
#         date = datetime.strptime(release_date, '%Y-%m-%d').date()
#     except:
#         try:
#             date = datetime.strptime(release_date+'-01', '%Y-%m-%d').date()
#         except:
#             date = datetime.strptime(release_date+'-01-01', '%Y-%m-%d').date()
#     return date

In [None]:
# df_reviews['review_date'] = df_reviews.review_date.apply(formatReviewDate)
# df_details['release_date'] = df_details.release_date.apply(formatReleaseDate)

In [None]:
# merged_df_reviews = pd.merge(df_reviews,df_details,on='movie_id')
# merged_df_reviews['review_relevance'] = (merged_df_reviews.review_date - merged_df_reviews.release_date).apply(lambda x: abs(x.days))

In [None]:
# print("Mean recency (spoilers) =",merged_df_reviews[merged_df_reviews.is_spoiler==1].review_relevance.mean())
# print("Mean recency (non-spoilers) =",merged_df_reviews[merged_df_reviews.is_spoiler==0].review_relevance.mean())

In [None]:
# genre_names = np.unique(np.array(' '.join(df_details.genre.str.join(' ')).split()))
# print(genre_names)

In [None]:
# df_genre = pd.DataFrame();
# for genre in genre_names:
#     df_genre[genre.lower()] = 0

In [None]:
# for index,row in df_details.iterrows():
#     details = df_details[df_details.movie_id == row['movie_id']]
#     df_genre.at[index,'movie_id'] = row['movie_id']
#     for genre in genre_names:
#         df_genre.at[index, genre.lower()] = int(genre in details['genre'].tolist()[0])

In [None]:
# df_reviews_temp = pd.merge(df_reviews, df_genre, on="movie_id")

In [None]:
# e = 0.001
# genre_spoiler_ratio = np.zeros(len(genre_names))
# for i,g in enumerate(genre_names):
#     genre_spoiler_ratio[i] = df_reviews_temp[(df_reviews_temp.is_spoiler==1) & (df_reviews_temp[g.lower()]==1)].shape[0] / (df_reviews_temp[df_reviews_temp[g.lower()]==1].shape[0]+e)

In [None]:
# ax = sns.barplot(x=genre_spoiler_ratio,y=genre_names)
# ax.set(xlabel="Spoiler ratio");
# plt.show();

In [None]:
# e = 0.001
# selected = ["Action","Adventure","Fantasy","Horror","Mystery","Sci-Fi","Thriller"]

# genre_2_labels = []
# genre_2_ratios = []

# i=0
# while i < len(selected):
#     j = i+1
#     while j < len(selected):
#         genre_2_labels.append(selected[i]+"+"+selected[j])
#         spoilers = df_reviews_temp[(df_reviews_temp[selected[i].lower()]==1) & (df_reviews_temp[selected[j].lower()]==1) & df_reviews_temp.is_spoiler].shape[0]
#         total = df_reviews_temp[(df_reviews_temp[selected[i].lower()]==1) & (df_reviews_temp[selected[j].lower()]==1)].shape[0]
#         genre_2_ratios.append(spoilers / (total+e))
#         j+=1
#     i+=1
    
# ax = sns.barplot(x=genre_2_ratios,y=genre_2_labels)
# ax.set(xlabel="Spoiler ratio");
# plt.show();

In [None]:
# def isListSubset(a,b):
#     count = 0
#     for i,v in enumerate(a):
#         if v in b:
#             count += 1
#     return count == len(a)

# def getGenreRatio(genres):
#     return isListSubset(["Fantasy","Sci-Fi"], genres) or isListSubset(["Adventure","Mystery"], genres) or isListSubset(["Action","Mystery","Sci-Fi"], genres) or isListSubset(["Adventure","Horror","Thriller"], genres) or isListSubset(["Adventure","Thriller"], genres) or isListSubset(["Fantasy","Mystery"], genres)


In [None]:
# df_reviews['genre_spoiler_ratio'] = pd.merge(df_reviews,df_details,on="movie_id").genre.apply(getGenreRatio)+0

In [None]:
# sns.boxplot(x=df_reviews.is_spoiler,y=df_reviews.rating);

In [None]:
# nltk.download('stopwords')
# stopwords = set(stopwords.words('english'))
# stopwords_dict = Counter(stopwords)

# def preprocess_text(review):
#     review = review.lower() # Convert to lowercase
#     review = re.sub('[^a-zA-Z]',' ', review) # Remove words with non-letter characters
#     words = review.split()
#     words = [word for word in words if word not in stopwords_dict] # Remove stop words
#     review = " ".join(words)
#     return review

In [None]:
# df_reviews.review_text = df_reviews.review_text.apply(preprocess_text)

In [None]:
# S3_GLOVE = 'https://ready2bool-spoileralert-hackgt.s3.amazonaws.com/glove.6B.50d.txt'
# glove_path = os.path.join("datasets", "glove")
# if not os.path.isdir(glove_path):
#   os.makedirs(glove_path)
# txt_file = os.path.join(glove_path,'glove.6B.50d.txt')
# urllib.request.urlretrieve(S3_GLOVE, txt_file)

# embeddings_index = dict()
# f = open(txt_file)
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

In [None]:
# def getAverageWordEmbedding(text):
#     words = text.split()
#     n = 0
#     sumEmbed = np.zeros(50)
    
#     for word in words:
#         if word in embeddings_index:
#             sumEmbed += embeddings_index[word]
#             n += 1
    
#     avgEmbed = sumEmbed / n
#     return avgEmbed
    
# def EuclideanDist(e1, e2):
#     return np.sum(np.square(e1-e2), axis = 1)

In [None]:
# plot_review_merge = pd.merge(df_reviews, df_details[['movie_id','plot_summary']], on = 'movie_id')
# review_embed = plot_review_merge.review_text.apply(getAverageWordEmbedding)
# plot_embed = plot_review_merge.plot_summary.apply(getAverageWordEmbedding)

In [None]:
# euclideans = EuclideanDist(np.stack(review_embed), np.stack(plot_embed))

In [None]:
# plot_review_merge['euclideans'] = euclideans

# df_temp = pd.DataFrame({"label":["spoiler", "non spoiler"], 
#                         "euclidean_dist":[plot_review_merge[plot_review_merge.is_spoiler==1].euclideans.mean(), 
#                                             plot_review_merge[plot_review_merge.is_spoiler==0].euclideans.mean()]})
# sns.barplot(x = "label", y = "euclidean_dist", data = df_temp);

In [None]:
# x = df_reviews[['movie_spoiler_ratio','user_spoiler_ratio','genre_spoiler_ratio','rating']]
# x['euclideans'] = euclideans
# x = x.fillna(0)

In [None]:
# model_svm = svm.SVC(gamma='scale',C=10)
# model_svm.fit(x, df_reviews.is_spoiler)

In [None]:
# predictions = model_svm.predict(x)

In [None]:
# print(classification_report(df_reviews.is_spoiler, predictions))
# print(confusion_matrix(df_reviews.is_spoiler, predictions))

In [None]:
# # user_spoiler_ratio
# test1 = pd.merge(test, user_spoiler_ratio, how = 'left', on = 'user_id')

# # movie_spoiler_ratio
# test1 = pd.merge(test1, df_ratio, how = 'left', on = 'movie_id')

# # genre_spoiler_ratio
# test1['genre_spoiler_ratio'] = pd.merge(test1, df_details,on="movie_id").genre.apply(getGenreRatio)+0

# # euclidean distances between review and plot
# test1.review_text = test1.review_text.apply(preprocess_text)
# plot_review_merge_test = pd.merge(test1, df_details[['movie_id','plot_summary']], on = 'movie_id')
# review_embed_test = plot_review_merge_test.review_text.apply(getAverageWordEmbedding)
# plot_embed_test = plot_review_merge_test.plot_summary.apply(getAverageWordEmbedding)
# test1['euclideans'] = EuclideanDist(np.stack(review_embed_test), np.stack(plot_embed_test))

In [None]:
# x_test = test1[['movie_spoiler_ratio','user_spoiler_ratio','genre_spoiler_ratio','euclideans','rating']]
# x_test = x_test.fillna(0)

In [None]:
# predictions_new = model_svm.predict(x_test)

In [None]:
# print(classification_report(test1.is_spoiler, predictions_new))
# print(confusion_matrix(test1.is_spoiler, predictions_new))
# print(x_test)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# forest = RandomForestClassifier()
# forest.fit(x, df_reviews.is_spoiler)
# predictions_forest = forest.predict(x_test)

In [None]:
# print(classification_report(test1.is_spoiler, predictions_forest))
# print(confusion_matrix(test1.is_spoiler, predictions_forest))