In [None]:
# MIT License

# Copyright (c) 2022 Alexandru Pascu and Stefano Li Pira

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

**Measuring the Stereotype of IMDB movies and series using a BERT NLP model Version: 1.0 Date: 2022-10-7**

> This notebook contains complete code to fine-tune the transformer language model BERT to categorize movies and series, in particular to estimate the probability that a title is a Stereotype title. We will use BERT to categorize titles from IMDB datasets into stereotypes and non-stereotypes (the categories can easily be changed for other use cases). The data needed for this are the user-supplied keywords and the plots of the titles (we exclude game-show, talk-show and reality-tv titles and filter for drama and comedy; for relevance, we include only episodes with at least one rating). BigQuery has been used for some data cleaning purposes.

**In this notebook, you will:**

> Download and load a dataset of series episodes with various information about them, such as actors, production, ratings, plots, keywords and others. Based on the existing user keywords, titles are put into a category (easily changeable) and used for training and then for labelling other titles (the keywords can be changed and used in different ways). The dataset was created from the IMDB datasets. Load a BERT model from the Transformers library and build your own model by combining BERT with a classifier. Train your own model on a GPU, fine-tuning BERT on the training dataset. Use the trained model to classify titles as Stereotype using their plots. Use the predicted categorization probabilities to run regressions and find insights. You can use this notebook on your own data from other domains and categories by ensuring that your input and prediction datasets have the same structure and column names as those provided with this notebook. Alternatively, the latter can be modified as well to suit your use case.

**File Paths**

Mount your drive folder in the colab.

The following cell should be modified if you are using different input/prediction files or if you have changed the folder name.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that later cells can
# read their input CSVs from, and write their results to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

**Install and load libraries**

In [None]:
# install the Cinemagoer API
!pip install cinemagoer
# install the Simple Transformers library
!pip install simpletransformers

In [None]:
# General Packages #
import os
import pandas as pd
import numpy as np

# CSV handling library
import csv

# Cinemagoer library
from imdb import Cinemagoer, IMDbError

# The classification Model used to label data
from simpletransformers.classification import ClassificationModel

# TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# SKLearn libraries for splitting sample and validation
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.metrics import recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report

# Matplot and SkLearn libraries for doing regressions
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Additional Libraries that we are using only in this notebook
import torch
import gc

**Cinemagoer API**

The following code blocks help to understand what data Cinemagoer (https://cinemagoer.github.io/) is capable of retrieving from IMDB (https://www.imdb.com/). They also show how we handled our data collection process.

In [None]:
# Demonstration of the Cinemagoer API on a single title.
# `ia` is the client instance that the data-collection cells further down reuse.

# create an instance of the Cinemagoer class
ia = Cinemagoer()

# get a title by its IMDb ID and print its director(s);
# .get() is used so that a title without a 'directors' entry prints nothing
# instead of raising a KeyError
the_matrix = ia.get_movie('0720229')
for director in the_matrix.get('directors') or []:
  print(director['name'])

# show all information that are currently available for a movie
print(sorted(the_matrix.keys()))

# show all information sets that can be fetched for a movie
print(ia.get_movie_infoset())

# update a Movie object with more information
ia.update(the_matrix, ['technical'])

# show which keys were added by the information set
print(the_matrix.infoset2keys['technical'])

# print one of the new keys
print(the_matrix.get('tech'))

# print the pieces of information this notebook is interested in
# (each prints None when the title does not provide that key)
for info_key in ('plot', 'plot outline', 'synopsis', 'keywords', 'title',
                 'production companies'):
  print(the_matrix.get(info_key))

In the following blocks you can use the code to get the data needed for the title codes you want. We were interested in plots, keywords, production companies, and countries.

In [None]:
# Fetch production companies, plot, plot outline, synopsis, keywords and
# countries for every title code listed in title_codes.csv and store them in
# titles_data.csv (one row per title, "NULL" for every missing value).

# header row of the output file; every entry except the last one is also the
# Cinemagoer key that is looked up for that column
headers = ['production companies', 'plot', 'plot outline', 'synopsis',
           'keywords', 'countries', 'title_code']

# add the csv file with the title codes to get data on, in colab files on left
# (`with` guarantees the file is closed even if reading fails)
with open("title_codes.csv") as file:
  csvreader = csv.reader(file)
  # skip the header row; the default avoids StopIteration on an empty file
  header = next(csvreader, None)
  # add all the codes in the rows variable
  rows = list(csvreader)

# create the csv file that stores the data from imdb
with open('titles_data.csv', 'w', encoding='UTF8', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(headers)

  # iterate through all the title codes and get the data we are interested in
  for x in rows:
    x_string = ' '.join(map(str, x))
    try:
      the_matrix = ia.get_movie(x_string)
    except IMDbError as e:
      # report the failing title and carry on with the next one
      print(e)
      continue

    data = []
    for info_key in headers[:-1]:
      value = the_matrix.get(info_key)
      if value is None:
        value = "NULL"
      elif info_key == 'plot':
        # only the first plot is kept; an empty plot list counts as missing
        value = value[0] if value else "NULL"
      data.append(value)
    # keywords are not returned correctly by this call; they are collected
    # separately with info='keywords'
    data.append("tt" + x_string)
    writer.writerow(data)

We discovered a problem in the implementation of the API, hence we used the following code to get the keywords of the titles as well. They require a special approach.

In [None]:
# Fetch the keywords of every title code listed in title_codes.csv and store
# them in titles_keywords.csv ("NULL" when a title has no keywords).

# initialize the header row
headers = ['title_codes', 'keywords_title']

# add the csv file with the title codes to get data on, in colab files on left
with open("title_codes.csv") as file:
  csvreader = csv.reader(file)
  # title_codes.csv starts with a header row (the titles_data export skips it
  # as well); without this the header text would be looked up as a title code
  next(csvreader, None)
  # add all the codes in the rows variable
  rows = list(csvreader)

# create the csv file that stores the data from imdb; the `with` block closes
# (and therefore flushes) the file, which the original `f.close` never did
with open('titles_keywords.csv', 'w', encoding='UTF8', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(headers)

  # get all the keywords of the titles
  for x in rows:
    x_string = ' '.join(map(str, x))
    try:
      title = ia.get_movie(x_string, info='keywords')
    except IMDbError as e:
      print(e)
      continue
    title_keywords = title.get('keywords')
    data = ["tt" + x_string,
            "NULL" if title_keywords is None else title_keywords]
    writer.writerow(data)

Below we take the opposite approach: we start from a few specific keyword areas and look for similar keywords in the IMDB database. Afterwards we use those keywords to get the title codes associated with them.

In [None]:
# For each keyword area, look up similar IMDb keywords and write one
# (title code, keyword) row per title carrying such a keyword.
# The output file has no header row.

# compute the list of 'categories' that you want to get similar keywords for
keywordsList = ['stereotype', 'sexism', 'homophobic',
                'racism', 'discrimination']

# create the csv file that stores the data from imdb; `with` closes it even if
# a lookup fails half-way through
with open('tiles_from_keywords.csv', 'w', encoding='UTF8', newline='') as f:
  writer = csv.writer(f)

  # iterate through your list and get similar keywords
  for keywordIndividual in keywordsList:
    try:
      keywords = ia.search_keyword(keywordIndividual)
    except IMDbError as e:
      # same policy as the other collection cells: report and carry on
      print(e)
      continue
    print(keywords)

    # iterate through the list of similar keywords and get title codes for them
    for keywwordsSimilar in keywords:
      try:
        movies = ia.get_keyword(keywwordsSimilar)
      except IMDbError as e:
        print(e)
        continue
      # save all the titles in the csv file
      for title in movies:
        data = ['tt' + str(title.movieID), keywwordsSimilar]
        writer.writerow(data)

For using them to train the model as well, get their plots and all of the keywords they have for each title code found earlier.

In [None]:
# Fetch keywords and plots for the title codes found through the keyword
# search and store them in plots_and_keywords_extra_titles.csv
# ("NULL" for every missing value).

# initialize the header row
headers = ['title_codes', 'keywords_title', 'plot']

# open the file containing the title codes and add them to a variable;
# each row is (title code, keyword) and the file has no header row
with open("tiles_from_keywords.csv") as file:
  rows = [row for row in csv.reader(file) if row]

# create the csv file that stores the data from imdb; the `with` block closes
# (and therefore flushes) the file, which the original `f.close` never did
with open('plots_and_keywords_extra_titles.csv', 'w',
          encoding='UTF8', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(headers)

  # get all the keywords and plots of the titles
  for x in rows:
    # only the first column is the title code; joining the whole row would
    # glue the keyword onto the ID that is sent to IMDb
    title_code = str(x[0])
    # drop the leading "tt" to obtain the ID Cinemagoer expects
    x_string = title_code[2:]
    try:
      title = ia.get_movie(x_string, info=['keywords', 'plot'])
    except IMDbError as e:
      print(e)
      continue
    title_keywords = title.get('keywords')
    title_plot = title.get('plot')
    data = [title_code,
            "NULL" if title_keywords is None else title_keywords,
            "NULL" if title_plot is None else title_plot]
    writer.writerow(data)

**Prepare data for the ML model**

From now on the focus starts to be on preparing the data for the ML model. Preparing the labels, the needed columns from our dataset (namely plots, keywords, title codes) and associating the Label Stereotype or not.

In [None]:
# keyword areas for which similar IMDb keywords are looked up
labelList = ['stereotype', 'sexism', 'homophobic', 'racism', 'discrimination']

# keywords[k] holds the keywords IMDb returns for labelList[k]
keywords = []
for label in labelList:
  print(label)
  similar_keywords = ia.search_keyword(label)
  keywords.append(similar_keywords)
  print(similar_keywords)
print(keywords)

Upload the data that you have to Drive (in CSV format, with each column in its own file - make sure the row order is preserved) and load it into dataframes. Alternatively, use Python to extract the needed information; we used BigQuery with SQL to do this job.

In [None]:
# Load the titles and their keywords / plots. Each file is tab-separated, has
# a header row and is read without any quote handling (csv.QUOTE_NONE == 3).
df = pd.read_csv('/content/drive/MyDrive/titles.csv',
                 header=0, sep="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_keywords = pd.read_csv('/content/drive/MyDrive/keywords.csv',
                          header=0, sep="\t", quoting=csv.QUOTE_NONE,
                          encoding='utf-8')
df_plot = pd.read_csv('/content/drive/MyDrive/plot.csv',
                      header=0, sep="\t", quoting=csv.QUOTE_NONE,
                      encoding='utf-8')

# attach the keywords and the plots to the titles as two extra columns
df['keywords'] = df_keywords
df['plot'] = df_plot

# output the dataframe
display(df)

In [None]:
# Load the extra titles (found through the keyword search) with their
# keywords / plots. Same file layout as the main titles: tab-separated, header
# row, no quote handling (csv.QUOTE_NONE == 3).
df_extra = pd.read_csv('/content/drive/MyDrive/extra_title.csv',
                       header=0, sep="\t", quoting=csv.QUOTE_NONE,
                       encoding='utf-8')
df_extra_keywords = pd.read_csv('/content/drive/MyDrive/extra_keywords.csv',
                                header=0, sep="\t", quoting=csv.QUOTE_NONE,
                                encoding='utf-8')
df_extra_plot = pd.read_csv('/content/drive/MyDrive/extra_plot.csv',
                            header=0, sep="\t", quoting=csv.QUOTE_NONE,
                            encoding='utf-8')

# attach the keywords and the plots to the extra titles as two extra columns
df_extra['keywords'] = df_extra_keywords
df_extra['plot'] = df_extra_plot

# output the dataframe
display(df_extra)

Prepare the keywords in the big areas you are interested in and label the data with them.

In [None]:
# Build one search pattern per keyword area: all similar keywords of an area
# joined with '|', so the pattern matches when any of them occurs.
pattern = []
for similar_keywords in keywords:
  area_pattern = '|'.join(similar_keywords)
  pattern.append(area_pattern)
  print(area_pattern)
print(pattern)

In [None]:
# For every keyword area add a column (named by the area's position) that is
# True when the title's keywords match the area's pattern and False otherwise.
# na=False makes titles without keywords count as "no match" instead of
# producing NaN flags that are neither True nor False.
for i in range(len(keywords)):
  df[i] = df.keywords.str.contains(pattern[i], na=False)
display(df)

In [None]:
# iterate through the list of True and False and label them with 1 for True
# 0 for otherwise and rename the columns of the dataframe to represent the 
# keyword area
for i in range(len(keywords)):
  df.loc[df[i] == True, i] = 1
  df.loc[df[i] == False, i] = 0

df.rename(columns={0: 'stereotype', 1: 'sexism', 2: 'homophobic', 
                   3: 'racism', 4: 'discrimination'}, inplace = True)
display(df)

In [None]:
# Label the extra titles per keyword area, exactly like the initial titles.
# The flag columns are computed here first: df_extra never received them, so
# reading df_extra[i] straight away raised a KeyError.
# Titles without keywords count as "no match" (na=False) and get a 0.
for i in range(len(keywords)):
  df_extra[i] = df_extra.keywords.str.contains(pattern[i],
                                               na=False).astype(int)

df_extra = df_extra.rename(columns={0: 'stereotype', 1: 'sexism',
                                    2: 'homophobic', 3: 'racism',
                                    4: 'discrimination'})
display(df_extra)

In [None]:
# print, for every keyword area, how many initial titles carry the label (1)
# and how many do not (0)
for area in ('stereotype', 'sexism', 'homophobic', 'racism', 'discrimination'):
  print(df[area].value_counts())

In [None]:
# print, for every keyword area, how many extra titles carry the label (1)
# and how many do not (0)
for area in ('stereotype', 'sexism', 'homophobic', 'racism', 'discrimination'):
  print(df_extra[area].value_counts())

In [None]:
# 'Type' collects the names of all keyword areas a title belongs to; it stays
# an empty string for titles that belong to none. Every name is appended with
# a leading space.
df['Type'] = ''
for area in ('stereotype', 'sexism', 'homophobic', 'racism', 'discrimination'):
  in_area = df[area] == 1
  df.loc[in_area, 'Type'] = df['Type'] + ' ' + area
display(df)

In [None]:
# same labelling for the extra titles: 'Type' lists every keyword area the
# title belongs to (each name preceded by a space) and stays '' otherwise
df_extra['Type'] = ''
for area in ('stereotype', 'sexism', 'homophobic', 'racism', 'discrimination'):
  in_area = df_extra[area] == 1
  df_extra.loc[in_area, 'Type'] = df_extra['Type'] + ' ' + area
display(df_extra)

In [None]:
# Build the training sample: every labelled title (initial + extra data) plus
# an equally large random sample of unlabelled titles from the initial data.

# create the joint dataframe from the initial and the extra data
# (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
joint_data = pd.concat([df, df_extra])
# separate the records that have at least one label from the joint data
data_types_joint_data = joint_data.loc[joint_data['Type'] != '']
# separate the records that have no label from the initial data; the mask is
# applied to `df` itself because its index does not line up with joint_data
data_nontypes = df.loc[df['Type'] == '']
# randomly select as many unlabelled records as there are labelled ones
# (capped by what is available); the fixed seed makes the sample reproducible
n_nontypes = min(len(data_types_joint_data), len(data_nontypes))
data_nontypes = data_nontypes.sample(n = n_nontypes, random_state = 1)
# have a dataframe with both the records with and without label selected
training_joint_data = pd.concat([data_types_joint_data, data_nontypes])
display(training_joint_data)

In [None]:
# print, for every keyword area, how many titles of the joint data carry the
# label (1) and how many do not (0)
for area in ('stereotype', 'sexism', 'homophobic', 'racism', 'discrimination'):
  print(joint_data[area].value_counts())

Get the labels and the plots ready for training the model.

In [None]:
# binary training label per title: 0 when 'Type' is empty (no keyword area
# matched), 1 otherwise
stereotype_encoding_joint_data = [
    0 if type_label == '' else 1
    for type_label in training_joint_data['Type'].values.tolist()
]
print(stereotype_encoding_joint_data)

In [None]:
# in case you want to do multi label classification you need a matrix of them:
# one row per title, one 0/1 column per keyword area
# (the former loop over the rows only rebound its loop variable and never
# changed the matrix, so it has been dropped)
class_array_joint_data = training_joint_data[['stereotype', 'sexism',
                          'homophobic', 'racism', 'discrimination']].to_numpy()
print(class_array_joint_data)

In [None]:
# Inputs of the text classification, all in the row order of
# training_joint_data: binary labels, plot texts and title codes
Classes = stereotype_encoding_joint_data
Plot_Text = training_joint_data['plot'].values.tolist()
IDs = np.array(training_joint_data['titleCodes'].values.tolist())
print(Classes)

**Train a NLP Model**

Change the parameters of the BERT classification model according to your dataset and needs. We tested several different settings, and in the end 3 epochs and a batch size of 16 gave us the best results on Colab with a GPU running. This is because BERT is already pre-trained on an immense dataset of books and Wikipedia articles and you just need to fine-tune it for your use case.

See for guidance and resources: https://simpletransformers.ai/docs/installation/

In [None]:
# Stratified K-fold cross-validation of the BERT classifier.
# For every fold a fresh model is fine-tuned on the training part and used to
# predict the held-out part; the held-out predictions of all folds are
# collected in id_s / y_actual / y_predicted. After the loop, `model` is the
# classifier trained on the last fold.
NUM_OF_SPLITS = 5
KFoldSplitter = StratifiedKFold(n_splits = NUM_OF_SPLITS, shuffle = True,
                                random_state = 1)

# convert once, outside the loop: the arrays do not change between folds
Y = np.array(Classes)
Plot_Text_Array = np.array(Plot_Text)

# out-of-sample results accumulated over the folds
id_s = []
y_actual = []
y_predicted = []

# containers filled by the evaluation step that follows the cross-validation
RESULTS = []
Classified_Values = []

for train_i, test_i in tqdm_notebook(KFoldSplitter.split(Plot_Text, Classes),
                                     desc = 'Cross-Validating',
                                     leave = False,
                                     total = NUM_OF_SPLITS):

  # Select Rows in Data Based on Indexes [train_i, test_i]
  train_X, test_X = Plot_Text_Array[train_i], Plot_Text_Array[test_i]
  train_y, test_y = Y[train_i], Y[test_i]
  Train_IDs, Test_IDs = IDs[train_i], IDs[test_i]

  # Create Training Data in Paired Format (Necessary for Transformers)
  train_df = pd.DataFrame(list(zip(list(train_X), list(train_y))))
  train_df.columns = ["text", "labels"]

  # a new model per fold, so no fold is evaluated on data it was trained on;
  # the GPU is used whenever one is available
  model = ClassificationModel("bert", "bert-base-uncased",
                              use_cuda = torch.cuda.is_available(),
                              num_labels=2,
                              args={'num_train_epochs':3,
                                    'train_batch_size':16,
                                    'overwrite_output_dir': True,
                                    'use_early_stopping':True,
                                    'do_lower_case':True,
                                    'silent':True,
                                    'no_cache':True,
                                    'no_save':True,
                                    "regression": False}
                              )

  # Train the Model
  model.train_model(train_df)

  # Predict on Holdout Sample
  predictions, raw_outputs = model.predict( list(test_X) )

  # Store Output
  id_s = id_s + list(Test_IDs)
  y_actual = y_actual + list(test_y)
  y_predicted = y_predicted + list(predictions)

  # release memory before the next fold
  gc.collect()
  torch.cuda.empty_cache()

In [None]:
# Evaluate the out-of-sample predictions and store the metrics in RESULTS and
# the per-title predictions in Classified_Values.

# start from empty containers so that re-running this cell does not append the
# same model twice
RESULTS = []
Classified_Values = []

# Compute the share of titles predicted as Stereotype
Share = np.round(np.mean(y_predicted), 3)

# Calculate Model Performance Metrics
# (the AUC is computed from the hard 0/1 predictions, not from scores)
Accuracy = accuracy_score(y_actual, y_predicted)
ROC = roc_auc_score(y_actual, y_predicted)
Precision = precision_score(y_actual, y_predicted)
Recall = recall_score(y_actual, y_predicted)
F1 = f1_score(y_actual, y_predicted)
# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs
CM = confusion_matrix(y_actual, y_predicted, labels = [0, 1])

# CM[actual][predicted]: CM[0][0] true negatives, CM[0][1] false positives,
# CM[1][0] false negatives, CM[1][1] true positives.
# Each rate below is a share of the predictions of the same predicted class.
predicted_negatives = CM[0][0] + CM[1][0]
predicted_positives = CM[0][1] + CM[1][1]
TN = np.round(CM[0][0]/predicted_negatives, 3)
FN = np.round(CM[1][0]/predicted_negatives, 3)
FP = np.round(CM[0][1]/predicted_positives, 3)
TP = np.round(CM[1][1]/predicted_positives, 3)

# Add Classification Performance Metrics to List
RESULTS.append(['bert', Share, TP, FN, FP, TN,
                np.round(Accuracy, 3),
                np.round(ROC, 3),
                np.round(Precision, 3),
                np.round(Recall, 3),
                np.round(F1, 3)])

# Add Classification Results to List
Classified_Values.append(list(zip(len(id_s)*['bert'],
                                  id_s, y_actual, y_predicted)))

In [None]:
# Turn the collected metrics into a table with one row per model, tag every
# row as a transformer model, save the table and show it.
metric_columns = ["Name", "Share", "True-Positives", "False-Negatives",
                  "False-Positives", "True-Negatives", "Accuracy", "AUC",
                  "Precision", "Recall", "F1"]
RESULTS_TABLE = pd.DataFrame(RESULTS, columns = metric_columns)
RESULTS_TABLE["Type"] = "Transformer"

# move "Type" right behind "Name", keep every other column in place
ordered_columns = metric_columns[:1] + ["Type"] + metric_columns[1:]
RESULTS_TABLE = RESULTS_TABLE[ordered_columns]

# Output Results
RESULTS_TABLE.sort_values("Accuracy", ascending = False).to_csv('Transformer Classification Model Performance.csv')

# Display Results -- Out of Sample (Holdout) prediction -- Sorted by Accuracy
RESULTS_TABLE.sort_values("Accuracy", ascending = False)

In [None]:
# Output Classification Results for Training Dataset
# -- PREDICTED VALUES -- Out Of Sample (Holdout) Prediction
# The first model contributes the id, the actual label and its prediction;
# every further model adds one prediction column, matched on the id. Each
# prediction column is named after its model.
for i, classified in enumerate(Classified_Values):

  Temp = pd.DataFrame(classified,
                      columns = ['Model', 'id', 'Actual', 'Predicted'])
  name = Temp.head(1)['Model'][0]

  if i == 0:
    Temp = Temp[['id', 'Actual', 'Predicted']]
    Temp.columns = ['id', 'Actual', name]
    Final = Temp
  else:
    Temp = Temp[['id', 'Predicted']]
    Temp.columns = ['id', name]
    Final = Final.merge(Temp, on = ['id'])

# Save DataFrame #
Final.to_csv("./Transformer Classification Results.csv")

**Predictions on new data**

Use the just fine-tuned BERT model to test the classification (labelling) on a couple of plots and then on your dataset.

In [None]:
# try different plots to see what the model predicts for them;
# change or add plots in the list to experiment with others
sample_plots = [
    "With the help of a German bounty-hunter, a freed slave sets out to rescue his wife from a brutal plantation-owner in Mississippi.",
    "Everyone knows that lions live in Africa and tigers live in Asia, right? Wrong. The world's last population of Asiatic lions live in northern India, 200 miles away from the nearest tigers. How did these two apex predators choose their different habitats?",
    "Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races.",
    "A pickup game with the family-owned pizza place across the street leads to Leela being the first female Blernsball player, but she has to struggle to avoid being the worst player ever.",
    "T'Challa, heir to the hidden but advanced kingdom of Wakanda, must step forward to lead his people into a new future and must confront a challenger from his country's past.",
]
predictions, raw_outputs = model.predict(sample_plots)
print(predictions, raw_outputs)

In [None]:
# load the titles to classify from Drive (comma-separated, with a header row)
df_to_predict = pd.read_csv(
    '/content/drive/MyDrive/to_predict.csv',
    header = 0,
    encoding = 'utf-8',
)

In [None]:
# work on one chunk of the data at a time if it is too big:
# rows start (inclusive) to end (exclusive); change both accordingly
start, end = 0, 16000
plot_chunk = df_to_predict['plot'].iloc[start:end]
predict_list = plot_chunk.values.tolist()

In [None]:
# Classify every plot of the current chunk.
# Result arrays, all aligned with predict_list:
#   predict_outcome       - predicted class of each plot
#   predict_nonstereotype - raw model output for class 0
#   predict_stereotype    - raw model output for class 1
# The whole chunk is handed to the model in one call instead of one call per
# plot followed by np.append, which re-allocated the arrays on every row.
if predict_list:
  predictions, raw_outputs = model.predict(predict_list)
  raw_outputs = np.asarray(raw_outputs, dtype = float)
  predict_outcome = np.asarray(predictions, dtype = float)
  predict_nonstereotype = raw_outputs[:, 0]
  predict_stereotype = raw_outputs[:, 1]
else:
  # nothing to classify: keep the empty arrays the later cells expect
  predict_outcome = np.array([])
  predict_nonstereotype = np.array([])
  predict_stereotype = np.array([])

In [None]:
# select the chunk you're working with from those you want to run the model on;
# .copy() makes it an independent dataframe, so adding columns below neither
# triggers SettingWithCopyWarning nor depends on how pandas treats the slice
temp = df_to_predict.iloc[start:end].copy()
# add the predictions to the dataframe
temp['Prediction'] = predict_outcome.tolist()
temp['NonStereotype'] = predict_nonstereotype.tolist()
temp['Stereotype'] = predict_stereotype.tolist()
# save the predictions in a csv on your drive
temp.to_csv('/content/drive/MyDrive/predicted.csv', encoding='utf-8')
print(temp)

**Regressions on the stereotype measures**

In [None]:
# load the popularity data from Drive and shuffle its rows
# (frac = 1 samples every row once; the index is renumbered afterwards)
df_popularity = pd.read_csv(
    '/content/drive/MyDrive/popularity_regression.csv',
    header = 0,
    encoding = 'utf-8',
)
df_popularity = df_popularity.sample(frac = 1)
df_popularity = df_popularity.reset_index(drop = True)
display(df_popularity)

In [None]:
# Linear regression of the popularity measures on the stereotype measures.
# NOTE: from here on the name `model` refers to the regression and no longer to
# the BERT classifier created earlier in the notebook.

# independent variables
X = df_popularity[['Stereotype', 'NonStereotype', 'prediction']]
# dependent variables (both are fitted at once)
y = df_popularity[['averageRating', 'numVotes']]

# fit with an intercept
model = LinearRegression(fit_intercept = True)
model.fit(X, y)

# store the fitted values next to the observed ones
fitted_values = model.predict(X)
df_popularity[['predicted_averageRating',
               'predicted_numVotes']] = fitted_values

In [None]:
# print the fitted parameters; coef_[0] holds the coefficients of the first
# dependent variable, coef_ those of all dependent variables
for caption, value in (("Model intercept:", model.intercept_),
                       ("Model slope:    ", model.coef_[0]),
                       ("Model coeficients", model.coef_)):
  print(caption, value)

# show the dataframe including the predictions and save it in a csv on Drive
display(df_popularity)
df_popularity.to_csv(
    '/content/drive/MyDrive/regression_popularity_results.csv',
    encoding = 'utf-8',
)

In [None]:
# plot observed against predicted ratings and save the figure on Drive
ratings_columns = ['averageRating', 'predicted_averageRating']
df_popularity[ratings_columns].plot(alpha = 0.5);
plt.savefig('/content/drive/MyDrive/regression_popularity_Ratings.png')

In [None]:
# plot observed against predicted number of votes and save the figure on Drive
votes_columns = ['numVotes', 'predicted_numVotes']
df_popularity[votes_columns].plot(alpha = 0.5);
plt.savefig('/content/drive/MyDrive/regression_popularity_Votes.png')

In [None]:
# load the series data from Drive and shuffle its rows
# (frac = 1 samples every row once; the index is renumbered afterwards)
df_series = pd.read_csv(
    '/content/drive/MyDrive/series_regression.csv',
    header = 0,
    encoding = 'utf-8',
)
df_series = df_series.sample(frac = 1)
df_series = df_series.reset_index(drop = True)
display(df_series)

In [None]:
# clean the data and arrange it for analysis: drop rows with missing values
df_series = df_series.dropna()
# remove the string tt from the series codes if needed
# df_series["seriesCode"] = df_series["seriesCode"].str.replace("tt","")

# season and episode numbers are read as text; the placeholder \N is replaced
# by 0 so that both columns can be converted to integers
for number_column in ("seasonNumber", "episodeNumber"):
  df_series[number_column] = df_series[number_column].str.replace(
      r'\\N', '0', regex = True)
df_series = df_series.astype({'seasonNumber':'int','episodeNumber':'int'})

# order the episodes by series, season and episode
df_series = df_series.sort_values(by=['seriesCode', 'seasonNumber',
                                      'episodeNumber'])
display(df_series)

In [None]:
# load the series codes from Drive and sort them
df_series_codes = pd.read_csv(
    '/content/drive/MyDrive/series_codes.csv',
    header = 0,
    encoding = 'utf-8',
)
df_series_codes = df_series_codes.sort_values(by=['seriesCode'])
# plain list of the (sorted) series codes
series_codes = df_series_codes["seriesCode"].values.tolist()
display(df_series_codes)

In [None]:
# Exploratory check of the predictions for one series (tt0040051).
# NOTE(review): the literals 71, [1] and 33 are kept as found; their meaning is
# not evident from this notebook - confirm or remove.
stereotypeNumber = [71, [1]]
temp = df_series.loc[(df_series["seriesCode"] == 'tt0040051')]
# number of episodes per predicted class
prediction_counts = temp['prediction'].value_counts()
print(temp['prediction'].sum())
print(prediction_counts)
# .get(label, 0) yields 0 for a class without any episode instead of raising
# the KeyError a plain [label] lookup would
print(prediction_counts.get(0, 0))
stereotypeNumber.append(prediction_counts.get(1, 0) + prediction_counts.get(0, 0))
stereotypeNumber.append(33)
print(stereotypeNumber)
display(temp["seasonNumber"])

In [None]:
# Walk through all series and display the episodes of each one together with
# its series code.
# The three lists below are created empty and stay empty: the statements that
# would fill them are commented out inside the loop.
stereotypeNumber = []
episodesNumber = []
seasonNumber = []
for i in series_codes:
  # all rows of df_series whose seriesCode equals the current code
  temp = df_series.loc[(df_series["seriesCode"] == i)]
  # Disabled per-series statistics (episodes predicted as 1, number of
  # episodes, highest season number).
  # NOTE(review): value_counts()[1] raises a KeyError for a series without any
  # episode predicted as 1 - handle that case before re-enabling.
  # stereotypeNumber.append(temp['prediction'].value_counts()[1])
  # episodesNumber.append(len(temp))
  # seasonNumber.append(temp["seasonNumber"].max())
  display(temp, i)