# Leveraging Video-On-Demand streaming data for early forecast of a movie's success using Gradient Boosting Machines and advanced feature engineering techniques.
## Introduction
The goal of this project is to create a decision support system to aid movie investments at the early stage of a movie's production. The system predicts the success of a movie based on a streaming rank scoring measure by leveraging historical data from various sources. Using social network analysis and advanced natural language processing (NLP) techniques, the system automatically extracts several groups of features, including the “who” (cast and crew), the “what” (the plot), as well as “hybrid” features that match “who” with “what”. To support investment decisions on a movie, the model can only be given information that is available at the very early stage of the movie’s production, that is, while the movie is still being planned. Predictions that are made right before or after the official release may have more data to use and achieve more accurate results, but they come too late for investors to make any meaningful decision.

# Setup

## Python libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
#from sqlalchemy import create_engine
#import requests
#import json
#import time
#import difflib
from random import choice
from itertools import combinations
from TextRank4Keyword import TextRank4Keyword # PageRank based keyword extraction
# import stopwords from nltk
import nltk
import networkx as nx # Graph analyses
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import lightgbm as lgb # Prediction model
from utils import *
#import os

# Make sure the NLTK stopword corpus is available locally before it is used
# (the log printed below shows this is a no-op when it is already downloaded).
nltk.download('stopwords')
from nltk.corpus import stopwords

# Folder prefix for the data files read in this notebook; file names are
# appended with plain string concatenation, so the trailing slash matters.
data_path = './data/'
countries = ['Mexico', 'Brazil', 'United States'] # Can be expanded to other countries

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alejandroleda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Loading

### Training Set

In [12]:
# Load the pickled training tables from data_path.
# NOTE(review): unpickling executes arbitrary code, so these files must come
# from a trusted source.
rank = pd.read_pickle(data_path + 'netflixmovie_rank.pkl')
titles = pd.read_pickle(data_path + 'titles.pkl')
talent = pd.read_pickle(data_path + 'talent.pkl')
release_dates = pd.read_pickle(data_path + 'netflixmovie_release_dates.pkl')

In [5]:
# Main training table; later cells read its 'country', 'jw_entity_id', 'score'
# and 'short_description' columns.
main_df = pd.read_pickle(data_path + 'netflixmoviemain_df.pkl')
main_df

Unnamed: 0,country,jw_entity_id,rank,is_nflx_original,score,date,age_certification,object_type,original_release_year,original_title,...,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12
0,Argentina,tm1000599,,,1.0,2021-11-07,,movie,2021.0,A Última Floresta,...,,,,,,,,,,
1,Argentina,tm1000619,,,1.0,2022-05-07,,movie,2022.0,రాధే శ్యామ్,...,,,,,,,,,,
2,Argentina,tm1001097,,,1.0,2022-06-29,R,movie,2022.0,Beauty,...,romance,,,,,,,,,
3,Argentina,tm1001912,,,1.0,2022-03-02,,movie,2021.0,Trust,...,romance,,,,,,,,,
4,Argentina,tm1003034,,,1.0,2021-08-23,,movie,2021.0,The Witcher: Nightmare of the Wolf,...,scifi,animation,action,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25831,Venezuela,tm996762,,,1.0,2022-07-07,,movie,2022.0,మేజర్,...,,,,,,,,,,
25832,Venezuela,tm998033,,,1.0,2021-11-18,,movie,2021.0,டாக்டர்,...,comedy,crime,,,,,,,,
25833,Venezuela,tm998992,,,1.0,2022-09-07,PG,movie,2021.0,竜とそばかすの姫,...,fantasy,music,scifi,,,,,,,
25834,Venezuela,tm999817,,,1.0,2021-12-01,,movie,2021.0,白蛇 II：青蛇劫起,...,action,,,,,,,,,


In [8]:
# Attach each title's release year and 'genre_1' to every talent credit row
talent = talent.merge(titles[['jw_entity_id', 'original_release_year', 'genre_1']], on='jw_entity_id', how='left')

# Create a feature called 'tenure' to measure the number of years between the earliest movie and the latest movie of each talent.
# The built-in 'max'/'min' transforms are vectorised, which avoids calling a Python lambda once per person.
release_year_by_person = talent.groupby('person_id')['original_release_year']
talent['tenure'] = release_year_by_person.transform('max') - release_year_by_person.transform('min')

talent

Unnamed: 0,role,character_name,person_id,name,title,jw_entity_id,original_release_year,genre_1
0,ACTOR,Janaki,68294,Meena,Avvai Shanmugi,tm110160,1996.0,drama
1,ACTOR,Joseph,145348,Nagesh,Avvai Shanmugi,tm110160,1996.0,drama
2,ACTOR,Bhai,45436,Nassar,Avvai Shanmugi,tm110160,1996.0,drama
3,ACTOR,Rathna,432833,Heera Rajgopal,Avvai Shanmugi,tm110160,1996.0,drama
4,ACTOR,Kousi,471253,Rani,Avvai Shanmugi,tm110160,1996.0,drama
...,...,...,...,...,...,...,...,...
959963,EDITOR,,15210,Larry Bock,Remember the Daze,tm73324,2008.0,comedy
959964,EXECUTIVE_PRODUCER,,618188,Kevin Loughery,Remember the Daze,tm73324,2008.0,comedy
959965,ORIGINAL_MUSIC_COMPOSER,,33249,Dustin O'Halloran,Remember the Daze,tm73324,2008.0,comedy
959966,PRODUCER,,17275,Matthew Rhodes,Remember the Daze,tm73324,2008.0,comedy


### Plot scoring

In [15]:
# Build 'plots': a pandas Series of each movie's 'short_description', indexed by the movie's 'jw_entity_id'
plots = pd.Series(
    data=main_df['short_description'].values,
    index=main_df['jw_entity_id'],
)

# Keep only the first description seen for every 'jw_entity_id' (drop duplicate index values)
plots = plots.loc[~plots.index.duplicated(keep='first')]

def extract_keywords(plot_series):
    """Extract weighted noun keywords from every plot in ``plot_series``.

    Each plot is analysed with ``TextRank4Keyword`` (nouns only, window of 4,
    original casing kept). English stopwords and keywords containing
    punctuation are then removed, and the remaining weights are normalised so
    that they sum to 1 within each movie.

    Parameters
    ----------
    plot_series : pandas.Series
        Plot text, indexed by the movie identifier.

    Returns
    -------
    pandas.DataFrame
        One row per (movie, keyword) with the columns ``keyword``,
        ``node_weight``, ``jw_entity_id`` (the index label of the plot) and
        ``node_weight_normalized``. An empty frame with these columns is
        returned when no keyword could be extracted.
    """
    tr4w = TextRank4Keyword()
    rows = []

    print('Extracting keywords from the plot of each of {} movies...'.format(len(plot_series)))

    # iterate through each movie's plot
    # (Series.items() replaces Series.iteritems(), which was removed in pandas 2.0)
    for movie_id, plot in tqdm(plot_series.items(), total=len(plot_series)):
        try:
            tr4w.analyze(plot, candidate_pos=['NOUN'], window_size=4, lower=False)
        except TypeError:
            # Best effort: skip plots the analyser cannot handle
            # (presumably missing / non-string descriptions).
            continue
        rows.extend((keyword, weight, movie_id) for keyword, weight in tr4w.node_weight.items())

    # Build the frame once instead of calling DataFrame.append inside the loop
    # (quadratic, and removed in pandas 2.0).
    keywords = pd.DataFrame(rows, columns=['keyword', 'node_weight', 'jw_entity_id'])

    if keywords.empty:
        # Nothing was extracted: return the same schema rather than failing on the filters below
        keywords['node_weight_normalized'] = pd.Series(dtype='float64')
    else:
        # Drop stopwords from the keywords dataframe
        keywords = keywords[~keywords['keyword'].isin(stopwords.words('english'))]

        # Drop punctuation from the keywords dataframe
        keywords = keywords[~keywords['keyword'].str.contains(r'[^\w\s]')].copy()

        # Normalize node weights by dividing by the sum of all node weights for each movie
        weight_total_per_movie = keywords.groupby('jw_entity_id')['node_weight'].transform('sum')
        keywords['node_weight_normalized'] = keywords['node_weight'] / weight_total_per_movie

    print('Total number of keywords extracted: {}'.format(len(keywords)))

    return keywords

def score_keywords(plot_series):
    """Score the keywords of ``plot_series`` with the movie scores in ``main_df``.

    Keywords are extracted with ``extract_keywords`` and inner-joined to the
    ('country', 'jw_entity_id', 'score') rows of the global ``main_df``. Each
    normalised keyword weight is multiplied by the movie score, and the sum of
    those products per (keyword, country) is attached to every row as
    'node_weight_scored_by_keyword_and_country'.

    Returns the per-row dataframe with both the row-level 'node_weight_scored'
    and the (keyword, country) total.
    """
    group_keys = ['keyword', 'country']
    movie_scores = main_df[['country', 'jw_entity_id', 'score']]

    # One row per keyword occurrence and per country in which the movie has a score;
    # the normalised weight is weighed by that score.
    keywords_score = (
        extract_keywords(plot_series)
        .merge(movie_scores, on='jw_entity_id', how='inner')
        .assign(node_weight_scored=lambda df: df['node_weight_normalized'] * df['score'])
    )

    # Total scored weight of every keyword within every country
    totals = keywords_score.groupby(group_keys)['node_weight_scored'].sum().reset_index()

    # Attach the totals back onto the row-level table; the total column gets the suffix
    return keywords_score.merge(totals, on=group_keys, suffixes=('', '_by_keyword_and_country'))

def score_plots(plot_series, keywords_scored):
    """Sum the keyword totals into one plot score per movie and country.

    ``plot_series`` is accepted for signature compatibility but is not used.
    ``keywords_scored`` must contain the columns 'jw_entity_id', 'country' and
    'node_weight_scored_by_keyword_and_country'.

    Returns a dataframe with one row per ('jw_entity_id', 'country') holding
    the summed 'node_weight_scored_by_keyword_and_country'.
    """
    score_column = 'node_weight_scored_by_keyword_and_country'
    per_movie_and_country = keywords_scored.groupby(['jw_entity_id', 'country'])
    totals = per_movie_and_country[score_column].sum()
    return totals.reset_index()

# Score the training-set keywords, then aggregate them into one plot score per movie and country
scored_keywords = score_keywords(plots)
scored_plots = score_plots(plots, scored_keywords)
scored_plots

4it [00:00, 38.39it/s]

Extracting keywords from the plot of each of 4130 movies...


4130it [00:49, 82.64it/s]


Total number of keywords extracted: 41148


Unnamed: 0,jw_entity_id,country,node_weight_scored_by_keyword_and_country
0,tm10,Hungary,22.590855
1,tm10,India,167.322700
2,tm10,South Africa,126.902840
3,tm1000037,France,70.888013
4,tm1000037,Greece,90.606945
...,...,...,...
25786,tm999927,South Africa,17.881518
25787,tm999927,Thailand,109.000463
25788,tm999927,United Kingdom,25.946371
25789,tm999927,Venezuela,43.813079


### Prediction Set

In [2]:
pred_set = pd.read_csv(data_path + 'project_form - movie.csv')

# make all column names lowercase
pred_set.columns = pred_set.columns.str.lower()

# Remember which rows carried their own title before forward-filling it onto the rows
# below (presumably the additional name/role rows of the same project)
mask = ~pred_set['title'].isna()
pred_set['title'] = pred_set['title'].ffill()

# Create a 'pred_set_talent' dataframe that contains only the 'title', 'name', and 'role' columns.
# dropna() returns a new frame, which avoids the SettingWithCopyWarning raised by an
# in-place dropna on a column slice of 'pred_set'.
pred_set_talent = pred_set[['title', 'name', 'role']].dropna()
pred_set_talent

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_set_talent.dropna(inplace=True)


Unnamed: 0,title,name,role
0,WAY DOWN,Liam Cunningham,ACTOR
1,WAY DOWN,Astrid Bergès-Frisbey,ACTOR
2,WAY DOWN,Freddie Highmore,ACTOR
3,WAY DOWN,Jaume Balagueró,DIRECTOR
4,WAY DOWN,Álvaro Augustín,PRODUCER
...,...,...,...
5561,MUSK,ALEX GIBNEY,DIRECTOR
5562,MUSK,Black Bear,PRODUCER
5563,MUSK,JIGSAW PRODUCTIONS,PRODUCER
5564,MUSK,CLOSER MEDIA,PRODUCER


In [3]:
# Keep only the rows selected by 'mask', discard the talent columns,
# and drop rows with missing values for 'plot'
pred_set = (
    pred_set.loc[mask]
    .drop(columns=['name', 'role'])
    .dropna(subset=['plot'])
)

pred_set

Unnamed: 0,title,plot,age_certification,genre_1,genre_2,genre_3,comentarios_vivi,budget,ask,sales,market,status
0,WAY DOWN,The Bank of Spain is like no other. An absolut...,PG-13,action,thriller,,,,,TF1,,
7,THE GOOD BOSS,It’s a sharp and nuanced dark comedy about the...,PG-13,comedy,drama,,"Es una comedia negra, por momentos bastante di...",3000000,250000,MK2,,
9,LOIS WAIN,Louis Wain: Unconventional while iconic. Candi...,R,drama,,,,,,,,
10,RIO,"Set against the exotic backdrop of Brazil, thi...",R,thriller,action,,,,,STUDIOCANAL,EFM 2022,
17,EMILY,Emily (Emma Mackey) wears a mask. The world te...,R,drama,,,Tipica historia tipo Pride and Prejudice. Se p...,Budget £8m,Asking para Latam: US$600k.,EMBANKMENT,Cannes 2022,Status Post-Production. Delivery Q1 2022
...,...,...,...,...,...,...,...,...,...,...,...,...
5540,CINNAMON,This darkly comedic heist thriller follows asp...,PG-13,thriller,comedy,,,,Ask: 75K,VILLAGE ROADSHOW,CANNES 2023,
5544,THE SALTED PATH,An honest and life-affirming true story of the...,R,drama,,,,,Ask: 450K,ROCKET SCIENCE,CANNES 2023,Pre prod. Shooting Date: 5th June 2023
5549,CONTROL,"Wallace Conway, a troubled doctor who increasi...",PG-13,thriller,drama,,"Lei las primeras 40 paginas, es excelente, atr...",,,STUDIOCANAL,CANNES 2023,
5553,CLIFFHANGER,Sylvester Stallone will reprise his character ...,PG-13,action,,,,,,ROCKET SCIENCE,CANNES 2023,Shooting Date: September 2023


In [31]:
# Create a pandas series called 'pred_set_plots' with the index as the movie's 'title' and the value as the movie's 'plot'
pred_set_plots = pd.Series(pred_set['plot'].values, index=pred_set['title'])
pred_set_keywords = extract_keywords(pred_set_plots)
pred_set_keywords.rename(columns={'jw_entity_id':'title'}, inplace=True)
pred_set_keywords

1it [00:00,  8.10it/s]

Extracting keywords from the plot of each of 905 movies...


905it [00:16, 54.71it/s]


Total number of keywords extracted: 15664


Unnamed: 0,keyword,node_weight,title,node_weight_normalized
0,bank,1.875803,WAY DOWN,0.065588
1,blueprints,0.150000,WAY DOWN,0.005245
2,maps,0.150000,WAY DOWN,0.005245
3,data,0.812003,WAY DOWN,0.028392
4,vault,1.895684,WAY DOWN,0.066283
...,...,...,...,...
15732,Guys,0.150000,MUSK,0.023810
15733,sister,0.955729,MUSK,0.151703
15734,company,1.144323,MUSK,0.181639
15735,MEDIA,1.144323,MUSK,0.181639


In [32]:
# Create a dataframe called 'pred_set_keywords_scored' which is the inner merge of 'pred_set_keywords' and 'scored_keywords' on 'keyword'
pred_set_keywords_scored = pred_set_keywords.drop('node_weight', axis=1).merge(scored_keywords[['keyword', 'country', 'node_weight_scored_by_keyword_and_country']], on=['keyword'], how='inner')
pred_set_keywords_scored['node_weight_scored_by_keyword_and_country'] = pred_set_keywords_scored['node_weight_scored_by_keyword_and_country'] * pred_set_keywords_scored['node_weight_normalized']
pred_set_keywords_scored.drop(columns=['node_weight_normalized'], inplace=True)
pred_set_keywords_scored.drop_duplicates(inplace=True)
pred_set_keywords_scored

Unnamed: 0,keyword,title,country,node_weight_scored_by_keyword_and_country
0,bank,WAY DOWN,Argentina,0.090901
13,bank,WAY DOWN,Brazil,0.152433
25,bank,WAY DOWN,Chile,0.076533
36,bank,WAY DOWN,Colombia,0.073204
47,bank,WAY DOWN,Czech Republic,0.034355
...,...,...,...,...
6076455,Blood,MUSK,Czech Republic,0.003968
6076456,Blood,MUSK,Hungary,0.003968
6076457,Blood,MUSK,Indonesia,0.015873
6076458,Blood,MUSK,Romania,0.194444


In [33]:
# Create a dataframe called 'pred_set_plots_scored' which is the sum of 'node_weight_scored_by_keyword_and_country' grouped by 'title' and 'country'
pred_set_plots_scored = pred_set_keywords_scored.groupby(['title', 'country'])['node_weight_scored_by_keyword_and_country'].sum().reset_index()
pred_set_plots_scored

Unnamed: 0,title,country,node_weight_scored_by_keyword_and_country
0,10 Lives,Argentina,4.481022
1,10 Lives,Austria,4.577243
2,10 Lives,Belgium,3.445158
3,10 Lives,Brazil,3.892160
4,10 Lives,Canada,3.926544
...,...,...,...
32369,ZOYA,Thailand,18.053885
32370,ZOYA,Turkey,0.496701
32371,ZOYA,United Kingdom,3.963960
32372,ZOYA,United States,5.337483


In [34]:
# Work on the first configured country and keep only its rows of the main table
country = countries[0]
local_df = main_df.loc[main_df['country'].eq(country)]
local_df

Unnamed: 0,country,jw_entity_id,rank,is_nflx_original,score,date,age_certification,object_type,original_release_year,original_title,...,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12
14809,Mexico,tm1000037,,,1.0,2021-09-23,R,movie,2021.0,Je suis Karl,...,european,romance,,,,,,,,
14810,Mexico,tm1000599,,,1.0,2021-11-07,,movie,2021.0,A Última Floresta,...,,,,,,,,,,
14811,Mexico,tm1000619,,,1.0,2022-05-06,,movie,2022.0,రాధే శ్యామ్,...,,,,,,,,,,
14812,Mexico,tm1001097,,,1.0,2022-06-29,R,movie,2022.0,Beauty,...,romance,,,,,,,,,
14813,Mexico,tm1002815,,,1.0,2021-09-15,,movie,2021.0,Nightbooks,...,fantasy,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15785,Mexico,tm996762,,,1.0,2022-07-07,,movie,2022.0,మేజర్,...,,,,,,,,,,
15786,Mexico,tm998188,,,1.0,2021-12-01,,movie,2021.0,Donde caben dos,...,,,,,,,,,,
15787,Mexico,tm998992,,,1.0,2022-09-07,PG,movie,2021.0,竜とそばかすの姫,...,fantasy,music,scifi,,,,,,,
15788,Mexico,tm999817,,,1.0,2021-12-01,,movie,2021.0,白蛇 II：青蛇劫起,...,action,,,,,,,,,


## Utility functions

In [2]:
def prep_pred_set(pred_main, pred_talent, pred_keywords, kw_by_country, country, cols2measure):
    """Assemble the feature table of the prediction set for one country.

    Steps:
      1. Join ``pred_keywords`` to the rows of ``kw_by_country`` for ``country``
         and sum 'weighted_score' per 'jw_entity_id' to get a plot score.
      2. Join that score onto ``pred_main`` ('title' matched against
         'jw_entity_id'), rename it to 'plot_score' and index the result by
         'jw_entity_id'.
      3. Match ``pred_talent`` with the global ``local_talent`` on
         ('name', 'role'); when there are matches, aggregate ``cols2measure``
         (sum and mean) per ('title_x', 'role'), pivot the roles into columns,
         fill gaps with 0 and join the result onto the feature table.
      4. Set 'runtime' to the rounded mean 'runtime' of the global ``local_df``.

    NOTE(review): ``local_talent`` and ``local_df`` are read from module scope
    and must be defined before this function is called.

    Returns the feature dataframe indexed by 'jw_entity_id'.
    """
    # Plot score: total weighted keyword score of each movie in the requested country
    country_keywords = kw_by_country[kw_by_country['country'] == country]
    plot_scores = (
        pred_keywords.merge(country_keywords)
        .groupby('jw_entity_id')[['weighted_score']]
        .sum()
        .reset_index()
    )

    features = (
        pred_main.merge(plot_scores, left_on='title', right_on='jw_entity_id')
        .drop(columns='title')
        .rename(columns={'weighted_score': 'plot_score'})
        .set_index('jw_entity_id')
    )

    # Historical measures of the talent attached to each movie
    talent_history = pred_talent.merge(local_talent, on=['name', 'role'])
    if not talent_history.empty:
        talent_stats = (
            talent_history.groupby(['title_x', 'role'])[cols2measure]
            .agg(['sum', 'mean'])
            .unstack()
            .fillna(0)
        )
        features = features.join(talent_stats)

    # The same runtime value (rounded mean of local_df) is assigned to every row
    features['runtime'] = np.round(local_df['runtime'].mean())
    return features

