## Data wrangling

In [63]:
from google.colab import drive
drive.mount('/content/drive')

import os

path = 'Colab Notebooks/NLP/Project'

os.chdir(f'/content/drive/MyDrive/{path}')
os.getcwd()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/Colab Notebooks/NLP/Project'

In [64]:
dataset = 'cornell-movie-dialogs-small'

These are the different files found in the dataset, let's see what's inside of them

In [65]:
with open(f'{dataset}/movie_lines.txt', encoding='utf-8') as f:
    lines = f.readlines()

with open(f'{dataset}/movie_conversations.txt', encoding='utf-8') as f:
    conversations = f.readlines()

with open(f'{dataset}/movie_titles_metadata.txt', encoding='latin-1') as f:
    titles = f.readlines()

with open(f'{dataset}/movie_characters_metadata.txt', encoding='latin-1') as f:
    characters = f.readlines()

In [66]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The lines and conversation files that are conected with an ID of the line in the conversation.

> According to the dataset documentation these file contains
* 220,579 conversational exchanges between 10,292 pairs of movie characters
* involves 9,035 characters from 617 movies
* in total 304,713 utterances

Also on the titles file we can see

> Movie metadata included:
* genres
* release year
* IMDB rating
* number of IMDB votes
* IMDB rating

And this information for the characters file

> Character metadata included:
* gender (for 3,774 characters)
* position on movie credits (3,321 characters)


For easier visualization, we'll load the Loading them into a pandas dataframe, starting with the lines.


As we saw before, each column of the dataset is splitted by the string `+++$+++` then we'll set that as a parameter for the split function. Also we'll delete the last `\n` symbol from the line

In [67]:
import re

In [68]:
def clean_text(text): #adapted from https://github.com/REDFOX1899/Chatbot/blob/master/Chatbot.py
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)

    # from some common specific mistakes found in this dataset
    text = re.sub(r"youíre", "you are", text)
    text = re.sub(r"óó", "", text)

    return text

In [69]:
import pandas as pd

In [70]:
# create dataframe with lines
df_lines = pd.DataFrame({'line_text': lines})

# split into columns
df_lines = df_lines['line_text'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_lines.columns = ['line_ID', 'speaker_ID', 'movie','speaker','text']

# delete new line character and other blank character
df_lines = df_lines.applymap(lambda x: x.rstrip() if isinstance(x, str) else x)
df_lines['text'] = df_lines['text'].str.rstrip('\n')

# Add column clean text
df_lines['clean_text'] = df_lines['text'].apply(lambda x: clean_text(x))

#might be needed?
# # add column speaker + text
# df_lines['line'] = df_lines['speaker'] + ": " + df_lines['clean_text']

df_lines.head(5)

Unnamed: 0,line_ID,speaker_ID,movie,speaker,text,clean_text
0,L1045,u0,m0,BIANCA,They do not!,they do not
1,L1044,u2,m0,CAMERON,They do to!,they do to
2,L985,u0,m0,BIANCA,I hope so.,i hope so
3,L984,u2,m0,CAMERON,She okay?,she okay
4,L925,u0,m0,BIANCA,Let's go.,let's go


Now let's do the same thing with all the other txt files

In [71]:
import ast

In [72]:
# create dataframe with conversations
df_conv = pd.DataFrame({'conv': conversations})

# split into columns
df_conv = df_conv['conv'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_conv.columns = ['speaker1_ID', 'speaker2_ID', 'movie_ID','lines_list']

# delete new line character
df_conv['lines_list'] = df_conv['lines_list'].str.rstrip('\n')

# set lines_list to list type
df_conv['lines_list'] = df_conv['lines_list'].apply(ast.literal_eval)

df_conv.head(5)

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,lines_list
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


In [73]:
# create dataframe with conversations
df_title = pd.DataFrame({'title': titles})

# split into columns
df_title = df_title['title'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_title.columns = ['movie_ID','title','year','IMBD_rating','IMBD_votes','genres']

# cast types to what they are
df_title['IMBD_rating'] = df_title['IMBD_rating'].astype(float)
df_title['IMBD_votes'] = df_title['IMBD_votes'].astype(int)
# Clean 'year' column using regex (for cases like ' 1989/I ')
df_title['year'] = df_title['year'].apply(lambda x: re.sub(r'\D', '', x))  # \D means "no digit"
df_title['year'] = df_title['year'].astype(int)

# delete new line character
df_title['genres'] = df_title['genres'].str.rstrip('\n')

# set genres_list to list type
df_title['genres'] = df_title['genres'].apply(ast.literal_eval)
df_title.head(5)

Unnamed: 0,movie_ID,title,year,IMBD_rating,IMBD_votes,genres
0,m0,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,10421,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,25854,"[action, crime, drama, thriller]"
3,m3,2001: a space odyssey,1968,8.4,163227,"[adventure, mystery, sci-fi]"
4,m4,48 hrs.,1982,6.9,22289,"[action, comedy, crime, drama, thriller]"


In [74]:
# create dataframe with conversations
df_chars = pd.DataFrame({'characters': characters})

# split into columns
df_chars = df_chars['characters'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_chars.columns = ['character_ID','name','movie_ID','movie_title','gender','credits_pos']

# delete new line character
df_chars['credits_pos'] = df_chars['credits_pos'].str.rstrip('\n')

# cast credits_pos to int (-1 if unknown)
df_chars.loc[df_chars['credits_pos'] == '?', 'credits_pos'] = -1
df_chars['credits_pos'] = df_chars['credits_pos'].astype(int)  # max(df_chars['credits_pos']) = 1000

df_chars.head(5)

Unnamed: 0,character_ID,name,movie_ID,movie_title,gender,credits_pos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,-1
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,-1
4,u4,JOEY,m0,10 things i hate about you,m,6


For easier handle of the conversations and lines we'll join them together in one dataframe

In [75]:
# Create a guide index
df_conv['index'] = df_conv.index
# Expand list into all sub item lines
expanded_lines = df_conv.explode('lines_list')
# Reset index of expanded df
expanded_lines.reset_index(drop=True, inplace=True)
# merge the line with line ids
merged_df = pd.merge(
    expanded_lines,
    df_lines,
    left_on='lines_list',
    right_on='line_ID',
    how='inner'
)

# Select relevant columns and group by the guide index
merged_df = merged_df[['speaker1_ID', 'speaker2_ID', 'movie_ID', 'lines_list', 'index', 'clean_text']]
merged_df = merged_df.groupby(['index','speaker1_ID', 'speaker2_ID', 'movie_ID'])['clean_text'].apply(list).reset_index()
merged_df = merged_df.rename(columns={'clean_text': 'dialog'})

# convert dialog to string, not list
merged_df['dialog'] = merged_df['dialog'].apply(lambda x: ';'.join(x))
dialog_df = merged_df.drop(['index'], axis=1)

dialog_df

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog
0,u0,u2,m0,can we make this quick roxanne korrine and an...
1,u0,u2,m0,you are asking me out that is so cute what is...
2,u0,u2,m0,no no it's my fault we did not have a proper ...
3,u0,u2,m0,why;unsolved mystery she used to be really po...
4,u0,u2,m0,gosh if only we could find kat a boyfriend;let...
...,...,...,...,...
83092,u9028,u9031,m616,do you think she might be interested in someo...
83093,u9028,u9031,m616,choose your targets men that is right watch th...
83094,u9030,u9034,m616,colonel durnford william vereker i hear you h...
83095,u9030,u9034,m616,your orders mr vereker;i am to take the sikali...


In [76]:
dialog_df['dialog'][501]

'i can tell from the tone of your voice dave that you are upset why do not you take a stress pill and get some rest;hal i am in command of this ship i order you to release the manual hibernation control;i am sorry dave but in accordance with subroutine c15324 quote when the crew are dead or incapacitated the computer must assume control unquote i must therefore override your authority now since you are not in any condition to intel ligently exercise it;hal unless you follow my instructions i shall be forced to disconnect you'

In [77]:
dialog_df['movie_ID'][501]

'm3'

### Preparing dataframes

For the character metadata prediction we only need the line and the speaker and not the whole dialog. Then we'll predict the gender and the position on the movie credits based on one line. So let's put in a single dataframe line and character metadata

In [78]:
character_metadata = pd.merge(
    df_lines,
    df_chars,
    left_on='speaker_ID',
    right_on='character_ID',
    how='inner'

)

# Select relevant columns and group by the guide index
character_metadata = character_metadata[['line_ID', 'speaker_ID', 'movie_ID', 'speaker', 'clean_text', 'movie_title','gender','credits_pos']]
character_metadata = character_metadata.rename(columns={'clean_text': 'line'})

character_metadata

Unnamed: 0,line_ID,speaker_ID,movie_ID,speaker,line,movie_title,gender,credits_pos
0,L1045,u0,m0,BIANCA,they do not,10 things i hate about you,f,4
1,L985,u0,m0,BIANCA,i hope so,10 things i hate about you,f,4
2,L925,u0,m0,BIANCA,let's go,10 things i hate about you,f,4
3,L872,u0,m0,BIANCA,okay you are gonna need to learn how to lie,10 things i hate about you,f,4
4,L870,u0,m0,BIANCA,i am kidding you know how sometimes you just ...,10 things i hate about you,f,4
...,...,...,...,...,...,...,...,...
304708,L666522,u9034,m616,VEREKER,so far only their scouts but we have had repor...,zulu dawn,?,-1
304709,L666520,u9034,m616,VEREKER,well i assure you sir i have no desire to crea...,zulu dawn,?,-1
304710,L666372,u9034,m616,VEREKER,i think chelmsford wants a good man on the bor...,zulu dawn,?,-1
304711,L666370,u9034,m616,VEREKER,i am to take the sikali with the main column t...,zulu dawn,?,-1


And for the movie metadata prediction we will need the whole dialog. So, let's put the metadata into the dialogs dataframe to make things easier.

In [79]:
movie_metadata = pd.merge(
    dialog_df,
    df_title,
    left_on='movie_ID',
    right_on='movie_ID',
    how='inner'
)

movie_metadata

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog,title,year,IMBD_rating,IMBD_votes,genres
0,u0,u2,m0,can we make this quick roxanne korrine and an...,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,u0,u2,m0,you are asking me out that is so cute what is...,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
2,u0,u2,m0,no no it's my fault we did not have a proper ...,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
3,u0,u2,m0,why;unsolved mystery she used to be really po...,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
4,u0,u2,m0,gosh if only we could find kat a boyfriend;let...,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
...,...,...,...,...,...,...,...,...,...
83092,u9028,u9031,m616,do you think she might be interested in someo...,zulu dawn,1979,6.4,1911,"[action, adventure, drama, history, war]"
83093,u9028,u9031,m616,choose your targets men that is right watch th...,zulu dawn,1979,6.4,1911,"[action, adventure, drama, history, war]"
83094,u9030,u9034,m616,colonel durnford william vereker i hear you h...,zulu dawn,1979,6.4,1911,"[action, adventure, drama, history, war]"
83095,u9030,u9034,m616,your orders mr vereker;i am to take the sikali...,zulu dawn,1979,6.4,1911,"[action, adventure, drama, history, war]"


## Vectorizing

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [81]:
vec_lines = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english', max_features=1000)

### Vectorizing lines

In [82]:
lines_X = vec_lines.fit_transform(character_metadata['line'])
tfidf_df = pd.DataFrame(lines_X.toarray(), columns=vec_lines.get_feature_names_out())
tfidf_df

Unnamed: 0,able,absolutely,accept,accident,account,act,acting,action,actually,address,...,wrong,wrote,ya,yeah,year,years,yes,yesterday,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Vectorizing dialogs

In [83]:
vec_dial = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english', max_features=1000)

Using the vectorizer (computed in 'otherTasks' notebook) we transform de data to have the dialog in a way we can input it to the model

In [84]:
dialog_X = vec_dial.fit_transform(movie_metadata['dialog'])
tfidf_df = pd.DataFrame(dialog_X.toarray(), columns=vec_dial.get_feature_names_out())
tfidf_df

Unnamed: 0,able,absolutely,accept,accident,account,act,acting,actually,address,admit,...,wrong,wrote,ya,yeah,year,years,yes,yesterday,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
83093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
83094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.21258,0.0,0.0,0.0
83095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0


## Model Creation

### Character metadata models

**Input:** Line

**Output:**
Character metadata:

* gender
* position on movie *credits*

#### Gender model

The gender of a character is classified as `F`, `M` or `?`. As it has 3 classes, we must use a classifier model.

In [85]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [86]:
sample_size = 300  # Test size - to delete
y_gender = character_metadata.iloc[0:sample_size]['gender']  # Test size y - to delete
# y_gender = character_metadata['gender']  # Real size y - to uncomment
small_vec_X = vec_lines.fit_transform(character_metadata.iloc[0:sample_size]['line'])  # Test size X - to delete
# X_in_use = lines_X  # Real size X - to uncomment
X_in_use = small_vec_X  # Test X - to delete

X_train, X_test, y_train, y_test = train_test_split(X_in_use, y_gender, test_size=0.2, random_state=42)

# Using RandomForestClassifier for classification
gender_model = RandomForestClassifier()
gender_model.fit(X_train, y_train)

accuracy = gender_model.score(X_test, y_test)
print('Gender Prediction Accuracy:', accuracy)

Gender Prediction Accuracy: 0.5333333333333333


#### Credits position model

The position in the credits can be seen as continuous or as a categorical variable, since it's a number but it can also represent how important the character was in the movie. If characters are often ranked as "lead", "supporting", and "minor", it makes sense to classify them into categories. So we will try both Regression and Classification and for the last one classes are the position in the credits (`-1` for unknown, `1` for main character and so on, as it gets bigger the less important the character).

In [87]:
from sklearn.ensemble import RandomForestRegressor

In [88]:
sample_size = 300 # test - to delete
y_pos = character_metadata.iloc[0:sample_size]['credits_pos'] # test - to delete
# y_pos = character_metadata['credits_pos'] # real size y - to uncomment
small_vec_X = vectorizer.fit_transform(character_metadata.iloc[0:sample_size]['line']) # test - to delete
# X_in_use = lines_X # real size X - to uncomment
X_in_use = small_vec_X # test X - to delete

X_train, X_test, y_train, y_test = train_test_split(X_in_use, y_pos, test_size=0.2, random_state=42)

# Using RandomForestRegressor for regression
credits_pos_model_reg = RandomForestRegressor()
credits_pos_model_reg.fit(X_train, y_train)

accuracy = credits_pos_model_reg.score(X_test, y_test)
print('Credits position Accuracy:', accuracy)

Credits position Accuracy: -0.21733712818186546


In [89]:
sample_size = 300 # test - to delete
y_pos = character_metadata.iloc[0:sample_size]['credits_pos'] # test - to delete
# y_pos = character_metadata['credits_pos'] # real size y - to uncomment
small_vec_X = vectorizer.fit_transform(character_metadata.iloc[0:sample_size]['line']) # test - to delete
# X_in_use = lines_X # real size X - to uncomment
X_in_use = small_vec_X # test X - to delete

X_train, X_test, y_train, y_test = train_test_split(X_in_use, y_pos, test_size=0.2, random_state=42)

# Using RandomForestClassifier for classification
credits_pos_model_clas = RandomForestClassifier()
credits_pos_model_clas.fit(X_train, y_train)

accuracy = credits_pos_model_clas.score(X_test, y_test)
print('Credits position Accuracy:', accuracy)

Credits position Accuracy: 0.36666666666666664


The score method returns the coefficient of determination R^2
The values that can take are as follows:

* R^2=1 : Indicates that the model perfectly explains the variance in the test data.
* R^2>0 : Indicates that the model has some predictive power, capturing part of the variance in the data.
* R^2=0 : Indicates that the model explains none of the variance, equivalent to using the mean of the observed values.
* R^2<0 : Indicates that the model is worse than a simple mean, meaning the model is failing to capture the variance in the data.

It looks like it has a better accuracy if the credit position is treated as a clasification. So we'll use that one.

### Movie metadata models

**Input:** Dialog

**Output:** Movie metadata:

* genres
* release year
* IMDB rating
* number of IMDB votes

We will create a model for each metadata field

#### Release year model

In [90]:
sample_size = 300 # test - to delete
y_year = movie_metadata.iloc[0:sample_size]['year']  # test - to delete
#y_year = movie_metadata['year']  # real size y - to uncomment
small_vec_X = vec_dial.fit_transform(movie_metadata.iloc[0:sample_size]['dialog']) # test - to delete
# X_in_use = lines_X # real size X - to uncomment
X_in_use = small_vec_X # test X - to delete

X_train, X_test, y_train, y_test = train_test_split(small_vec_X, y_year, test_size=0.2, random_state=42)

release_year_model = RandomForestRegressor()
release_year_model.fit(X_train, y_train)

accuracy = release_year_model.score(X_test, y_test)
print('Release Year Prediction Accuracy:', accuracy)

Release Year Prediction Accuracy: 0.2689944802819675


#### IMDB rating model

In [93]:
sample_size = 300 # test - to delete
y_rate = movie_metadata.iloc[0:sample_size]['IMBD_rating']  # test - to delete
#y_rate = movie_metadata['IMBD_rating']  # real size y - to uncomment
small_vec_X = vectorizer.fit_transform(movie_metadata.iloc[0:sample_size]['dialog']) # test - to delete
# X_in_use = lines_X # real size X - to uncomment
X_in_use = small_vec_X # test X - to delete

X_train, X_test, y_train, y_test = train_test_split(small_vec_X, y_rate, test_size=0.2, random_state=42)

imdb_rating_model = RandomForestRegressor()
imdb_rating_model.fit(X_train, y_train)

accuracy = imdb_rating_model.score(X_test, y_test)
print('IMDB rating Prediction Accuracy:', accuracy)

IMDB rating Prediction Accuracy: 0.27930733642047323


#### Number of votes model

In [94]:
sample_size = 300 # test - to delete
y_votes = movie_metadata.iloc[0:sample_size]['IMBD_votes']  # test - to delete
#y_votes = movie_metadata['IMBD_votes']  # real size y - to uncomment
small_vec_X = vec_dial.fit_transform(movie_metadata.iloc[0:sample_size]['dialog']) # test - to delete
# X_in_use = lines_X # real size X - to uncomment
X_in_use = small_vec_X # test X - to delete

X_train, X_test, y_train, y_test = train_test_split(small_vec_X, y_votes, test_size=0.2, random_state=42)

votes_model = RandomForestRegressor()
votes_model.fit(X_train, y_train)

accuracy = votes_model.score(X_test, y_test)
print('IMDB rating Prediction Accuracy:', accuracy)

IMDB rating Prediction Accuracy: 0.28853147062131657


#### Genres model

This is particular case, since what we are trying to predict is a list of elements (genres). This problem is called MultiLabel classification, that is why we'll use a MultiLabelBinerizer

In [121]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [123]:
sample_size = 300 # test - to delete

# Binarize the list of genres
mlb = MultiLabelBinarizer()
y_genres = mlb.fit_transform(movie_metadata.iloc[0:sample_size]['genres']) # test - to delete
#y_genres = mlb.fit_transform(movie_metadata['genres'])  # real size y - to uncomment
small_vec_X = vec_dial.fit_transform(movie_metadata.iloc[0:sample_size]['dialog']) # test - to delete
# X_in_use = lines_X # real size X - to uncomment
X_in_use = small_vec_X # test X - to delete

X_train, X_test, y_train, y_test = train_test_split(small_vec_X, y_genres, test_size=0.2, random_state=42)

genres_model = OneVsRestClassifier(RandomForestClassifier())
genres_model.fit(X_train, y_train)

accuracy = genres_model.score(X_test, y_test)
print('Genre Prediction Accuracy:', accuracy)

Genre Prediction Accuracy: 0.7333333333333333


## Meta data prediction task


```
# Esto tiene formato de código
```



### Prediction of character metadata for an specific line.

In [112]:
def predict_character_metadata(line):
    line_vector = vec_lines.transform([line])

    gender = gender_model.predict(line_vector)
    credits_pos = credits_pos_model_clas.predict(line_vector)
    # credits_pos_model_clas has a higher accuracy than credits_pos_model_reg

    return {
        'gender': gender[0],
        'credits_pos': credits_pos[0]
    }


### Prediction of movie metadata for specific dialog.

In [133]:
def predict_movie_metadata(dialogue):
    dialogue_vector = vec_dial.transform([dialogue])

    genres = mlb.inverse_transform(genres_model.predict(dialogue_vector))
    release_year = release_year_model.predict(dialogue_vector)
    imdb_rating = imdb_rating_model.predict(dialogue_vector)
    number_of_votes = votes_model.predict(dialogue_vector)

    return {
        'genres': list(genres[0]),
        'year': int(release_year[0]),
        'IMBD_rating': round(imdb_rating[0],1),
        'IMBD_votes': int(number_of_votes[0])
    }


## Testing

### Testing with real data from dataset

In [None]:
import numpy as np

#### Character

In [117]:
random_index = np.random.randint(len(character_metadata))
test1 = character_metadata.loc[random_index]['line']
predictions = predict_character_metadata(test1)

print(f'Index: {random_index}')
print(f'Line: {test1}')
print('Results:')
print("-------------------------------------------------")
for i in predictions:
    print(f'Real {i}:     \t {character_metadata.loc[random_index][i]}')
    print(f'Predicted {i}:\t {predictions[i]}')
    print()

Index: 124511
Line: yeah bring it on
-------------------------------------------------
Results:
Real gender:     	 m
Predicted gender:	 m

Real credits_pos:     	 1
Predicted credits_pos:	 3



#### Movie

In [134]:
random_index = np.random.randint(len(movie_metadata))
test2 = movie_metadata.loc[random_index]['dialog']
predictions = predict_movie_metadata(test2)

print(f'Index: {random_index}')
print(f'Dialog: {test2}')
print('Results:')
print("-------------------------------------------------")
for i in predictions:
    print(f'Real {i}:\t {movie_metadata.loc[random_index][i]}')
    print(f'Predicted {i}:\t {predictions[i]}')
    print()

Index: 44412
Dialog: jennifer;no it's mom;mom
Results:
-------------------------------------------------
Real genres:	 ['sci-fi', 'horror']
Predicted genres:	 ['comedy', 'romance']

Real year:	 1987
Predicted year:	 1996

Real IMBD_rating:	 4.5
Predicted IMBD_rating:	 6.6

Real IMBD_votes:	 627
Predicted IMBD_votes:	 41190



### Testing with fictional data

In [118]:
predict_character_metadata("Hey, I think I'm in love with you")

{'gender': 'm', 'credits_pos': 3}

In [141]:
predict_character_metadata("Do you want to play football?")

{'gender': 'm', 'credits_pos': 6}

In [131]:
predict_movie_metadata("Hi; Hey how are you?; I'm really scared, I think there is some on in my house")

{'genres': [('comedy', 'romance')],
 'year': 1998,
 'IMBD_rating': 6.9,
 'IMBD_votes': 62322}

In [142]:
predict_movie_metadata("I want 100000 dollars; How will you get all that money?; I'll rob a bank")

{'genres': ['drama'], 'year': 1996, 'IMBD_rating': 6.3, 'IMBD_votes': 31523}