## Data wrangling

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os

path = 'Colab Notebooks/NLP/Project'

os.chdir(f'/content/drive/MyDrive/{path}')
os.getcwd()


Mounted at /content/drive


'/content/drive/MyDrive/Colab Notebooks/NLP/Project'

In [2]:
dataset = 'cornell-movie-dialogs-small'

These are the different files found in the dataset, let's see what's inside of them

In [3]:
with open(f'{dataset}/movie_lines.txt', encoding='utf-8') as f:
    lines = f.readlines()

with open(f'{dataset}/movie_conversations.txt', encoding='utf-8') as f:
    conversations = f.readlines()

with open(f'{dataset}/movie_titles_metadata.txt', encoding='latin-1') as f:
    titles = f.readlines()

with open(f'{dataset}/movie_characters_metadata.txt', encoding='latin-1') as f:
    characters = f.readlines()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The lines and conversation files that are conected with an ID of the line in the conversation.

> According to the dataset documentation these file contains
* 220,579 conversational exchanges between 10,292 pairs of movie characters
* involves 9,035 characters from 617 movies
* in total 304,713 utterances

Also on the titles file we can see

> Movie metadata included:
* genres
* release year
* IMDB rating
* number of IMDB votes
* IMDB rating

And this information for the characters file

> Character metadata included:
* gender (for 3,774 characters)
* position on movie credits (3,321 characters)


For easier visualization, we'll load the Loading them into a pandas dataframe, starting with the lines.


As we saw before, each column of the dataset is splitted by the string `+++$+++` then we'll set that as a parameter for the split function. Also we'll delete the last `\n` symbol from the line

In [5]:
import re

In [6]:
def clean_text(text): #adapted from https://github.com/REDFOX1899/Chatbot/blob/master/Chatbot.py
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)

    # from some common specific mistakes found in this dataset
    text = re.sub(r"youíre", "you are", text)
    text = re.sub(r"óó", "", text)

    return text

In [7]:
import pandas as pd

In [8]:
# create dataframe with lines
df_lines = pd.DataFrame({'line_text': lines})

# split into columns
df_lines = df_lines['line_text'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_lines.columns = ['line_ID', 'speaker_ID', 'movie','speaker','text']

# delete new line character and other blank character
df_lines = df_lines.applymap(lambda x: x.rstrip() if isinstance(x, str) else x)
df_lines['text'] = df_lines['text'].str.rstrip('\n')

# Add column clean text
df_lines['clean_text'] = df_lines['text'].apply(lambda x: clean_text(x))

#might be needed?
# # add column speaker + text
# df_lines['line'] = df_lines['speaker'] + ": " + df_lines['clean_text']

df_lines.head(5)

Unnamed: 0,line_ID,speaker_ID,movie,speaker,text,clean_text
0,L1045,u0,m0,BIANCA,They do not!,they do not
1,L1044,u2,m0,CAMERON,They do to!,they do to
2,L985,u0,m0,BIANCA,I hope so.,i hope so
3,L984,u2,m0,CAMERON,She okay?,she okay
4,L925,u0,m0,BIANCA,Let's go.,let's go


Now let's do the same thing with all the other txt files

In [9]:
import ast

In [10]:
# create dataframe with conversations
df_conv = pd.DataFrame({'conv': conversations})

# split into columns
df_conv = df_conv['conv'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_conv.columns = ['speaker1_ID', 'speaker2_ID', 'movie_ID','lines_list']

# delete new line character
df_conv['lines_list'] = df_conv['lines_list'].str.rstrip('\n')

# set lines_list to list type
df_conv['lines_list'] = df_conv['lines_list'].apply(ast.literal_eval)

df_conv.head(5)

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,lines_list
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


In [11]:
# create dataframe with conversations
df_title = pd.DataFrame({'title': titles})

# split into columns
df_title = df_title['title'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_title.columns = ['movie_ID','title','year','IMBD_rating','IMBD_votes','genres']

# cast types to what they are
df_title['IMBD_rating'] = df_title['IMBD_rating'].astype(float)
df_title['IMBD_votes'] = df_title['IMBD_votes'].astype(int)
# Clean 'year' column using regex (for cases like ' 1989/I ')
df_title['year'] = df_title['year'].apply(lambda x: re.sub(r'\D', '', x))  # \D means "no digit"
df_title['year'] = df_title['year'].astype(int)

# delete new line character
df_title['genres'] = df_title['genres'].str.rstrip('\n')

# set genres_list to list type
df_title['genres'] = df_title['genres'].apply(ast.literal_eval)
df_title.head(5)

Unnamed: 0,movie_ID,title,year,IMBD_rating,IMBD_votes,genres
0,m0,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,10421,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,25854,"[action, crime, drama, thriller]"
3,m3,2001: a space odyssey,1968,8.4,163227,"[adventure, mystery, sci-fi]"
4,m4,48 hrs.,1982,6.9,22289,"[action, comedy, crime, drama, thriller]"


In [12]:
# create dataframe with conversations
df_chars = pd.DataFrame({'characters': characters})

# split into columns
df_chars = df_chars['characters'].str.split(r' \+\+\+\$\+\+\+ ', expand=True)
df_chars.columns = ['character_ID','name','movie_ID','movie_title','gender','credits_pos']

# delete new line character
df_chars['credits_pos'] = df_chars['credits_pos'].str.rstrip('\n')

# cast credits_pos to int (-1 if unknown)
df_chars.loc[df_chars['credits_pos'] == '?', 'credits_pos'] = -1
df_chars['credits_pos'] = df_chars['credits_pos'].astype(int)  # max(df_chars['credits_pos']) = 1000

df_chars.head(5)

Unnamed: 0,character_ID,name,movie_ID,movie_title,gender,credits_pos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,-1
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,-1
4,u4,JOEY,m0,10 things i hate about you,m,6


For easier handle of the conversations and lines we'll join them together in one dataframe

In [13]:
# Create a guide index
df_conv['index'] = df_conv.index
# Expand list into all sub item lines
expanded_lines = df_conv.explode('lines_list')
# Reset index of expanded df
expanded_lines.reset_index(drop=True, inplace=True)
# merge the line with line ids
merged_df = pd.merge(
    expanded_lines,
    df_lines,
    left_on='lines_list',
    right_on='line_ID',
    how='inner'
)

# Select relevant columns and group by the guide index
merged_df = merged_df[['speaker1_ID', 'speaker2_ID', 'movie_ID', 'lines_list', 'index', 'clean_text']]
merged_df = merged_df.groupby(['index','speaker1_ID', 'speaker2_ID', 'movie_ID'])['clean_text'].apply(list).reset_index()
merged_df = merged_df.rename(columns={'clean_text': 'dialog'})

# convert dialog to string, not list
merged_df['dialog'] = merged_df['dialog'].apply(lambda x: ';'.join(x))
dialog_df = merged_df.drop(['index'], axis=1)

dialog_df

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog
0,u0,u2,m0,can we make this quick roxanne korrine and an...
1,u0,u2,m0,you are asking me out that is so cute what is...
2,u0,u2,m0,no no it's my fault we did not have a proper ...
3,u0,u2,m0,why;unsolved mystery she used to be really po...
4,u0,u2,m0,gosh if only we could find kat a boyfriend;let...
...,...,...,...,...
83092,u9028,u9031,m616,do you think she might be interested in someo...
83093,u9028,u9031,m616,choose your targets men that is right watch th...
83094,u9030,u9034,m616,colonel durnford william vereker i hear you h...
83095,u9030,u9034,m616,your orders mr vereker;i am to take the sikali...


In [14]:
dialog_df['dialog'][501]

'i can tell from the tone of your voice dave that you are upset why do not you take a stress pill and get some rest;hal i am in command of this ship i order you to release the manual hibernation control;i am sorry dave but in accordance with subroutine c15324 quote when the crew are dead or incapacitated the computer must assume control unquote i must therefore override your authority now since you are not in any condition to intel ligently exercise it;hal unless you follow my instructions i shall be forced to disconnect you'

In [15]:
dialog_df['movie_ID'][501]

'm3'

### Preparing dataframes

For the character metadata prediction we only need the line and the speaker and not the whole dialog. Then we'll predict the gender and the position on the movie credits based on one line. So let's put in a single dataframe line and character metadata

In [19]:
character_metadata = pd.merge(
    df_lines,
    df_chars,
    left_on='speaker_ID',
    right_on='character_ID',
    how='inner'

)

# Select relevant columns and group by the guide index
character_metadata = character_metadata[['line_ID', 'speaker_ID', 'movie_ID', 'speaker', 'clean_text', 'movie_title','gender','credits_pos']]

# shuffle dataframe
character_metadata = character_metadata.sample(frac=1)
character_metadata

Unnamed: 0,line_ID,speaker_ID,movie_ID,speaker,clean_text,movie_title,gender,credits_pos
207166,L324671,u6110,m405,HOAGIE,if you do not take it we will be swimming in a...,jaws: the revenge,m,4
200197,L288399,u5909,m390,PAUL,time is money george we need extra beer today,hotel rwanda,m,2
304543,L665449,u9022,m615,MEDICAL STUDENT,but your grandfather's work sir,young frankenstein,m,9
173339,L183199,u5016,m333,DEAN,you would,enemy of the state,m,1
281540,L598792,u8321,m565,CAUCHON,silence her take her away,the messenger,M,-1
...,...,...,...,...,...,...,...,...
232882,L428304,u6919,m463,HARDING,did to me,one flew over the cuckoo's nest,m,22
275905,L575214,u8133,m552,FIELDING,deals my client is involved in any number of ...,the limey,?,-1
224692,L391184,u6657,m444,RAYMOND,it's cosmo's moon,moonstruck,m,8
106552,L587993,u3154,m207,LUKE,we were in love were not we,the majestic,m,44


And for the movie metadata prediction we will need the whole dialog. So, let's put the metadata into the dialogs dataframe to make things easier.

In [20]:
movie_metadata = pd.merge(
    dialog_df,
    df_title,
    left_on='movie_ID',
    right_on='movie_ID',
    how='inner'
)

# shuffle dataframe
movie_metadata = movie_metadata.sample(frac=1)
movie_metadata

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog,title,year,IMBD_rating,IMBD_votes,genres
69601,u7588,u7597,m513,tonight;yeah little election party i am giving...,shampoo,1975,6.2,4406,"[drama, romance]"
52441,u5674,u5682,m376,i miss you annie;i know we miss you too;good n...,the horse whisperer,1998,6.4,15953,"[drama, romance, western]"
63925,u6975,u6980,m467,nice disguise;i am a tourist;you at least brin...,out of sight,1998,7.2,38595,"[comedy, crime, romance, thriller]"
50087,u5379,u5389,m357,you are in show business;s'right,the french connection,1971,7.9,34062,"[action, crime, thriller]"
55542,u6031,u6034,m401,you do not know her last name you do not know...,isle of the dead,1945,6.6,1405,"[drama, horror, mystery, thriller]"
...,...,...,...,...,...,...,...,...,...
47771,u5117,u5120,m339,not yet;i could have helped you we coulda made...,escape from l.a.,1996,5.3,23551,"[action, adventure, sci-fi, thriller]"
54517,u5930,u5933,m392,we will destroy youdestroy all of you;no no,house of the damned,1996,4.3,216,"[action, horror, thriller]"
18217,u1957,u1971,m128,you looking good little real clean clean as th...,malcolm x,1992,7.7,23317,"[biography, drama, history]"
39153,u4166,u4174,m278,can i see you later;you can see me now,body of evidence,1993,4.1,6052,"[drama, romance, thriller]"


### Vectorizing

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
vectorizer = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english', max_features=1000)

#### Vectorizing lines

In [24]:
lines_X = vectorizer.fit_transform(character_metadata['clean_text'])
tfidf_df = pd.DataFrame(lines_X.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,able,absolutely,accept,accident,account,act,acting,action,actually,address,...,wrong,wrote,ya,yeah,year,years,yes,yesterday,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304708,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304709,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304710,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304711,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Vectorizing dialogs

Using the vectorizer (computed in 'otherTasks' notebook) we transform de data to have the dialog in a way we can input it to the model

In [26]:
dialog_X = vectorizer.fit_transform(movie_metadata['dialog'])
tfidf_df = pd.DataFrame(dialog_X.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,able,absolutely,accept,accident,account,act,acting,actually,address,admit,...,wrong,wrote,ya,yeah,year,years,yes,yesterday,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.214153,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
83093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
83094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
83095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


## Model Creation

### Character metadata models

**Input:** Line

**Output:**
Character metadata:

* gender
* position on movie credits

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

### Movie metadata models

**Input:** Dialog

**Output:** Movie metadata:

* genres
* release year
* IMDB rating
* number of IMDB votes

We will create a model for each metadata field

**Note:** the following is an example with just a 300 samples to try how it would work

In [28]:
sample_size = 300

In [29]:
y_year = movie_metadata.iloc[0:sample_size]['year']
small_vectorizer_X = vectorizer.fit_transform(movie_metadata.iloc[0:sample_size]['dialog'])

X_train, X_test, y_train, y_test = train_test_split(small_vectorizer_X, y_year, test_size=0.2, random_state=42)

release_year_model = RandomForestRegressor()
release_year_model.fit(X_train, y_train)

accuracy = release_year_model.score(X_test, y_test)
print('Release Year Prediction Accuracy:', accuracy)

Release Year Prediction Accuracy: -0.2290359339346144


The score method returns the coefficient of determination R^2
The values that can take are as follows:

* R^2=1 : Indicates that the model perfectly explains the variance in the test data.
* R^2>0 : Indicates that the model has some predictive power, capturing part of the variance in the data.
* R^2=0 : Indicates that the model explains none of the variance, equivalent to using the mean of the observed values.
* R^2<0 : Indicates that the model is worse than a simple mean, meaning the model is failing to capture the variance in the data.

let's test the prediction now with one random sample from the real **dataset**

In [30]:
random_index = np.random.randint(83097)
example1 = movie_metadata.loc[random_index]['dialog']
predicted_year = release_year_model.predict(vectorizer.transform([example1]))
print(f'Index: {random_index}')
print(f'Line: {example1}')
print(f'Real year: {movie_metadata.loc[random_index]["year"]}')
print("---------------------------------------")
print(f'Predicted year: {predicted_year[0]}')

NameError: name 'np' is not defined

Okay, now let's just repeat that with all the other quantitative metadata fields ()

## Meta data prediction task


### Prediction of movie metadata for specific dialog.

In [None]:
def predict_movie_metadata(dialogue):
    dialogue_vector = vectorizer.transform([dialogue])

    genres = genres_model.predict(dialogue_vector) #todo
    release_year = release_year_model.predict(dialogue_vector)
    imdb_rating = imdb_rating_model.predict(dialogue_vector) #todo
    number_of_votes = votes_model.predict(dialogue_vector) #todo

    return {
        'genres': genres,
        'release_year': release_year,
        'IMDB_rating': imdb_rating,
        'number_of_IMDB_votes': number_of_votes
    }


### Prediction of character metadata for an specific line.

In [None]:
def predict_character_metadata(line):
    line_vector = vectorizer.transform([line])

    gender = gender_model.predict(line_vector) #todo
    credits_pos = credits_pos_model.predict(line_vector) #todo

    return {
        'gender': gender_model,
        'credits_pos': credits_pos
    }


ignore the following, there are just notes for understanding

# Cosas para entender mejor lo que se hace

Ejemplo de como funciona el vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Ejemplo de diálogos
dialogues = [
    "Hello, how are you?",
    "I'm good, thank you! How about you?",
    "I'm fine too, thanks for asking."
]

# Crear un vectorizador TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)

# Transformar los diálogos en una matriz de características TF-IDF
X = vectorizer.fit_transform(dialogues)

# Convertir a un DataFrame para visualizar
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df)

      about       are    asking      fine       for      good     hello  \
0  0.000000  0.562829  0.000000  0.000000  0.000000  0.000000  0.562829   
1  0.411973  0.000000  0.000000  0.000000  0.000000  0.411973  0.000000   
2  0.000000  0.000000  0.447214  0.447214  0.447214  0.000000  0.000000   

        how     thank    thanks       too       you  
0  0.428046  0.000000  0.000000  0.000000  0.428046  
1  0.313316  0.411973  0.000000  0.000000  0.626632  
2  0.000000  0.000000  0.447214  0.447214  0.000000  


La regresión logistica solo se debe usar para clasificacion si las clases son binarias, pero para las variables numericas puede servir (year, rating, etc)

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

In [None]:
def generatePredictionsY(x_train, y_train, x_test):
    # train example 1
    # x = df['dialog'] #input
    # y = df['genre'] #output

    # train example 2
    # x = df['line'] #input
    # y = df['gender'] #output

    model = LogisticRegression(max_iter=1000)
    model.fit(X=np.array(x_train), y=y_train)

    predictions = model.predict(X=np.array(x_test))
    return predictions

The idea is to implement this with cross-validation in order to obtain a prediction for each subtest.

In [None]:
# Example, predict genre:

# train with 10
x_train = movie_metadata.iloc[0:10][['IMBD_rating','IMBD_votes']]
y_train = movie_metadata.iloc[0:10]['year']

# print(x_train)
# print()
# print(y_train)
# print()

# test with 5
x_test = movie_metadata.iloc[10:15][['IMBD_rating','IMBD_votes']]
print(x_test)
print()

p = generatePredictionsY(x_train, y_train, x_test)
print(p)

       IMBD_rating  IMBD_votes
38789          4.5         269
27912          6.4       28682
55300          7.4       80077
67744          7.6        1085
68879          6.1         259

[1993 1993 1993 1993 1993]
