## Load data

In [3]:
import os

from google.colab import drive

# Make Google Drive visible to the Colab runtime
drive.mount('/content/drive')

# Project folder, expressed relative to the Drive root
path = 'Colab Notebooks/NLP/Project'

# Work from inside the project folder so later relative paths resolve there
os.chdir(f'/content/drive/MyDrive/{path}')

# Show the resulting working directory as the cell output
os.getcwd()


Mounted at /content/drive


'/content/drive/MyDrive/Colab Notebooks/NLP/Project'

In [1]:
# Folder (relative to the current working directory) that holds the corpus .txt files
dataset = 'cornell-movie-dialogs-small'

These are the different files found in the dataset; let's see what's inside them.

In [4]:
def _read_corpus_file(file_path, encoding):
    """Return all lines of the text file at ``file_path``, line endings kept."""
    with open(file_path, encoding=encoding) as handle:
        return handle.readlines()


# Utterances and conversations are read as UTF-8 ...
lines = _read_corpus_file(f'{dataset}/movie_lines.txt', 'utf-8')
conversations = _read_corpus_file(f'{dataset}/movie_conversations.txt', 'utf-8')

# ... while the two metadata files are read as Latin-1
titles = _read_corpus_file(f'{dataset}/movie_titles_metadata.txt', 'latin-1')
characters = _read_corpus_file(f'{dataset}/movie_characters_metadata.txt', 'latin-1')

In [None]:
# NOTE(review): this cell repeats the Drive mount done in the first cell of this
# section and has no execution count; it looks like a leftover — confirm and remove.
from google.colab import drive
drive.mount('/content/drive')

The lines and conversations files are connected through the line IDs listed in each conversation.

> According to the dataset documentation, these files contain:
* 220,579 conversational exchanges between 10,292 pairs of movie characters
* involves 9,035 characters from 617 movies
* in total 304,713 utterances

Also on the titles file we can see

> Movie metadata included:
* genres
* release year
* IMDB rating
* number of IMDB votes
* IMDB rating

And this information for the characters file

> Character metadata included:
* gender (for 3,774 characters)
* position on movie credits (3,321 characters)


For easier visualization, we'll load them into pandas dataframes, starting with the lines.


As we saw before, the columns of each line are separated by the string `+++$+++`, so we'll pass that as the separator to the split function. We'll also remove the trailing `\n` from each line.

In [5]:
import re

In [6]:
def clean_text(text): # https://github.com/REDFOX1899/Chatbot/blob/master/Chatbot.py
    """Lower-case ``text``, expand common English contractions and drop punctuation.

    Args:
        text: the raw utterance (a ``str``).

    Returns:
        The normalised string. Apostrophes that are not part of a handled
        contraction (e.g. in "let's") are kept.
    """
    # Rules are applied top to bottom and the order matters: the irregular
    # forms "won't" / "can't" must be expanded BEFORE the generic "n't" rule,
    # otherwise they are rewritten to "wo not" / "ca not" and never match.
    rules = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        # finally remove the punctuation characters listed in the class
        (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
    )

    text = text.lower()
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

In [9]:
import pandas as pd

In [10]:
# create dataframe with lines
df_lines = pd.DataFrame({'line_text': lines})

# split into columns
df_lines = df_lines['line_text'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_lines.columns = ['line_ID', 'speaker_ID', 'movie','speaker','text']

# delete the new line character and any other trailing blank of every field.
# Column-wise `.str.rstrip()` replaces `DataFrame.applymap`, which is deprecated
# since pandas 2.1; it also covers the '\n' that ends the 'text' field.
# NOTE(review): only TRAILING whitespace is removed, so fields that follow a
# separator presumably keep a leading blank (e.g. ' BIANCA') — confirm whether
# later cells rely on that before switching to `.str.strip()`.
df_lines = df_lines.apply(lambda col: col.str.rstrip())

# Add column clean text
df_lines['clean_text'] = df_lines['text'].apply(clean_text)

# add column speaker + text
df_lines['line'] = df_lines['speaker'] + ": " + df_lines['text']

df_lines.head(5)

Unnamed: 0,line_ID,speaker_ID,movie,speaker,text,clean_text,line
0,L1045,u0,m0,BIANCA,They do not!,they do not,BIANCA: They do not!
1,L1044,u2,m0,CAMERON,They do to!,they do to,CAMERON: They do to!
2,L985,u0,m0,BIANCA,I hope so.,i hope so,BIANCA: I hope so.
3,L984,u2,m0,CAMERON,She okay?,she okay,CAMERON: She okay?
4,L925,u0,m0,BIANCA,Let's go.,let's go,BIANCA: Let's go.


Now let's do the same thing with all the other txt files

In [12]:
import ast

In [13]:
# create dataframe with conversations
df_conv = pd.DataFrame({'conv': conversations})

# split into columns
df_conv = df_conv['conv'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_conv.columns = ['speaker1_ID', 'speaker2_ID', 'movie_ID','lines_list']

# delete the new line character and any blank around the list literal:
# ast.literal_eval raises IndentationError on leading blanks before Python 3.10
df_conv['lines_list'] = df_conv['lines_list'].str.strip()

# set lines_list to list type
df_conv['lines_list'] = df_conv['lines_list'].apply(ast.literal_eval)

df_conv.head(5)

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,lines_list
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


In [14]:
# create dataframe with the movie titles metadata
df_title = pd.DataFrame({'title': titles})

# split into columns
df_title = df_title['title'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_title.columns = ['movie_ID','title','year','IMBD_rating','IMBD_votes','genres']

# cast types to what they are
df_title['IMBD_rating'] = df_title['IMBD_rating'].astype(float)
df_title['IMBD_votes'] = df_title['IMBD_votes'].astype(int)
# Clean 'year' column using regex (for cases like ' 1989/I '): \D means "non-digit",
# so everything except the digits is dropped before casting
df_title['year'] = df_title['year'].str.replace(r'\D', '', regex=True).astype(int)

# delete the new line character and any blank around the list literal:
# ast.literal_eval raises IndentationError on leading blanks before Python 3.10
df_title['genres'] = df_title['genres'].str.strip()

# set genres to list type
df_title['genres'] = df_title['genres'].apply(ast.literal_eval)
df_title.head(5)

Unnamed: 0,movie_ID,title,year,IMBD_rating,IMBD_votes,genres
0,m0,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,10421,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,25854,"[action, crime, drama, thriller]"
3,m3,2001: a space odyssey,1968,8.4,163227,"[adventure, mystery, sci-fi]"
4,m4,48 hrs.,1982,6.9,22289,"[action, comedy, crime, drama, thriller]"


In [15]:
# Character metadata: start from one raw text row per character
raw_chars = pd.DataFrame({'characters': characters})

# Break every row on the +++$+++ separator and name the resulting fields
df_chars = raw_chars['characters'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_chars.columns = [
    'Character_ID',
    'name',
    'movie_ID',
    'movie_title',
    'gender',
    'credits_pos',
]

# The last field still ends with the line's new line character; drop it
df_chars['credits_pos'] = df_chars['credits_pos'].str.rstrip('\n')
df_chars.head(5)

Unnamed: 0,Character_ID,name,movie_ID,movie_title,gender,credits_pos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


For easier handling of the conversations and lines, we'll join them together in one dataframe.

In [16]:
# Tag every conversation with its row number so its lines can be regrouped later
df_conv['index'] = df_conv.index

# One row per (conversation, line id), renumbered from zero
expanded_lines = df_conv.explode('lines_list').reset_index(drop=True)

# Look up each line id in df_lines, keep the relevant columns, then collect the
# "speaker: text" strings of every conversation back into a single list
group_keys = ['index', 'speaker1_ID', 'speaker2_ID', 'movie_ID']
merged_df = (
    expanded_lines
    .merge(df_lines, left_on='lines_list', right_on='line_ID', how='inner')
    .loc[:, ['speaker1_ID', 'speaker2_ID', 'movie_ID', 'lines_list', 'index', 'line']]
    .groupby(group_keys)['line']
    .apply(list)
    .reset_index()
    .rename(columns={'line': 'dialog'})
)

# The guide index has done its job; leave it out of the final frame
dialog_df = merged_df.drop(columns=['index'])

dialog_df

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog
0,u0,u2,m0,[ BIANCA: Can we make this quick? Roxanne Ko...
1,u0,u2,m0,[ BIANCA: You're asking me out. That's so cu...
2,u0,u2,m0,"[ BIANCA: No, no, it's my fault -- we didn't ..."
3,u0,u2,m0,"[ CAMERON: Why?, BIANCA: Unsolved mystery. ..."
4,u0,u2,m0,"[ BIANCA: Gosh, if only we could find Kat a b..."
...,...,...,...,...
83092,u9028,u9031,m616,[ COGHILL: Do you think she might be interest...
83093,u9028,u9031,m616,[ COGHILL: Choose your targets men. That's ri...
83094,u9030,u9034,m616,[ VEREKER: Colonel Durnford... William Vereke...
83095,u9030,u9034,m616,"[ DURNFORD: Your orders, Mr Vereker?, VEREKE..."
