## Load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os

path = 'Project'

os.chdir(f'/content/drive/MyDrive/{path}')
os.getcwd()


Mounted at /content/drive


'/content/drive/.shortcut-targets-by-id/1jzv_GqqZ-2sGmQP_jDNHFFIWqV5jbsgO/Project'

In [None]:
import tensorflow as tf

import numpy as np
import os
import time
import re
import pandas as pd
import ast

Choose whether to use the whole dataset or just a small part of it

In [None]:
dataset = 'cornell-movie-dialogs-small'
# dataset = 'cornell-movie-dialogs'

These are the different files found in the dataset, let's see what's inside of them

In [None]:
with open(f'{dataset}/movie_lines.txt', encoding='utf-8') as f:
    lines = f.readlines()

with open(f'{dataset}/movie_conversations.txt', encoding='utf-8') as f:
    conversations = f.readlines()

with open(f'{dataset}/movie_titles_metadata.txt', encoding='latin-1') as f:
    titles = f.readlines()

with open(f'{dataset}/movie_characters_metadata.txt', encoding='latin-1') as f:
    characters = f.readlines()

The lines and conversation files that are conected with an ID of the line in the conversation.

> According to the dataset documentation these file contains
* 220,579 conversational exchanges between 10,292 pairs of movie characters
* involves 9,035 characters from 617 movies
* in total 304,713 utterances

Also on the titles file we can see

> Movie metadata included:
* genres
* release year
* IMDB rating
* number of IMDB votes
* IMDB rating

And this information for the characters file

> Character metadata included:
* gender (for 3,774 characters)
* position on movie credits (3,321 characters)


For easier visualization, we'll load the Loading them into a pandas dataframe, starting with the lines.


As we saw before, each column of the dataset is splitted by the string `+++$+++` then we'll set that as a parameter for the split function. Also we'll delete the last `\n` symbol from the line

In [None]:
def clean_text(text): # https://github.com/REDFOX1899/Chatbot/blob/master/Chatbot.py
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r'[^\w\s]', '', text)
    return text



In [None]:
# create dataframe with lines
df_lines = pd.DataFrame({'line_text': lines})

# split into columns
df_lines = df_lines['line_text'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_lines.columns = ['line_ID', 'speaker_ID', 'movie','speaker','text']

# delete new line character and other blank character
df_lines = df_lines.applymap(lambda x: x.rstrip() if isinstance(x, str) else x)
df_lines['text'] = df_lines['text'].str.rstrip('\n')

# Add column clean text
df_lines['clean_text'] = df_lines['text'].apply(lambda x: clean_text(x))

# add column speaker + text
df_lines['line'] = df_lines['speaker'] + ": " + df_lines['text']

df_lines['clean_line'] = df_lines['speaker'] + ": " + df_lines['clean_text']

df_lines.head(5)

Unnamed: 0,line_ID,speaker_ID,movie,speaker,text,clean_text,line,clean_line
0,L1045,u0,m0,BIANCA,They do not!,they do not,BIANCA: They do not!,BIANCA: they do not
1,L1044,u2,m0,CAMERON,They do to!,they do to,CAMERON: They do to!,CAMERON: they do to
2,L985,u0,m0,BIANCA,I hope so.,i hope so,BIANCA: I hope so.,BIANCA: i hope so
3,L984,u2,m0,CAMERON,She okay?,she okay,CAMERON: She okay?,CAMERON: she okay
4,L925,u0,m0,BIANCA,Let's go.,lets go,BIANCA: Let's go.,BIANCA: lets go


Now let's do the same thing with all the other txt files

In [None]:
# create dataframe with conversations
df_conv = pd.DataFrame({'conv': conversations})

# split into columns
df_conv = df_conv['conv'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_conv.columns = ['speaker1_ID', 'speaker2_ID', 'movie_ID','lines_list']

# delete new line character
df_conv['lines_list'] = df_conv['lines_list'].str.rstrip('\n')

# set lines_list to list type
df_conv['lines_list'] = df_conv['lines_list'].apply(ast.literal_eval)

df_conv.head(5)

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,lines_list
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


In [None]:
# create dataframe with titles
df_title = pd.DataFrame({'title': titles})

# split into columns
df_title = df_title['title'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_title.columns = ['movie_ID','title','year','IMBD_rating','IMBD_votes','genres']

# cast types to what they are
df_title['IMBD_rating'] = df_title['IMBD_rating'].astype(float)
df_title['IMBD_votes'] = df_title['IMBD_votes'].astype(int)
# Clean 'year' column using regex (for cases like ' 1989/I ')
df_title['year'] = df_title['year'].apply(lambda x: re.sub(r'\D', '', x))  # \D significa "no dígito"
df_title['year'] = df_title['year'].astype(int)

# delete new line character
df_title['genres'] = df_title['genres'].str.rstrip('\n')

# set genres_list to list type
df_title['genres'] = df_title['genres'].apply(ast.literal_eval)
df_title.head(5)

Unnamed: 0,movie_ID,title,year,IMBD_rating,IMBD_votes,genres
0,m0,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,10421,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,25854,"[action, crime, drama, thriller]"
3,m3,2001: a space odyssey,1968,8.4,163227,"[adventure, mystery, sci-fi]"
4,m4,48 hrs.,1982,6.9,22289,"[action, comedy, crime, drama, thriller]"


In [None]:
# create dataframe with conversations
df_chars = pd.DataFrame({'characters': characters})

# split into columns
df_chars = df_chars['characters'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_chars.columns = ['Character_ID','name','movie_ID','movie_title','gender','credits_pos']

# delete new line character
df_chars['credits_pos'] = df_chars['credits_pos'].str.rstrip('\n')
df_chars.head(5)

Unnamed: 0,Character_ID,name,movie_ID,movie_title,gender,credits_pos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


For easier handle of the conversations and lines we'll join them together in one dataframe

In [None]:
# Create a guide index
df_conv['index'] = df_conv.index
# Expand list into all sub item lines
expanded_lines = df_conv.explode('lines_list')
# Reset index of expanded df
expanded_lines.reset_index(drop=True, inplace=True)
# merge the line with line ids
merged_df = pd.merge(
    expanded_lines,
    df_lines,
    left_on='lines_list',
    right_on='line_ID',
    how='inner'
)

# Select relevant columns and group by the guide index
merged_df = merged_df[['speaker1_ID', 'speaker2_ID', 'movie_ID', 'lines_list', 'index', 'clean_line']]
merged_df = merged_df.groupby(['index','speaker1_ID', 'speaker2_ID', 'movie_ID'])['clean_line'].apply(list).reset_index()
merged_df = merged_df.rename(columns={'clean_line': 'dialog'})
dialog_df = merged_df.drop(['index'], axis=1)

dialog_df

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog
0,u0,u2,m0,[ BIANCA: can we make this quick roxanne kor...
1,u0,u2,m0,[ BIANCA: you are asking me out that is so c...
2,u0,u2,m0,[ BIANCA: no no its my fault we did not have...
3,u0,u2,m0,"[ CAMERON: why, BIANCA: unsolved mystery s..."
4,u0,u2,m0,[ BIANCA: gosh if only we could find kat a bo...
...,...,...,...,...
83092,u9028,u9031,m616,[ COGHILL: do you think she might be interest...
83093,u9028,u9031,m616,[ COGHILL: choose your targets men that is ri...
83094,u9030,u9034,m616,[ VEREKER: colonel durnford william vereker i...
83095,u9030,u9034,m616,"[ DURNFORD: your orders mr vereker, VEREKER:..."


In [None]:
dialog_df["dialog"][85]

[' CAMERON:  and he means that strictly in a non prisonmovie type of way',
 ' PATRICK:  yeah  we will see']

In [None]:
###############################################333

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m174.1/302.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [None]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
import re

def preprocess_line(dialogue):
    # Remove leading and trailing whitespaces
    dialogue = dialogue.strip()

    # Remove any extra whitespaces within the dialogue
    dialogue = re.sub(' +', ' ', dialogue)

    # Separate the speaker and dialogue text
    speaker, text = dialogue.split(':')

    # Remove leading and trailing whitespaces from speaker and text
    speaker = speaker.strip()
    text = text.strip()

    return speaker, text

def preprocess_dialogue_data(dialogue_data):
    preprocessed_data = []
    for dialogues in dialogue_data:
        preprocessed_dialogues = []
        for dialogue in dialogues:
            speaker, text = preprocess_line(dialogue)
            preprocessed_dialogues.append((speaker, text))
        preprocessed_data.append(preprocessed_dialogues)
    return preprocessed_data

# Sample dialogue data
dialogue_data = dialog_df['dialog'].tolist()

# Preprocess the dialogue data
preprocessed_dialogue_data = preprocess_dialogue_data(dialogue_data)

In [None]:
len(preprocessed_dialogue_data)

83097

In [None]:
data = []

idx = 0
for dialogues in preprocessed_dialogue_data[:1000]:
    for i in range(len(dialogues)-1):
        #data.append([idx, {"line": f"{dialogues[i][1]}", "response": f"{dialogues[i+1][1]}"}])
        data.append([idx, dialogues[i][0]+": "+dialogues[i][1],dialogues[i+1][0]+": "+dialogues[i+1][1]])

        idx += 1

# Create a DataFrame from the data
#df = pd.DataFrame(data, columns=['idx', 'dialog'])
df = pd.DataFrame(data, columns=['idx', 'line', 'response'])

df.to_pickle("my_data.pkl")

from datasets import load_dataset
raw_datasets = load_dataset("pandas", data_files="my_data.pkl")
raw_datasets



Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'line', 'response'],
        num_rows: 2488
    })
})

In [None]:
df

Unnamed: 0,idx,line,response
0,0,BIANCA: can we make this quick roxanne korrine...,CAMERON: well i thought we would start with pr...
1,1,CAMERON: well i thought we would start with pr...,BIANCA: not the hacking and gagging and spitti...
2,2,BIANCA: not the hacking and gagging and spitti...,CAMERON: okay then how bout we try out some fr...
3,3,BIANCA: you are asking me out that is so cute ...,CAMERON: forget it
4,4,BIANCA: no no its my fault we did not have a p...,CAMERON: cameron
...,...,...,...
2483,2483,RICK: kincaid and joey died last night,ALICE: what
2484,2484,RICK: you alright,ALICE: kristen
2485,2485,RICK: i heard you screaming was it a bad one,ALICE: it was bad
2486,2486,ALICE: it was bad,RICK: does not the dream master work for you a...


In [None]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")
split_datasets

DatasetDict({
    train: Dataset({
        features: ['idx', 'line', 'response'],
        num_rows: 2239
    })
    validation: Dataset({
        features: ['idx', 'line', 'response'],
        num_rows: 249
    })
})

In [None]:
split_datasets["train"][100]

{'idx': 199, 'line': 'KAT: away', 'response': 'JOEY: your sister here'}

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
line = split_datasets["train"][0]["line"]
response = split_datasets["train"][0]["response"]

inputs = tokenizer(line)

tokenizer.decode(inputs["input_ids"])

'[CLS] CORNELIUS : imagine for a moment that this thing is not anything that can be identified because it prefers not to be because it is the antithesis of all we are because it is evil total evil [SEP]'

In [None]:
inputs = tokenizer(
    line,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [None]:
max_length = 64
stride = 32


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["line"]]
    inputs = tokenizer(
        questions,
        max_length=max_length,
        truncation = True,
        #truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["response"]
    start_positions = [] ########################################################
    end_positions = [] ############################################

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        sequence_ids = inputs.sequence_ids(i)
        # If the answer is not fully inside the context, label is (0, 0) ############################
        start_positions.append(0)
        end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
print(split_datasets["train"].column_names)
print(split_datasets["train"][0])

['idx', 'line', 'response']
{'idx': 1794, 'line': 'CORNELIUS: imagine for a moment that this thing is not anything that can be identified because it prefers not to be because it is the antithesis of all we are because it is evil total evil', 'response': 'PRESIDENT: one more reason to shoot first eh'}


In [None]:
split_datasets["train"][0]

{'idx': 1794,
 'line': 'CORNELIUS: imagine for a moment that this thing is not anything that can be identified because it prefers not to be because it is the antithesis of all we are because it is evil total evil',
 'response': 'PRESIDENT: one more reason to shoot first eh'}

In [None]:
train_dataset = split_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)
len(split_datasets["train"]), len(train_dataset)

Map:   0%|          | 0/2239 [00:00<?, ? examples/s]

(2239, 2268)

In [None]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["line"]]
    inputs = tokenizer(
        questions,
        max_length=max_length,
        #truncation="only_second",
        truncation = True,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["idx"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)]

    inputs["example_id"] = example_ids
    return inputs

validation_dataset = split_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=split_datasets["validation"].column_names,)
len(split_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

(249, 250)

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=max_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2").to(device)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
)
#args = TrainingArguments(
#    output_dir="codeparrot-ds",
#    evaluation_strategy="no",
#    learning_rate=2e-5,
#    logging_steps=20,
#    num_train_epochs=10,
#    weight_decay=0.01,
#)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=80, training_loss=2.931747055053711, metrics={'train_runtime': 316.1415, 'train_samples_per_second': 71.74, 'train_steps_per_second': 0.253, 'total_flos': 667731787776000.0, 'train_loss': 2.931747055053711, 'epoch': 9.014084507042254})

In [None]:
#predictions, _, _ = trainer.predict(validation_dataset)

In [None]:
# Input text
input_text = "ROMEO: what do you think when you look at the stars?"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(input_ids, num_return_sequences=1, temperature=0.7)  #high temperature = high randomness

for output in outputs:
    generated_text = tokenizer.decode(output, skip_special_tokens=True)
    print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: ROMEO: what do you think when you look at the stars? They're really big enough to


In [None]:
# Input text
input_text = "BOB: you are crazy"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(input_ids, num_return_sequences=1, temperature=0.7)

for output in outputs:
    generated_text = tokenizer.decode(output, skip_special_tokens=True)
    print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: BOB: you are crazy enough to understand the power of your life itself.

BEN


In [None]:
# Input text
input_text = "SARA: Yeah lets see the new movie"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(input_ids, num_return_sequences=1, temperature=0.7)

for output in outputs:
    generated_text = tokenizer.decode(output, skip_special_tokens=True)
    print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: SARA: Yeah lets see the new movie.

How do you feel about the movie itself


In [None]:
model_id = 'gpt2-medium'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda")
