In [None]:
import pandas as pd
import numpy as np
import re

# Display settings: never truncate cell contents and show every column when a
# DataFrame is rendered.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [None]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
# import nltk
# nltk.download('punkt')

# Named-entity tagger used by replace_tags. Both the 3-class classifier model and
# the jar are loaded from paths relative to the notebook's working directory.
st = StanfordNERTagger('stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner-4.0.0/stanford-ner.jar',
                       encoding='utf-8')


def replace_tags(text):
    '''Replace named entities in `text` with placeholder tags.

    The text is tokenized with `word_tokenize` and tagged with the module-level
    Stanford tagger `st`. Every token labelled PERSON, LOCATION or ORGANIZATION
    is replaced by '<person>', '<location>' or '<organization>' respectively.
    Literal '<' and '>' characters are removed from the text first, so the only
    angle brackets in the result belong to placeholders.

    The substitution is done in one pass and only on whole tokens, so an entity
    token never rewrites a fragment of a longer word (e.g. 'Al' inside 'Also')
    nor a placeholder inserted for another entity.

    Parameters
    ----------
    text : str
        Raw utterance, with its original casing.

    Returns
    -------
    str
        The utterance with entity tokens replaced by placeholders.
    '''
    # Listed in precedence order: a token tagged with several labels across the
    # utterance gets the first matching placeholder (person, then location,
    # then organization).
    placeholders = {
        'PERSON': '<person>',
        'LOCATION': '<location>',
        'ORGANIZATION': '<organization>',
    }

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)

    placeholder_for_token = {}
    for label, placeholder in placeholders.items():
        for token, token_label in classified_text:
            if token_label == label and token:
                placeholder_for_token.setdefault(token, placeholder)

    text = text.replace('<', '').replace('>', '')
    if not placeholder_for_token:
        return text

    # Longest tokens first so the alternation prefers the most specific match.
    alternatives = sorted(placeholder_for_token, key=len, reverse=True)
    # Lookarounds instead of \b because a token may start or end with punctuation.
    entity_pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(token) for token in alternatives) + r')(?!\w)'
    )
    return entity_pattern.sub(lambda match: placeholder_for_token[match.group(0)], text)

In [None]:
# Ordered (pattern, replacement) rules applied by clean_text to the lower-cased
# text while apostrophes are still present. Order matters: specific contractions
# come before the generic "n't" / "n'" rules, and the character stripping is last.
_CLEAN_TEXT_RULES = (
    (r"[\(\[].*?[\)\]]", ""),  # exclude content between () and []
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"there's", "there is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r" re ", " are "),
    (r"\'d", " would"),
    (r"\'em", " them"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"'bout", "about"),
    (r"'til", "until"),
    (r"'cuz", "because"),
    (r"'cos", "because"),
    (r"wanna", "want to"),
    (r"d'you", "do you"),
    (r"d'ya", "do you"),
    (r"n'", "ng"),
    (r"gimme", "give me"),
    (r"lemme", "let me"),
    (r"gonna", "going to"),
    (r"\bya\b", "you"),  # whole word only, so "yard" or "maya" are left alone
    (r"\x96", ""),
    (r"\x91", ""),
    (r"you�re", "you are"),
    (r"don�t", "do not"),
    (r"it�s", "it is"),
    (r"i�m", "i am"),
    (r"that�s", "that is"),
    (r"what�s", "what is"),
    (r"didn�t", "did not"),
    (r"he�s", "he is"),
    (r"�", ""),
    (r"kinda", "kind of"),
    (r"sorta", "sort of"),
    (r"outta", "out of"),
    (r"dunno", "do not know"),
    (r"betcha", "bet you"),
    (r"gotcha", "got you"),
    (r"coulda", "could have"),
    (r"woulda", "would have"),
    (r"cuppa", "cup of"),
    (r"whassup", "what is up"),
    (r"-", " "),
    (r"[-¯()\"#/@;:<>{}`+=~|,']", ""),  # remove special characters. keep punctuation (.!?)
)

# Rules for apostrophe-less spellings. They run after the apostrophes have been
# stripped so that "c'mon", "ma'am" and "y'know" are expanded as well.
_CLEAN_TEXT_POST_RULES = (
    (r"cmon", "come on"),
    (r"maam", "madam"),
    (r"yknow", "you know"),
    (r" {2,}", " "),  # collapse any run of spaces to a single space
)


def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.

    The text is lower-cased, content between () or [] is dropped, contractions
    and slang spellings are expanded, stray mis-encoded characters are removed,
    hyphens become spaces, special characters are stripped (sentence punctuation
    . ! ? is kept) and runs of spaces are collapsed to one.

    Parameters
    ----------
    text : str
        Raw utterance.

    Returns
    -------
    str
        The normalized utterance. Leading/trailing spaces are not trimmed.
    '''
    text = text.lower()

    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    for pattern, replacement in _CLEAN_TEXT_POST_RULES:
        text = re.sub(pattern, replacement, text)

    return text

#### Seinfeld

In [None]:
# Source: https://data.world/juanjosecas/seinfeld-scripts
# NOTE(review): in the visible cells `seinfeld` is only loaded; no cleaning
# function consumes it and it is not part of the final dataset.
seinfeld = pd.read_csv('comedy_data/seinfeld_scripts.csv')

#### Friends

In [None]:
from convokit import Corpus, download
# Load the ConvoKit corpus named "friends-corpus"; `download` supplies the path
# handed to Corpus as `filename`.
corpus = Corpus(filename=download("friends-corpus"))

# Utterance-level table; clean_dataset_friends reads its `text` and `reply_to`
# columns and turns its index into an `id` column.
corpus_df = corpus.get_utterances_dataframe()
corpus_df.shape

In [None]:
def count_words(series):
    '''Count the whitespace-separated words in each string of `series`.

    Splitting is done on runs of whitespace, so leading, trailing or repeated
    spaces do not inflate the count and an empty string counts as 0 words.
    Missing values count as 0 words.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    pandas.Series of int
        Word counts, aligned with the index of `series`.
    '''
    return series.str.split().str.len().fillna(0).astype(int)

def build_reply(df, scene_key, line_col):
    '''Pair every line with the line that follows it in the same scene.

    Parameters
    ----------
    df : pandas.DataFrame
        Dialogue lines; rows of one scene must already be in speaking order.
    scene_key : str
        Column identifying the scene (the grouping key).
    line_col : str
        Column holding the text of the line.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added `text_reply` column holding the next line of
        the same scene. Rows without a reply (e.g. the last line of a scene)
        are dropped. The input frame is not modified.
    '''
    following_line = df.groupby(scene_key)[line_col].shift(-1)
    paired = df.assign(text_reply=following_line)
    return paired.dropna(subset=['text_reply'])
    
def filter_long_dialogues(df, line_col_len, reply_col_len, thr):
    '''Keep the pairs whose line and reply both have between 2 and `thr` words.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs with precomputed word counts.
    line_col_len, reply_col_len : str
        Columns holding the word count of the line and of its reply.
    thr : int
        Maximum number of words allowed (inclusive).

    Returns
    -------
    pandas.DataFrame
        The rows where both counts are > 1 and <= thr, with the original index
        and columns. The input frame is not modified.
    '''
    word_counts = df[[line_col_len, reply_col_len]]
    within_bounds = ((word_counts > 1) & (word_counts <= thr)).all(axis=1)
    return (df.assign(length_is_ok=within_bounds)
              .loc[lambda frame: frame.length_is_ok]
              .drop(columns=['length_is_ok']))

In [None]:
def clean_dataset_friends(df, thr):
    '''Turn the Friends utterance table into cleaned (line, reply) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Utterance table with `text` and `reply_to` columns and an index that
        becomes the `id` column after `reset_index` (assumed to be named 'id'
        by the corpus loader - TODO confirm).
    thr : int
        Maximum number of words allowed in a line and in its reply.

    Returns
    -------
    pandas.DataFrame
        Columns: show, line_id, text, text_reply, line_text_len, line_reply_len.
    '''
    utterances = df.copy()

    # Entities are tagged first, on the original casing; clean_text then lower-cases.
    # NOTE(review): clean_text strips '<' and '>', so the placeholders end up as
    # the bare words "person" / "location" / "organization" - confirm this is intended.
    utterances['text'] = utterances['text'].apply(replace_tags).apply(clean_text)

    # Every utterance that answers another one yields (id of the answered line, own text).
    replies = (utterances[['reply_to', 'text']]
               .rename(columns = {'reply_to': 'id', 'text': 'text_reply'})
               .reset_index(drop = True)
               .dropna())

    # Attach to each line the text of the utterance(s) replying to it.
    pairs = (utterances.reset_index()
             .dropna(subset = ['text'])
             .merge(replies, on = 'id', how = 'inner'))

    # Word counts of line and reply, then the length filter.
    pairs['line_text_len'] = count_words(pairs['text'])
    pairs['line_reply_len'] = count_words(pairs['text_reply'])
    pairs = filter_long_dialogues(pairs, 'line_text_len', 'line_reply_len', thr)

    pairs['show'] = 'friends'
    pairs = pairs.rename(columns = {'id': 'line_id'})

    return pairs[['show', 'line_id', 'text', 'text_reply', 'line_text_len', 'line_reply_len']]
    

In [None]:
# Build the Friends pairs, keeping lines and replies of at most 20 words.
friends_clean = clean_dataset_friends(corpus_df, thr = 20)
friends_clean.shape

In [None]:
# Preview the first cleaned Friends pairs.
friends_clean.head()

#### The Office

In [None]:
# Source: https://data.world/abhinavr8/the-office-scripts-dataset
# clean_dataset_office reads the season, episode, scene and line_text columns.
the_office = pd.read_csv('comedy_data/the_office_scripts.csv')
    

def clean_dataset_office(df, thr):
    '''Turn The Office script lines into cleaned (line, reply) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Script lines with `season`, `episode`, `scene` and `line_text` columns,
        in speaking order within each scene.
    thr : int
        Maximum number of words allowed in a line and in its reply.

    Returns
    -------
    pandas.DataFrame
        Columns: show, season, episode, scene, scene_key, dialogue_order,
        line_id, text, text_reply, line_text_len, line_reply_len.
        Pairs whose line or reply contained the corrupted-character marker
        '���' in the raw text are excluded.
    '''
    df_ = df.copy()

    # Missing lines become empty strings (as in the other cleaners) so that
    # clean_text always receives a str.
    df_['line_text'] = df_['line_text'].fillna('')

    # Flag corrupted lines on the RAW text: clean_text removes every '�', so the
    # marker cannot be detected once the text has been cleaned.
    df_['has_typo'] = df_.line_text.str.contains('���', regex = False)

    # clean and uniformize text
    df_.line_text = df_.line_text.apply(clean_text)

    # create key for scenes
    df_['scene_key'] = ('s' + df_.season.astype(str).str.zfill(2) + '_' +
                        'e' + df_.episode.astype(str).str.zfill(2) + '_' +
                        'c' + df_.scene.astype(str).str.zfill(2))

    # create index for dialogue order
    df_['dialogue_order'] = df_.groupby('scene_key')['line_text'].cumcount()

    # create key for dialogue utterance
    df_['line_id'] = df_['scene_key'] + '_u' + df_.dialogue_order.astype(str).str.zfill(2)

    # the reply of a line is the next line of the scene, so its typo flag is
    # the next line's flag (False when there is no next line)
    df_['reply_has_typo'] = (df_.groupby('scene_key')['has_typo']
                             .shift(-1, fill_value = False)
                             .astype(bool))

    # build responses for each dialogue line (response is next dialogue)
    df_ = build_reply(df_, 'scene_key', 'line_text')

    # remove dialogues with typos
    df_ = df_[~(df_.has_typo | df_.reply_has_typo)].reset_index(drop = True)

    # calculate number of words in each line and response
    df_['line_text_len'] = count_words(df_.line_text)
    df_['line_reply_len'] = count_words(df_.text_reply)

    # filter out the dialogues with lengths that exceed the threshold
    df_ = filter_long_dialogues(df_, 'line_text_len', 'line_reply_len', thr)

    df_['show'] = 'the_office'

    # select relevant columns
    columns = [
        'show',
        'season',
        'episode',
        'scene',
        'scene_key',
        'dialogue_order',
        'line_id',
        'line_text',
        'text_reply',
        'line_text_len',
        'line_reply_len'
    ]

    return df_[columns].rename(columns = {'line_text':'text'})
    
    
# Build The Office pairs, keeping lines and replies of at most 20 words.
the_office_clean = clean_dataset_office(the_office, thr = 20)
the_office_clean.shape

In [None]:
# Preview the first cleaned The Office pairs.
the_office_clean.head()

#### How I Met Your Mother (HIMYM)

In [None]:
# NOTE(review): in the visible cells `himym` is only loaded and previewed; no
# cleaning function consumes it and it is not part of the final dataset.
himym = pd.read_csv('comedy_data/himym.csv')

In [None]:
# Preview the last rows of the raw table.
himym.tail()

#### Gilmore Girls

In [None]:
# Raw Gilmore Girls script lines; clean_dataset_gg reads the episode, scene,
# dialogues, sequence and line columns.
gg = pd.read_csv('comedy_data/gilmore_girls.csv')


def clean_dataset_gg(df, thr):
    '''Turn the Gilmore Girls script lines into cleaned (line, reply) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Script lines with `episode`, `scene`, `dialogues`, `sequence` and `line`
        columns. `episode` holds codes parsed as '<season>x<episode>', where
        anything after a '-' in the episode part is ignored.
    thr : int
        Maximum number of words allowed in a line and in its reply.

    Returns
    -------
    pandas.DataFrame
        Columns: show, season, episode, scene, scene_key, dialogue_order,
        line_id, text, text_reply, line_text_len, line_reply_len.
    '''
    def season_of(code):
        return int(code.split('x')[0])

    def episode_of(code):
        return int(code.split('x')[1].split('-')[0])

    def two_digits(column):
        # zero-padded string version of a column, used to build sortable keys
        return df_[column].astype(str).str.zfill(2)

    # The original `scene` column is discarded; `dialogues` takes over its name
    # and `sequence` becomes the within-scene order.
    df_ = (df.copy()
             .drop('scene', axis = 1)
             .rename(columns = {'dialogues': 'scene', 'sequence': 'dialogue_order'}))

    episode_code = df_['episode']
    df_['season'] = episode_code.apply(season_of)
    df_['episode'] = episode_code.apply(episode_of)

    # Missing lines become empty strings before cleaning.
    df_['text'] = df_['line'].fillna('').apply(clean_text)

    # Keys identifying the scene and the utterance inside it.
    df_['scene_key'] = ('s' + two_digits('season') +
                        '_e' + two_digits('episode') +
                        '_c' + two_digits('scene'))
    df_['line_id'] = df_['scene_key'] + '_u' + two_digits('dialogue_order')

    # Speaking order is required before pairing each line with the next one.
    df_ = df_.sort_values(by = ['season', 'episode', 'scene', 'dialogue_order'])
    df_ = build_reply(df_, 'scene_key', 'text')

    # Word counts of line and reply, then the length filter.
    df_['line_text_len'] = count_words(df_['text'])
    df_['line_reply_len'] = count_words(df_['text_reply'])
    df_ = filter_long_dialogues(df_, 'line_text_len', 'line_reply_len', thr)

    df_['show'] = 'gilmore_girls'

    return df_[['show', 'season', 'episode', 'scene', 'scene_key', 'dialogue_order',
                'line_id', 'text', 'text_reply', 'line_text_len', 'line_reply_len']]
    
    
# Build the Gilmore Girls pairs, keeping lines and replies of at most 20 words.
gg_clean = clean_dataset_gg(gg, thr = 20)
gg_clean.shape

In [None]:
# Preview the first cleaned Gilmore Girls pairs.
gg_clean.head()

#### Cornell Movie-Dialogs Corpus

In [None]:
import ast
import csv

# Field separator of the corpus files (" +++$+++ "), written as a raw-string
# regex so the backslashes are not treated as (invalid) string escapes.
CORNELL_SEP = r" \+\+\+\$\+\+\+ "

movies = pd.read_csv("cornell_movie_dialogs_corpus/movie_titles_metadata.txt", sep=CORNELL_SEP,
                     engine="python", header=None, names = ["movie_id","title","year","imdb_rating","imdb_votes","genres"])
# QUOTE_NONE: quote characters inside the dialogue text are kept as plain text.
text = pd.read_csv("cornell_movie_dialogs_corpus/movie_lines.txt", sep=CORNELL_SEP,
                   engine="python", header=None, names = ["line_id","char_id","movie_id","char_name","text"],
                   quoting=csv.QUOTE_NONE)
struct = pd.read_csv("cornell_movie_dialogs_corpus/movie_conversations.txt", sep=CORNELL_SEP,
                     engine="python", header=None, names = ["char_id_1","char_id_2","movie_id","utterances"])


# One conversation per row; the row number becomes the `dialogue` id.
struct_proc = struct.reset_index().rename(index=str, columns={"index":"dialogue"})
# `utterances` is stored as the text of a Python list of line ids; parse it as a
# literal instead of evaluating file contents as code.
struct_proc.utterances = struct_proc.utterances.apply(ast.literal_eval)
# Unroll each list into one row per (dialogue, position in dialogue, line_id).
s = struct_proc.utterances.apply(lambda x: pd.Series(x)).stack().reset_index(level=[0,1])
s = s.rename(index=str,columns={"level_0":"dialogue", "level_1":"dialogue_order", 0:"line_id"})
# The dialogue ids come back as strings (the index was cast to str above).
s.dialogue = s.dialogue.astype(int)
struct_proc = pd.merge(struct_proc.drop("utterances",axis=1),s,how="right",on="dialogue")


# MERGE DATA
# Attach the text of each line to its position in the conversation.
cornell_movie = pd.merge(struct_proc,text,on=["line_id", "movie_id"],how="inner")
print(cornell_movie.shape)

In [None]:
def clean_dataset_cornell(df, thr):
    '''Turn the merged Cornell movie lines into cleaned (line, reply) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per utterance with `dialogue`, `dialogue_order`, `movie_id`,
        `line_id` and `text` columns.
    thr : int
        Maximum number of words allowed in a line and in its reply.

    Returns
    -------
    pandas.DataFrame
        Columns: show, dialogue, movie_id, dialogue_order, line_id, text,
        text_reply, line_text_len, line_reply_len.
    '''
    df_ = df.copy()

    # clean and uniformize text
    df_['text'] = df_['text'].fillna('')
    df_['text'] = df_.text.apply(clean_text)

    # build responses for each dialogue line (response is next dialogue)
    # Sort on the position inside the dialogue as well: build_reply pairs each
    # row with the NEXT row of its group, and sorting on `dialogue` alone does
    # not guarantee the order of the utterances within a dialogue.
    df_ = df_.sort_values(by = ['dialogue', 'dialogue_order'])
    df_ = build_reply(df = df_, scene_key = 'dialogue', line_col = 'text')

    # calculate number of words in each line and response
    df_['line_text_len'] = count_words(df_.text)
    df_['line_reply_len'] = count_words(df_.text_reply)

    # filter out the dialogues with lengths that exceed the threshold
    df_ = filter_long_dialogues(df_, 'line_text_len', 'line_reply_len', thr)

    df_['show'] = 'cornell_movies'

    # select relevant columns
    columns = [
        'show',
        'dialogue',
        'movie_id',
        'dialogue_order',
        'line_id',
        'text',
        'text_reply',
        'line_text_len',
        'line_reply_len'
    ]

    return df_[columns]

In [None]:
# Build the Cornell pairs, keeping lines and replies of at most 20 words.
cornell_movie_clean = clean_dataset_cornell(cornell_movie, 20)
cornell_movie_clean.shape

In [None]:
# Preview of the merged, uncleaned Cornell table.
# NOTE(review): the other sections preview their *_clean frame - confirm whether
# cornell_movie_clean was intended here.
cornell_movie.head()

### Create final dataset

In [None]:
# Columns shared by every cleaned dataset.
cols = ['show', 'line_id', 'text', 'text_reply', 'line_text_len', 'line_reply_len']

# Stack the per-source pair tables on their common columns, with a fresh
# 0..n-1 index.
dataset = pd.concat(
    [pairs[cols] for pairs in (friends_clean, the_office_clean, gg_clean, cornell_movie_clean)],
    axis = 0,
    ignore_index = True,
)

dataset.shape

In [None]:
# Persist the combined pairs; the index is not written.
dataset.to_csv('comedy_data/dataset.csv', index = False)

In [None]:
def extra_clean(text):
    '''Remove ellipses and detach sentence punctuation from the preceding word.

    Every '...' is deleted, then a space is inserted in front of each remaining
    '.', '!' and '?'.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    '''
    spaced = text.replace("...", "")
    for mark in (".", "!", "?"):
        spaced = spaced.replace(mark, " " + mark)
    return spaced

# Work on a copy so `dataset` itself is left untouched; `text_1` holds the line
# with ellipses removed and punctuation detached.
dialogues = dataset.copy()
dialogues['text_1'] = dialogues['text'].apply(extra_clean)

In [None]:
# Preview the pairs together with the extra-cleaned `text_1` column.
dialogues.head()

In [None]:
# Reload the saved dataset from disk.
# NOTE(review): pandas is already imported in the first cell, and `a` is not
# used again in the visible cells.
import pandas as pd
a = pd.read_csv('comedy_data/dataset.csv')