In [None]:
# Mount Google Drive at /content/drive/ so the CSV files read and written by
# the cells below (all under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Prepare Shakespeare Data

Link for the Shakespeare dataset: https://www.kaggle.com/datasets/garnavaurha/shakespearify


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the parallel corpus, discard rows containing NaN, and rename the two
# text columns: "og" becomes "Output" and "t" becomes "Input".
shakespeare_data = (
    pd.read_csv('/content/drive/MyDrive/685 Project/shakespeare_parallel.csv')
    .dropna()
    .rename(columns={"og": "Output", "t": "Input"})
)

shakespeare_data.head()

Unnamed: 0.1,Unnamed: 0,id,Output,Input
0,0,42928-1500614319216-63344,You do not meet a man but frowns:,Every man you meet these days is frowning.
1,1,42928-1500614326583-89821,our bloods No more obey the heavens than our...,Our bodies are in agreement with the planetar...
2,2,A-63849,But what's the matter?,What's wrong?
3,3,42930-1500614347266-80123,"His daughter, and the heir of's kingdom, whom...","The king wanted his daughter, the only heir to..."
4,4,42930-1500614355280-38326,she's wedded; Her husband banish'd; she impr...,"She's married, her husband is banished, she's..."


In [None]:
# utility functions

import re
def removal_chars_links(line):
  """Strip links and @-mentions from one line of text.

  Args:
    line: the string to clean.

  Returns:
    The string with every "http..." run (up to the next whitespace) removed,
    and every whitespace-delimited token containing "@" removed together
    with at most one whitespace character that follows it.
  """
  patterns = (
      r"http\S+",      # "http" plus all following non-whitespace characters
      r"\S*@\S*\s?",   # any token containing "@", and one trailing whitespace
  )
  # Order matters: links are removed before mentions, as in a two-step pass.
  for pattern in patterns:
    line = re.sub(pattern, "", line)
  return line

def remove_min_len(dataset, len, cols = 'content'):
  """Drop every row that has `len` words or fewer in ANY of the given columns.

  Words are counted by splitting each cell on whitespace. Cells that are
  NaN never compare as short, so such rows are kept.

  Args:
    dataset: DataFrame whose `cols` columns hold strings.
    len: word-count threshold; a row is removed when a checked column has
      `<= len` words. (The name shadows the builtin; it is kept so existing
      keyword callers continue to work.)
    cols: a single column name, or an iterable of column names.

  Returns:
    A new DataFrame without the short rows. `dataset` itself is not modified.
  """
  # A bare string must be treated as ONE column name; iterating it directly
  # would walk its characters ('c', 'o', 'n', ...) and fail on the lookup.
  if isinstance(cols, str):
    cols = [cols]

  # Combine the per-column checks so a row is dropped when it is short in any
  # column, not just in the last column visited.
  too_short = None
  for col in cols:
    short_in_col = dataset[col].str.split().str.len() <= len
    too_short = short_in_col if too_short is None else (too_short | short_in_col)

  # No columns to check: nothing to remove.
  if too_short is None:
    return dataset.copy()

  return dataset[~too_short]

def convert_to_t5_format(x):
  """Wrap a text in the 'Translate: ... .Traslated: ...' template.

  Texts longer than 250 whitespace-separated words are cut to their first
  250 words (re-joined with single spaces); shorter texts are used verbatim,
  including their original whitespace. The same text fills both slots.

  Args:
    x: the string to wrap.

  Returns:
    The templated string.
  """
  # NOTE(review): "Traslated" looks like a typo for "Translated", and there is
  # no space after " ." -- both are kept exactly as-is because whatever
  # consumes this format is not visible from here; confirm before changing.
  words = x.split()
  text = x if len(words) <= 250 else ' '.join(words[:250])
  pieces = ('Translate: ', text, " .", "Traslated: ", text)
  return ''.join(pieces)

def max_length(dataset):
  """Return the largest whitespace-separated word count in `dataset['content']`.

  Args:
    dataset: DataFrame with a string column named 'content'.

  Returns:
    The word count of the longest entry.
  """
  tokens_per_row = dataset['content'].str.split()
  return max(len(tokens) for tokens in tokens_per_row)

def avg_length(dataset):
  """Return the mean whitespace-separated word count of `dataset['content']`.

  Args:
    dataset: DataFrame with a string column named 'content'.

  Returns:
    The average number of words per entry.
  """
  tokens_per_row = dataset['content'].str.split()
  word_counts = tokens_per_row.str.len()
  return word_counts.mean()

In [None]:
# Keep only the two text columns. The explicit .copy() makes this an
# independent frame, so the column assignments below act on it directly
# instead of on a slice of shakespeare_data (which raised
# SettingWithCopyWarning).
shakespeare_lines_df = shakespeare_data[['Output', 'Input']].copy()

# Strip links/@-mentions, then lowercase, on both sides of each pair.
for col in ['Input', 'Output']:
  shakespeare_lines_df[col] = shakespeare_lines_df[col].apply(removal_chars_links).str.lower()

# remove lines with 2 words or less
shakespeare_lines_df = remove_min_len(shakespeare_lines_df, 2, ['Input', 'Output'])

# Merge each pair into one "Input: ... Output: ..." string.
shakespeare_lines_df['content'] = "Input: " + shakespeare_lines_df['Input'] + " Output: " + shakespeare_lines_df['Output']

# Keep only the merged column; copy again so later cells can assign to this
# frame without triggering the same warning.
shakespeare_lines_df = shakespeare_lines_df[['content']].copy()

# Statistics about max length, average length and the size of the dataset
print("Max_length:", max_length(shakespeare_lines_df), "Avg_length:", avg_length(shakespeare_lines_df))

print("num rows:", len(shakespeare_lines_df))

shakespeare_lines_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shakespeare_lines_df['Input'] = shakespeare_lines_df['Input'].apply(removal_chars_links)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shakespeare_lines_df['Output'] = shakespeare_lines_df['Output'].apply(removal_chars_links)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shakespeare_lines_df['Inp

Max_length: 302 Avg_length: 35.06442404900192
num rows: 50447


Unnamed: 0,content
0,Input: every man you meet these days is frowni...
1,Input: our bodies are in agreement with the p...
2,Input: what's wrong? Output: but what's the ...
3,"Input: the king wanted his daughter, the only ..."
4,"Input: she's married, her husband is banished..."


In [None]:
# Write the current frame to Drive as CSV; index=False omits the row index.
shakespeare_lines_df.to_csv('/content/drive/MyDrive/685 Project/Shakespeare_TST/Shakespeare_BT_TST.csv',index = False)

In [None]:
# Pass every 'content' entry through convert_to_t5_format, store the result
# back in the same column, then drop any rows that contain NaN (in place).
t5_content = shakespeare_lines_df['content'].map(convert_to_t5_format)
shakespeare_lines_df['content'] = t5_content
shakespeare_lines_df.dropna(axis=0, inplace=True)

In [None]:
# Write the frame to a second CSV on Drive; index=False omits the row index.
shakespeare_lines_df.to_csv('/content/drive/MyDrive/685 Project/Data Preprocessing/Data/Shakespeare_no_BT_TST.csv',index = False)

# Prepare Office Data


In [None]:
# Mount Google Drive at /content/drive/ for the Office section. This repeats
# the mount from the first cell; the recorded output below reports the drive
# as already mounted.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the Office transcript, discard the columns that are not needed
# downstream, then drop every row containing NaN.
office_data = (
    pd.read_csv('/content/drive/MyDrive/685 Project/Data/The-Office-Lines-V4.csv')
    .drop(columns=['episode','season', 'title','scene','Unnamed: 6'])
    .dropna()
)

office_data.head()

Unnamed: 0,speaker,line
0,Michael,All right Jim. Your quarterlies look very good...
1,Jim,"Oh, I told you. I couldn't close it. So..."
2,Michael,So you've come to the master for guidance? Is ...
3,Jim,"Actually, you called me in here, but yeah."
4,Michael,"All right. Well, let me show you how it's done."


In [None]:
# Work on a renamed copy so the spoken line lives in a 'content' column,
# the column name the length helpers (max_length / avg_length) read.
office_df = office_data.rename(columns={"line": "content"})

# Strip links/@-mentions, then lowercase.
office_df['content'] = office_df['content'].apply(removal_chars_links).str.lower()

# remove lines with 2 words or less
# The column is passed as an explicit list: remove_min_len iterates over
# `cols`, so relying on its bare-string default would walk the characters of
# 'content' rather than use the column name.
office_df = remove_min_len(office_df, 2, ['content'])

chars = ["Jim", "Pam", "Michael", "Dwight"]

# Sanity check: print whether each target character appears as a speaker.
speakers_present = set(office_df['speaker'].unique())
for char in chars:
  print(char in speakers_present)

# Keep only the lines spoken by the target characters.
office_df = office_df[office_df['speaker'].isin(chars)]

print("Max_length:", max_length(office_df), "Avg_length:", avg_length(office_df))

print("num rows:", len(office_df))

office_df.head()



True
True
True
True
Max_length: 232 Avg_length: 13.781531728665207
num rows: 22850


Unnamed: 0,speaker,content
0,Michael,all right jim. your quarterlies look very good...
1,Jim,"oh, i told you. i couldn't close it. so..."
2,Michael,so you've come to the master for guidance? is ...
3,Jim,"actually, you called me in here, but yeah."
4,Michael,"all right. well, let me show you how it's done."


In [None]:
# Split the lines by speaker and drop the 'speaker' column from each part.
# drop() without inplace returns a new frame, so the groupby sub-frames are
# never mutated (in-place edits on them can raise SettingWithCopyWarning).
frames_by_speaker = {
    speaker: group.drop(columns='speaker')
    for speaker, group in office_df.groupby('speaker')
}

# groupby yields keys in sorted order and dicts keep insertion order, so this
# list is ordered alphabetically by speaker, as later positional access expects.
character_dfs = list(frames_by_speaker.values())
print(len(character_dfs))

# Bind each frame by speaker name rather than by list position: a missing or
# extra speaker then fails loudly with KeyError instead of silently shifting
# every character onto the wrong frame.
dwight_df = frames_by_speaker['Dwight']
jim_df = frames_by_speaker['Jim']
michael_df = frames_by_speaker['Michael']
pam_df = frames_by_speaker['Pam']

4
                                              content
16   shall i play for you? pa rum pump um pum  i h...
87        just clearing my desk. i can't concentrate.
89  it's overlapping. it's all spilling over the e...
90                                 you can't do that.
92  safety violation. i could fall and pierce an o...


In [None]:
# Output file prefixes, listed in the same order as character_dfs.
filenames = ['Dwight', 'Jim', 'Michael', 'Pam']

# Write each character's frame to '<name>TST.csv' on Drive, without the index.
for position, name in enumerate(filenames):
  frame = character_dfs[position]
  frame.to_csv(f'/content/drive/MyDrive/685 Project/Data Preprocessing/Data/{name}TST.csv',index = False)