## Data Preparation

In [None]:
import pandas as pd

In [None]:
# Load the processed movie and book datasets into separate DataFrames
df_movie = pd.read_csv('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/Processed_Movie_Dataset')
df_books = pd.read_csv('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/Processed_Book_Dataset')

In [None]:
# Remove the Wikipedia identifier column from each dataset
df_movie.drop(columns=['Wikipedia movie ID'], inplace=True)
df_books.drop(columns=['Wikipedia article ID'], inplace=True)

In [None]:
# Align the column names of both frames so they can be concatenated
df_movie.rename(columns={'Movie_Name': 'Title'}, inplace=True)
df_books.rename(
  columns={
    'Book title': 'Title',
    'Plot summary': 'Summary',
    'Book genres': 'Genre',
  },
  inplace=True,
)

In [None]:
# Stack the movie rows and book rows into one frame, renumbering the rows
# from 0 so no index labels are duplicated
final_df = pd.concat([df_movie, df_books], ignore_index=True)

In [None]:
# Formatting Relations 
def char_extraction_format(relations):
  """Format the character pair of each relation sentence as ``<A,B>``.

  Each sentence is expected to look like ``"<name 1> and <name 2> have ..."``:
  the words before the first "and" form the first name, and the words between
  that "and" and the first "have" form the second name.

  Args:
    relations: Iterable of relation sentences. Blank or whitespace-only
      entries are skipped.

  Returns:
    The pairs joined with ", " (e.g. ``"<Tom,Jerry>, <Ann,Bob>"``), or
    ``"<>"`` when there is no pair to report.

  Raises:
    ValueError: If a non-blank sentence lacks the word "and" or "have".
  """
  pairs = []
  for rel in relations:
    words = rel.split()
    if not words:
      # Blank fragment (e.g. the tail left after splitting text on a final
      # "."): there is no pair to extract and list.index would raise on it.
      continue
    and_idx = words.index("and")
    have_idx = words.index("have")
    first = ' '.join(words[:and_idx])
    second = ' '.join(words[and_idx + 1:have_idx])
    pairs.append(f"<{first},{second}>")
  return ', '.join(pairs) if pairs else "<>"

def format_relations(x):
  """Group pairwise relation sentences by sentiment into one summary string.

  ``x`` is split on "." into sentences. A sentence is classified as neutral
  if it contains "neutral", as positive if it contains "positive", and as
  negative otherwise. The character pair of every sentence is then extracted
  with ``char_extraction_format``, giving a string of the form::

    Neutral: <Char1,Char2>, <Char3,Char4>. Positive: <>. Negative: <Char2,Char4>.

  followed by a single trailing space.

  Blank fragments -- the empty piece ``split(".")`` leaves after a final
  period, or the only piece of an empty string -- carry no character pair
  and are ignored instead of being classified.
  """
  # Insertion order of this dict fixes the order of the output sections.
  buckets = {"Neutral:": [], "Positive:": [], "Negative:": []}

  for sent in x.split("."):
    if not sent.strip():
      # Without this guard the fragment would land in the negative bucket.
      continue
    if "neutral" in sent:
      buckets["Neutral:"].append(sent)
    elif "positive" in sent:
      buckets["Positive:"].append(sent)
    else:
      buckets["Negative:"].append(sent)

  return "".join(
    f"{label} {char_extraction_format(sentences)}. "
    for label, sentences in buckets.items()
  )

In [None]:
# Rewrite every Relations entry into the grouped Neutral/Positive/Negative form
final_df['Relations'] = final_df['Relations'].map(format_relations)

In [None]:
# Show the frame as the cell output to inspect the reformatted Relations column
final_df

In [None]:
# Terminate each text column with a period (Relations already ends with one)
for column in ('Title', 'Genre', 'Characters'):
  final_df[column] = final_df[column] + "."

In [None]:
# Merging all columns in a single input for T5
def prepare_text(x):
  """Return one space-separated string of labelled fields for row ``x``."""
  labelled_columns = (
    ("Title:", 'Title'),
    ("Genre:", 'Genre'),
    ("Characters:", 'Characters'),
    ("Relations:", 'Relations'),
  )
  parts = []
  for label, column in labelled_columns:
    parts.append(label)
    parts.append(x[column])
  return ' '.join(parts)
# Call prepare_text on each row (axis=1) and store the result in the 'Input' column
final_df['Input'] = final_df.apply(prepare_text, axis=1)

In [None]:
# Dropping redundant columns: their text has been merged into 'Input'.
# The keyword must be spelled `inplace`; the previous `inpalce` made
# DataFrame.drop raise TypeError for an unexpected keyword argument.
final_df.drop(['Genre', 'Title', 'Characters', 'Relations'], axis=1, inplace=True)

In [None]:
# Save the dataset as CSV; index=False leaves the row index out of the file
final_df.to_csv('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/Training_Dataset', index=False)