## Data Preparation

In [1]:
import pandas as pd

In [2]:
# Loading the processed dataset
df_movie = pd.read_csv('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/M_Dataset_0_2K')
df_books = pd.read_csv('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/B_Dataset_0K_2K')

In [5]:
# Drop wikipedia ID
df_movie.drop(['Wikipedia movie ID'],axis=1,inplace=True)
df_books.drop(['Wikipedia article ID'],axis=1,inplace=True)

In [8]:
# Rename the columns to make them compatible for concatenation
df_movie.rename(columns = {'Movie_Name':'Title'}, inplace = True)
df_books.rename(columns = {'Plot summary':'Summary','Book title':'Title', 'Book genres':'Genre'}, inplace = True)

In [13]:
#Concatenate the datasets
final_df = pd.concat([df_movie,df_books])
final_df.reset_index(drop=True, inplace=True)

In [24]:
# Formatting Relations 
def char_extraction_format(relations):
  formatted = []
  for rel in relations:
    s = rel.split()
    idx1 = s.index("and")
    idx2 = s.index("have")
    char1 = ' '.join(s[:idx1])
    char2 = ' '.join(s[idx1+1:idx2])
    formatted.append(f"<{char1},{char2}>")
  if formatted==[]:
    return "<>"
  else:
    return ', '.join(formatted)

def format_relations(x):
  """
  
  To get the relations to the following format:
  Neutral: <Char1, Char2>, <Char3, Char4>
  Positive: <>
  Negative: <Char2, Char4>

  """
  
  relations = x.split(".")
  neutral = []
  positive = []
  negative = []
  
  d = {"Neutral:":[],"Positive:":[],"Negative:":[]}

  for sent in relations:
    if "neutral" in sent:
      neutral.append(sent)
    elif "positive" in sent:
      positive.append(sent)
    else:
      negative.append(sent)
  
  d["Neutral:"] = char_extraction_format(neutral)
  d["Positive:"] = char_extraction_format(positive)
  d["Negative:"] = char_extraction_format(negative)

  new_relations = ""
  for key in d:
    new_relations = new_relations + key + " " + d[key] + ". "

  return new_relations 

In [None]:
final_df['Relations'] = final_df['Relations'].apply(format_relations)

In [23]:
final_df

Unnamed: 0,Summary,Genre,Title,Characters,Relations
0,"Shlykov, a hard-working taxi driver and Lyosha...","Drama, World cinema",Taxi Blues,,
1,The nation of Panem consists of a wealthy Capi...,"Action/Adventure, Science Fiction, Action, Drama",The Hunger Games,"Katniss, Peeta, Rue, Cato, Haymitch, Crane, Cl...",Katniss and Rue have neutral relationship. Pee...
2,Poovalli Induchoodan is sentenced for six yea...,"Musical, Action, Drama, Bollywood",Narasimham,"Induchoodan, Manapally Pavithran, Manapally Ma...",Induchoodan and Pavithran have neutral relatio...
3,"The Lemon Drop Kid , a New York City swindler,...","Screwball comedy, Comedy",The Lemon Drop Kid,"the Kid, Charley, Moran, Brainy, Nellie Thursd...",Moran and Charley have neutral relationship
4,Seventh-day Adventist Church pastor Michael Ch...,"Crime Fiction, Drama, Docudrama, World cinema,...",A Cry in the Dark,"Lindy, Michael, Azaria.",Michael and Lindy have neutral relationship
...,...,...,...,...,...
3961,Much of narrative describes Hemingway's adven...,"Travel, Biography, Travel literature, Non-fict...",Green Hills of Africa,"Hemingway, Karl.",
3962,"Biblical references: Matt. 2:1-12, Luke 2:1-2...","Historical novel, Children's literature, Histo...",Ben-Hur: A Tale of the Christ,"Judah, Jesus, Messala, Simonides, Ilderim, the...",Herod the Great and the Sanhedrin have neutral...
3963,SAM Madison has always been totally ordinary....,"Children's literature, Young adult literature,...",All American Girl,SAM Madison.,
3964,While working in the dead letter office in Ne...,"Speculative fiction, Fantasy",The Great and Secret Show,"Jaffe, Tesla, Kissoon, Fletcher, Howard, Tommy...",Grillo and Tesla have neutral relationship. To...


In [None]:
# Adding a period at the end of each column (already done for Relations)
final_df['Title'] = final_df['Title']+"."
final_df['Genre'] = final_df['Genre']+"."
final_df['Characters'] = final_df['Characters']+"."

In [None]:
# Merging all columns in a single input for T5
prepare_text = lambda x: ' '.join(["Title:" , x['Title'], "Genre:", x['Genre'], "Characters:", x['Characters'], "Relations:", x['Relations']])
final_df['Input'] = final_df.apply(prepare_text, axis=1)

In [None]:
# Dropping redundant columns
final_df.drop(['Genre', 'Title', 'Characters', 'Relations'], axis=1, inpalce=True)

In [None]:
#Save the dataset
final_df.to_csv('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/Training_Dataset', index=False)