### CMU Movie Summaries

In [None]:
# Extracting all the files
import tarfile
# The context manager closes the archive even when extraction raises part-way through
with tarfile.open('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/MovieSummaries.tar.gz') as my_tar:
  my_tar.extractall('./CMU_Summaries')

In [None]:
# Load the README that ships with the dataset; it explains the files and their columns
readme_path = '/content/CMU_Summaries/MovieSummaries/README.txt'
with open(readme_path, 'r') as f:
  readme_data = f.read()
# print(readme_data)

In [None]:
# Loading the movie metadata (tab-separated, no header row, so column names are supplied here)
import pandas as pd
movie_meta_columns = [
  'Wikipedia movie ID',
  'Freebase movie ID',
  'Movie name',
  'Movie release date',
  'Movie box office revenue',
  'Movie runtime',
  'Movie language',
  'Movie countries',
  'Movie genres',
]
df_movie_meta = pd.read_table(
  '/content/CMU_Summaries/MovieSummaries/movie.metadata.tsv',
  header=None,
  names=movie_meta_columns,
)

In [None]:
# Display the movie metadata table to inspect what was loaded
df_movie_meta

In [None]:
# Loading the character metadata (tab-separated, no header row, so column names are supplied here)
charac_meta_columns = [
  'Wikipedia movie ID',
  'Freebase movie ID',
  'Movie release date',
  'Character name',
  'Actor date of birth',
  'Actor gender',
  'Actor height',
  'Actor ethnicity',
  'Actor name',
  'Actor age at movie release',
  'Freebase character/actor map ID',
  'Freebase character ID',
  'Freebase actor ID',
]
df_charac_meta = pd.read_table(
  '/content/CMU_Summaries/MovieSummaries/character.metadata.tsv',
  header=None,
  names=charac_meta_columns,
)

In [None]:
# Display the character metadata table to inspect what was loaded
df_charac_meta

In [None]:
# Reading the summaries from a text file
# Can directly load the summaries into a dataframe - done in CMU Books dataset
# The encoding is pinned so the result does not depend on the platform's default codec
# (assumes the corpus text is UTF-8, the codec pd.read_table uses by default - TODO confirm)
with open('/content/CMU_Summaries/MovieSummaries/plot_summaries.txt', 'r', encoding='utf-8') as f:
  summaries = f.read()

In [None]:
# Splitting different summaries: the file holds one record per line
# Guarded so that re-running this cell after the split is a no-op instead of an AttributeError
if isinstance(summaries, str):
  summaries = summaries.split('\n')

In [None]:
# Preview only the first few records; displaying the whole list would dump every summary into the output
summaries[:5]

In [None]:
# Start from an empty frame with the two target columns; it is filled from the raw lines next
summary_columns = ['Wikipedia movie ID', 'Summary']
df_summaries = pd.DataFrame(columns=summary_columns)

In [None]:
# Store one raw line per row; each line still contains the movie ID, a tab, and the summary text
df_summaries['Summary'] = summaries

In [None]:
# Helpers that split a raw "<movie ID>\t<summary>" line into its two parts
def sep_id(x):
  """Return the text before the first tab in ``x`` (all of ``x`` when it has no tab)."""
  movie_id, _, _ = x.partition('\t')
  return movie_id
def sep_summary(x):
  """Return the second tab-separated field of ``x``, or None when there is none.

  None is returned for lines without a tab (e.g. the empty string left by a
  trailing newline) and for non-string values, so such rows can be removed
  with ``dropna``. Explicit checks replace the former bare ``except``, which
  also swallowed unrelated errors such as KeyboardInterrupt.
  """
  if not isinstance(x, str):
    return None
  fields = x.split('\t')
  return fields[1] if len(fields) > 1 else None

In [None]:
# Split each raw line into its ID and summary parts.
# Order matters: the ID must be extracted before 'Summary' is overwritten with the summary text.
# sep_summary yields None for lines without a tab, which marks them for removal.
df_summaries['Wikipedia movie ID']  = df_summaries['Summary'].apply(sep_id)
df_summaries['Summary'] = df_summaries['Summary'].apply(sep_summary)

In [None]:
# Remove rows with a missing value in any column (e.g. lines that had no tab-separated summary)
df_summaries = df_summaries.dropna()

In [None]:
# Renumber the rows 0..n-1 after the drop, discarding the old index
df_summaries = df_summaries.reset_index(drop=True)

In [None]:
# Adding empty string columns for the movie's genre, its title, and the characters of the story
df_summaries = df_summaries.assign(Genre="", Movie_Name="",Characters="")

In [None]:
# Display the summaries table with the newly added (still empty) columns
df_summaries

In [None]:
# Extracting the movie genre and its title
# Each summary is matched to the first metadata row with the same Wikipedia movie ID.
# Summaries whose ID is not numeric, or has no metadata row, are dropped.
# The lookup is vectorised: the earlier row-by-row version rebuilt a Python list of all
# metadata IDs for every summary and wrote results through chained indexing.
meta_by_id = (
  df_movie_meta
  .drop_duplicates(subset='Wikipedia movie ID', keep='first')
  .set_index('Wikipedia movie ID')
)
# IDs were read from text, so convert them; values that cannot be converted become NaN
summary_ids = pd.to_numeric(df_summaries['Wikipedia movie ID'], errors='coerce')
has_metadata = summary_ids.isin(meta_by_id.index)
labels = df_summaries.index[~has_metadata].tolist()  # index labels of the summaries being dropped
df_summaries = df_summaries.loc[has_metadata].reset_index(drop=True)
# Every remaining ID is known to be in the metadata index, so the label lookup cannot miss
matched_ids = summary_ids[has_metadata].astype('int64').to_numpy()
df_summaries['Movie_Name'] = meta_by_id.loc[matched_ids, 'Movie name'].to_numpy()
df_summaries['Genre'] = meta_by_id.loc[matched_ids, 'Movie genres'].to_numpy()

In [None]:
# Extract Genre
import json
def get_genre(x):
  """Parse the JSON object in ``x`` and return its values joined with ', '."""
  genre_map = json.loads(x)
  genre_names = genre_map.values()
  return ', '.join(genre_names)
# Replace each raw JSON genre mapping with a comma-separated string of its values
df_summaries['Genre'] = df_summaries['Genre'].apply(get_genre)

In [None]:
# Extraction of different characters of a movie from the metadata
# Using BookNLP for this task, as current metadata skips a lot of characters from different stories
#
# The character table is grouped once by movie ID instead of being re-sliced and re-scanned
# for every match. Missing character names are skipped; previously a NaN name raised a
# TypeError that the bare `except` swallowed, silently ending that movie's scan.
characters_by_movie = (
  df_charac_meta
  .dropna(subset=['Character name'])
  .groupby('Wikipedia movie ID')['Character name']
  .apply(list)
  .to_dict()
)
labels = []  # index labels of summaries with no named character in the metadata
for idx, row in df_summaries.iterrows():
  try:
    movie_id = int(row['Wikipedia movie ID'])
  except (TypeError, ValueError):
    movie_id = None  # non-numeric ID: treated as having no characters
  names = characters_by_movie.get(movie_id, [])
  # One output line per movie: its title followed by the character names found for it
  print()
  print(row['Movie_Name'], *names, end=' ')
  if not names:
    labels.append(idx)
# The metadata names are only printed for inspection; the column itself is left empty
df_summaries = df_summaries.drop(['Characters'], axis=1).assign(Characters="")

In [None]:
# Write the processed movie summaries to Drive as CSV, without the index column
movie_dataset_path = '/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/CMU_Movie_Dataset'
df_summaries.to_csv(movie_dataset_path, index=False)

### CMU Book Summaries

In [None]:
# Unzipping files from the folder
import tarfile
# The context manager closes the archive even when extraction raises part-way through
with tarfile.open('/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/booksummaries.tar.gz') as my_tar:
  my_tar.extractall('./CMU_Book_Summaries')

In [None]:
# Load and print the README that ships with the book corpus (file description and content)
readme_path = '/content/CMU_Book_Summaries/booksummaries/README'
with open(readme_path, 'r') as f:
  readme_data = f.read()
print(readme_data)

In [None]:
# Loading the summaries (tab-separated, no header row, so column names are supplied here)
columns = [
  'Wikipedia article ID',
  'Freebase ID',
  'Book title',
  'Author',
  'Publication date',
  'Book genres',
  'Plot summary',
]
df_summaries = pd.read_table(
  '/content/CMU_Book_Summaries/booksummaries/booksummaries.txt',
  header=None,
  names=columns,
)

In [None]:
# Keep only the columns used downstream; the Freebase ID, author and publication date are dropped
df_summaries = df_summaries.drop(columns=['Freebase ID', 'Author', 'Publication date'])

In [None]:
# Remove every row that has a missing value in any of the remaining columns
df_summaries = df_summaries.dropna()

In [None]:
# Genre extraction
import json
def extract_genres(x):
  """Parse the JSON object in ``x`` and return its values joined with ', '."""
  genre_map = json.loads(x)
  genre_names = genre_map.values()
  return ", ".join(genre_names)

In [None]:
# Replace each raw JSON genre mapping with a comma-separated string of its values
df_summaries['Book genres'] = df_summaries['Book genres'].apply(extract_genres)

In [None]:
# Write the processed book summaries to Drive as CSV, without the index column
book_dataset_path = '/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/CMU_Book_Dataset'
df_summaries.to_csv(book_dataset_path, index=False)