In [1]:
!apt install subversion
import pandas as pd
import unicodedata as ud
import numpy as np
import os
import re

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libapr1 libaprutil1 libserf-1-1 libsvn1
Suggested packages:
  db5.3-util libapache2-mod-svn subversion-tools
The following NEW packages will be installed:
  libapr1 libaprutil1 libserf-1-1 libsvn1 subversion
0 upgraded, 5 newly installed, 0 to remove and 34 not upgraded.
Need to get 2,237 kB of archives.
After this operation, 9,910 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libapr1 amd64 1.6.3-2 [90.9 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libaprutil1 amd64 1.6.1-2 [84.4 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libserf-1-1 amd64 1.3.9-6 [44.4 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd6

## 1. Download **datasets**

In [2]:
# Datasets are downloaded from our project's repository
# SVN let us download the folder we are interested in
!svn checkout https://github.com/Alicia6N/movie_generator/trunk/datasets 

A    datasets/anime_clean.csv
A    datasets/merged_dataset_descriptions.csv
A    datasets/merged_dataset_titles.csv
A    datasets/movies_metadata.csv
A    datasets/netflix_titles.csv
A    datasets/tmdb_5000_movies.csv
Checked out revision 5.


## 2. Load and Clean Raw Datasets

In [None]:
# Read .csv's, replace column names and remove NaN data.
movie_data = pd.read_csv('datasets/movies_metadata.csv')
movie_data = movie_data[['original_title', 'overview', 'genres', 'release_date']]
movie_data.columns = ['title', 'overview', 'genre', 'year']
movie_data = movie_data.dropna()
print('movies_metadata shape: ', movie_data.shape)

netflix_data = pd.read_csv('datasets/netflix_titles.csv')
netflix_data = netflix_data[['title', 'description', 'listed_in', 'release_year']]
netflix_data.columns = ['title', 'overview', 'genre', 'year']
netflix_data = netflix_data.dropna()
print('netflix_titles shape: ', netflix_data.shape)

imdb_data = pd.read_csv('datasets/tmdb_5000_movies.csv')
imdb_data = imdb_data[['title', 'overview', 'genres', 'release_date']]
imdb_data.columns = ['title', 'overview', 'genre', 'year']
imdb_data = imdb_data.dropna()
print('imdb_5000_movies shape: ', imdb_data.shape)

anime_data = pd.read_csv('datasets/anime_clean.csv')
anime_data = anime_data[['Title', 'Description', 'Genres', 'Start airing']]
anime_data.columns = ['title', 'overview', 'genre', 'year']
anime_data = anime_data.dropna()
print('anime_clean shape: ', anime_data.shape)

def remove_long_spaces(df):
    clean_dataset = df.copy()
    
    for index, row in df.iterrows():
        aux_title = df.at[index, 'title']
        aux_desc = df.at[index, 'overview']

        aux_title = ' '.join(aux_title.split())
        aux_desc = ' '.join(aux_desc.split())

        clean_dataset.at[index, 'title'] = aux_title
        clean_dataset.at[index, 'overview'] = aux_desc

    return df

movie_data = remove_long_spaces(movie_data).copy()
netflix_data = remove_long_spaces(netflix_data).copy()
imdb_data = remove_long_spaces(imdb_data).copy()
anime_data = remove_long_spaces(anime_data).copy()

  interactivity=interactivity, compiler=compiler, result=result)


movies_metadata shape:  (44438, 4)
netflix_titles shape:  (7787, 4)
imdb_5000_movies shape:  (4799, 4)
anime_clean shape:  (1546, 4)


In [None]:
movie_data.head()

Unnamed: 0,title,overview,genre,year
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30
1,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10


In [None]:
netflix_data.head()

Unnamed: 0,title,overview,genre,year
0,3%,In a future where the elite inhabit an island ...,"International TV Shows, TV Dramas, TV Sci-Fi &...",2020
1,7:19,After a devastating earthquake hits Mexico Cit...,"Dramas, International Movies",2016
2,23:59,"When an army recruit is found dead, his fellow...","Horror Movies, International Movies",2011
3,9,"In a postapocalyptic world, rag-doll robots hi...","Action & Adventure, Independent Movies, Sci-Fi...",2009
4,21,A brilliant group of students become card-coun...,Dramas,2008


In [None]:
imdb_data.head()

Unnamed: 0,title,overview,genre,year
0,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",2009-12-10
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",2007-05-19
2,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",2015-10-26
3,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",2012-07-16
4,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",2012-03-07


In [None]:
anime_data.head()

Unnamed: 0,title,overview,genre,year
0,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Action,Military,Adventure,Comedy,Drama,Magic,F...",05/04/2009
1,Kimi no Na wa.,"Mitsuha Miyamizu, a high school girl, yearns t...","Supernatural,Drama,Romance,School",26/08/2016
2,Gintama°,"Gintoki, Shinpachi, and Kagura return as the f...","Action,Comedy,Historical,Parody,Samurai,Sci-Fi...",08/04/2015
3,Steins;Gate 0,The dark untold story of Steins;Gate that lead...,"Sci-Fi,Thriller",12/04/2018
4,Steins;Gate,The self-proclaimed mad scientist Rintarou Oka...,"Sci-Fi,Thriller",06/04/2011


## 3. Preprocess Datasets

In [None]:
# Set of auxiliary functions 

def is_latin(uchr):
    latin_letters = {}
    try: return latin_letters[uchr]
    except KeyError:
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
        for uchr in unistr
        if uchr.isalpha())

def filter_title(movie, dataset, titles):
    lowered_title = movie['title'].lower()
    if not only_roman_chars(lowered_title):
        return False 
    if lowered_title in titles:
        dup_index = titles.index(lowered_title)
        dup_movie = dataset[dup_index]
        if str(movie['year']) == dup_movie[3]:
            return False
    return True

def filter_overview(overview):
    return False if 'no overview' in overview else True

def normalize_year(movie):
    year = str(movie['year'])
    if year.find("-") != -1:
        movie['year'] = year[:year.find("-")]
    return movie['year']

def minimum_length(overview):
    return True if len(overview.split()) >= 30 else False

def return_clean_dataset(dataset, movies, titles):
    count = 0
    for iter, row in dataset.iterrows():
        row['year'] = normalize_year(row)
        overview = row['overview'].lower()
        if filter_title(row, movies, titles) and filter_overview(overview) and minimum_length(overview):
            movies.append([row['title'], row['overview'], row['genre'], row['year']])
            titles.append(row['title'].lower())
        else:
            count += 1
    print('Removed: ', count)
    print('Updated clean dataset size: ', len(movies))
    return movies, titles

In [None]:
merged_dataset_filename = 'datasets/merged_dataset_descriptions.csv'
if not os.path.isfile(merged_dataset_filename):
    movies = []
    titles = []
    print('--- Cleaning movie_data dataset...')
    movies, titles = return_clean_dataset(movie_data, movies, titles)       # movie_data clean dataset
    print('--- Cleaning neftlix_data dataset...')
    movies, titles = return_clean_dataset(netflix_data, movies, titles)     # neftlix_data clean dataset
    print('--- Cleaning imdb_data dataset...')
    movies, titles = return_clean_dataset(imdb_data, movies, titles)        # imdb_data clean dataset
    print('--- Cleaning anime_data dataset...')
    movies, titles = return_clean_dataset(anime_data, movies, titles)        # imdb_data clean dataset

    # Merge all datasets and save to csv
    merged_dataset = pd.DataFrame(np.array(movies)[:,:2], columns=['title', 'overview'])
    merged_dataset.to_csv(merged_dataset_filename, index=False)
    print('--- Saved to ', merged_dataset_filename)
else:
    print('- Dataset \'merged_dataset_descriptions.csv\' already downloaded, skipped Preprocessing stage.')

--- Cleaning movie_data dataset...
Removed:  15203
Updated clean dataset size:  29235
--- Cleaning neftlix_data dataset...
Removed:  7697
Updated clean dataset size:  29325
--- Cleaning imdb_data dataset...
Removed:  4342
Updated clean dataset size:  29782
--- Cleaning anime_data dataset...
Removed:  244
Updated clean dataset size:  31084
--- Saved to  datasets/merged_dataset_descriptions.csv


In [None]:
merged_dataset_filename = 'datasets/merged_dataset_titles.csv'
if not os.path.isfile(merged_dataset_filename):
    movies = []
    titles = []
    print('--- Cleaning movie_data dataset...')
    movies, titles = return_clean_dataset(movie_data, movies, titles)       # movie_data clean dataset
    print('--- Cleaning neftlix_data dataset...')
    movies, titles = return_clean_dataset(netflix_data, movies, titles)     # neftlix_data clean dataset
    print('--- Cleaning imdb_data dataset...')
    movies, titles = return_clean_dataset(imdb_data, movies, titles)        # imdb_data clean dataset

    # Merge all datasets and save to csv
    merged_dataset = pd.DataFrame(np.array(movies)[:,:2], columns=['title', 'overview'])
    merged_dataset.to_csv(merged_dataset_filename, index=False)
    print('--- Saved to ', merged_dataset_filename)
else:
    print('- Dataset \'merged_dataset_descriptions_titles.csv\' already downloaded, skipped Preprocessing stage.')

--- Cleaning movie_data dataset...
Removed:  15203
Updated clean dataset size:  29235
--- Cleaning neftlix_data dataset...
Removed:  7697
Updated clean dataset size:  29325
--- Cleaning imdb_data dataset...
Removed:  4342
Updated clean dataset size:  29782
--- Saved to  datasets/merged_dataset_titles.csv


## Example

In [None]:
dataset = pd.read_csv('datasets/merged_dataset_descriptions.csv')
# Let's see an example of our dataset.
print("Title: ", dataset['title'][1])
print("Overview: ", dataset['overview'][1])
print("-----")
print("Title: ", dataset['title'][2])
print("Overview: ", dataset['overview'][2])

Title:  Jumanji
Overview:  When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.
-----
Title:  Grumpier Old Men
Overview:  A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
