In [1]:
import pandas as pd
import numpy as np

## Fetch data from the internet

In [2]:
# Start from an empty ./data directory: delete any existing one, then recreate it.
!rm -rf ./data
!mkdir data

In [3]:
# Download the MovieLens 20M archive and save it as ./data/ml-20m.zip.
# NOTE(review): this URL uses plain HTTP — consider HTTPS if the host supports it.
print('Getting movielens data...\n')
!wget http://files.grouplens.org/datasets/movielens/ml-20m.zip -O ./data/ml-20m.zip

Getting movielens data...

--2018-11-13 17:01:36--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.34.235
Connecting to files.grouplens.org (files.grouplens.org)|128.101.34.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘./data/ml-20m.zip’


2018-11-13 17:01:44 (25.3 MB/s) - ‘./data/ml-20m.zip’ saved [198702078/198702078]



In [4]:
# Download the IMDb title.akas dump (gzipped TSV) and save it under ./data.
print('Getting imdb title data...\n')
!wget https://datasets.imdbws.com/title.akas.tsv.gz -O ./data/title.akas.tsv.gz

Getting imdb title data...

--2018-11-13 17:01:45--  https://datasets.imdbws.com/title.akas.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 54.230.8.90, 54.230.8.181, 54.230.8.166, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|54.230.8.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54267596 (52M) [text/tab-separated-values]
Saving to: ‘./data/title.akas.tsv.gz’


2018-11-13 17:01:47 (23.5 MB/s) - ‘./data/title.akas.tsv.gz’ saved [54267596/54267596]



In [5]:
# Download the IMDb title.ratings dump (gzipped TSV) and save it under ./data.
print('Getting imdb ratings data...\n')
!wget https://datasets.imdbws.com/title.ratings.tsv.gz -O ./data/title.ratings.tsv.gz

Getting imdb ratings data...

--2018-11-13 17:01:48--  https://datasets.imdbws.com/title.ratings.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 54.230.8.90, 54.230.8.181, 54.230.8.166, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|54.230.8.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4365124 (4.2M) [binary/octet-stream]
Saving to: ‘./data/title.ratings.tsv.gz’


2018-11-13 17:01:48 (19.3 MB/s) - ‘./data/title.ratings.tsv.gz’ saved [4365124/4365124]



In [6]:
print('Getting imdb title data...\n')
!wget https://datasets.imdbws.com/title.basics.tsv.gz -O ./data/title.basics.tsv.gz
print('Data downloads complete.\n')

Getting imdb title data...

--2018-11-13 17:01:50--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 54.230.8.90, 54.230.8.181, 54.230.8.166, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|54.230.8.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97196768 (93M) [text/tab-separated-values]
Saving to: ‘./data/title.basics.tsv.gz’


2018-11-13 17:01:54 (20.4 MB/s) - ‘./data/title.basics.tsv.gz’ saved [97196768/97196768]

Data downloads complete.



In [10]:
print('Unzipping...')
!unzip -o ./data/ml-20m.zip -d ./data
#!mkdir ./data/title.akas/
#!gunzip -c ./data/title.akas.tsv.gz > ./data/title.akas/title.akas.tsv
!mkdir ./data/title.ratings/
!gunzip -c ./data/title.ratings.tsv.gz > ./data/title.ratings/title.ratings.tsv
!mkdir ./data/title.basics/
!gunzip -c ./data/title.basics.tsv.gz > ./data/title.basics/title.basics.tsv
print('Decompression complete')

Unzipping...
Archive:  ./data/ml-20m.zip
   creating: ./data/ml-20m/
  inflating: ./data/ml-20m/genome-scores.csv  
  inflating: ./data/ml-20m/genome-tags.csv  
  inflating: ./data/ml-20m/links.csv  
  inflating: ./data/ml-20m/movies.csv  
  inflating: ./data/ml-20m/ratings.csv  
  inflating: ./data/ml-20m/README.txt  
  inflating: ./data/ml-20m/tags.csv  
Decompression complete


## Load data into pandas DataFrames

In [96]:
# Read each MovieLens 20M CSV into its own DataFrame.
ml_20m_dir = './data/ml-20m'

df_movies = pd.read_csv(ml_20m_dir + '/movies.csv')
df_rating = pd.read_csv(ml_20m_dir + '/ratings.csv')
df_tags = pd.read_csv(ml_20m_dir + '/tags.csv')
df_links = pd.read_csv(ml_20m_dir + '/links.csv')
df_genome_scores = pd.read_csv(ml_20m_dir + '/genome-scores.csv')
df_genome_tags = pd.read_csv(ml_20m_dir + '/genome-tags.csv')

In [108]:
# Explicit column dtypes for the two IMDb tables that are read with a dtype map.
akas_dtypes = {
    'titleId': str,
    'ordering': int,
    'title': str,
    'region': str,
    'language': str,
    'types': str,
    'attributes': str,
    'isOriginalTitle': str,
}
# Every title.basics column is read as a string.
basics_dtypes = dict.fromkeys(
    [
        'tconst',
        'titleType',
        'primaryTitle',
        'originalTitle',
        'isAdult',
        'startYear',
        'endYear',
        'runtimeMinutes',
        'genres',
    ],
    str,
)

# The literal \N marker is parsed as a missing value in all three files.
imdb_title = pd.read_table(
    './data/title.akas/title.akas.tsv',
    dtype=akas_dtypes,
    na_values='\\N',
)
imdb_ratings = pd.read_table(
    './data/title.ratings/title.ratings.tsv',
    na_values='\\N',
)
imdb_title_basics = pd.read_table(
    './data/title.basics/title.basics.tsv',
    na_values='\\N',
    dtype=basics_dtypes,
)

## Preprocessing steps

In [109]:
# Interpret the ratings timestamps as seconds and convert them to datetimes.
df_rating['timestamp'] = pd.to_datetime(df_rating['timestamp'], unit='s')

In [110]:
# Interpret the tag timestamps as seconds and convert them to datetimes.
df_tags['timestamp'] = pd.to_datetime(df_tags['timestamp'], unit='s')

In [111]:
# Extract the first parenthesised four-digit number in each title as the movie year.
# The pattern is a raw string so `\(` is a regex escape rather than an invalid
# Python string escape, and expand=False makes str.extract return a Series for
# the single capture group regardless of the pandas version's default.
df_movies['movie_year'] = df_movies.title.str.extract(
    r'\(([0-9][0-9][0-9][0-9])\)', expand=False
)
# Append one indicator column per genre, splitting the genres string on '|'.
df_movies = pd.concat([df_movies, df_movies.genres.str.get_dummies(sep='|')], axis=1)

  if __name__ == '__main__':


#### Format imdb_ratings for joining

In [112]:
# Build the numeric `imdbId` join key by stripping the leading 'tt' and any
# zero padding from `tconst`.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the prefix in place and make
# pd.to_numeric raise. The pattern is anchored so only the leading prefix is removed.
imdb_ratings['imdbId'] = pd.to_numeric(
    imdb_ratings.tconst.str.replace(r'^tt0*', '', regex=True)
)
# The original string identifier is no longer needed once the key exists.
imdb_ratings = imdb_ratings.drop(['tconst'], axis=1)

display(imdb_ratings.head())

Unnamed: 0,averageRating,numVotes,imdbId
0,5.8,1435,1
1,6.3,170,2
2,6.6,1030,3
3,6.4,101,4
4,6.2,1728,5


#### Format imdb_title_basics for joining

In [113]:
# Build the numeric `imdbId` join key by stripping the leading 'tt' and any
# zero padding from `tconst`.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the prefix in place and make
# pd.to_numeric raise. The pattern is anchored so only the leading prefix is removed.
imdb_title_basics['imdbId'] = pd.to_numeric(
    imdb_title_basics.tconst.str.replace(r'^tt0*', '', regex=True)
)
imdb_title_basics = imdb_title_basics.drop(['tconst', 'endYear'], axis=1)
# Keep the comma-separated genres string and add a list version alongside it.
imdb_title_basics['genre_list'] = imdb_title_basics['genres'].str.split(',')
display(imdb_title_basics.head())

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,imdbId,genre_list
0,short,Carmencita,Carmencita,0,1894,1.0,"Documentary,Short",1,"[Documentary, Short]"
1,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5.0,"Animation,Short",2,"[Animation, Short]"
2,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4.0,"Animation,Comedy,Romance",3,"[Animation, Comedy, Romance]"
3,short,Un bon bock,Un bon bock,0,1892,,"Animation,Short",4,"[Animation, Short]"
4,short,Blacksmith Scene,Blacksmith Scene,0,1893,1.0,"Comedy,Short",5,"[Comedy, Short]"


### Merge IMDB files
#### imdb_title_basics with df_links

In [114]:
# Left join: keep every MovieLens link row and attach IMDb title.basics columns by imdbId.
film_database_data = pd.merge(
    left=df_links,
    right=imdb_title_basics,
    on='imdbId',
    how='left',
)

In [115]:
# Print the merged and original shapes side by side so the row counts can be
# compared by eye after the left merge.
print("Merged dataframe shape: {}".format(film_database_data.shape))
print("Original links shape: {}".format(df_links.shape))

# Show the first rows of the merged frame as the cell output.
print("\nMerged dataframe:")
film_database_data.head()

Merged dataframe shape: (27278, 11)
Original links shape: (27278, 3)

Merged dataframe:


Unnamed: 0,movieId,imdbId,tmdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,genre_list
0,1,114709,862.0,movie,Toy Story,Toy Story,0,1995,81,"Adventure,Animation,Comedy","[Adventure, Animation, Comedy]"
1,2,113497,8844.0,movie,Jumanji,Jumanji,0,1995,104,"Adventure,Family,Fantasy","[Adventure, Family, Fantasy]"
2,3,113228,15602.0,movie,Grumpier Old Men,Grumpier Old Men,0,1995,101,"Comedy,Romance","[Comedy, Romance]"
3,4,114885,31357.0,movie,Waiting to Exhale,Waiting to Exhale,0,1995,124,"Comedy,Drama,Romance","[Comedy, Drama, Romance]"
4,5,113041,11862.0,movie,Father of the Bride Part II,Father of the Bride Part II,0,1995,106,"Comedy,Family,Romance","[Comedy, Family, Romance]"


#### film_database_data with imdb_ratings

In [125]:
# Left join: attach the IMDb rating columns to the merged film data by imdbId.
df_imdb_film_data_with_ratings = pd.merge(
    left=film_database_data,
    right=imdb_ratings,
    on='imdbId',
    how='left',
)

In [126]:
# Preview the first rows of the film data after the ratings merge.
df_imdb_film_data_with_ratings.head()

Unnamed: 0,movieId,imdbId,tmdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,genre_list,averageRating,numVotes
0,1,114709,862.0,movie,Toy Story,Toy Story,0,1995,81,"Adventure,Animation,Comedy","[Adventure, Animation, Comedy]",8.3,757728.0
1,2,113497,8844.0,movie,Jumanji,Jumanji,0,1995,104,"Adventure,Family,Fantasy","[Adventure, Family, Fantasy]",6.9,259244.0
2,3,113228,15602.0,movie,Grumpier Old Men,Grumpier Old Men,0,1995,101,"Comedy,Romance","[Comedy, Romance]",6.6,21805.0
3,4,114885,31357.0,movie,Waiting to Exhale,Waiting to Exhale,0,1995,124,"Comedy,Drama,Romance","[Comedy, Drama, Romance]",5.8,8431.0
4,5,113041,11862.0,movie,Father of the Bride Part II,Father of the Bride Part II,0,1995,106,"Comedy,Family,Romance","[Comedy, Family, Romance]",6.0,30324.0


## Save to pickle files

In [119]:
# Persist the movies frame (including the movie_year and genre indicator columns).
df_movies.to_pickle("./data/movies.pkl")

In [120]:
# Persist the ratings frame (timestamp column already converted to datetimes).
df_rating.to_pickle("./data/rating.pkl")

In [121]:
# Persist the tags frame (timestamp column already converted to datetimes).
df_tags.to_pickle("./data/tags.pkl")

In [122]:
# Persist the links frame as loaded from links.csv.
df_links.to_pickle("./data/links.pkl")

In [123]:
# Persist the genome scores frame as loaded from genome-scores.csv.
df_genome_scores.to_pickle("./data/genome-scores.pkl")

In [124]:
# Persist the genome tags frame as loaded from genome-tags.csv.
df_genome_tags.to_pickle("./data/genome-tags.pkl")

In [127]:
# Persist the combined links + IMDb title.basics + IMDb ratings frame.
df_imdb_film_data_with_ratings.to_pickle("./data/imdb-data.pkl")