In [1]:
import pandas as pd
import json
import re
import pprint as pp
from pymongo import MongoClient
import seaborn as sns
from pandas.io.json import json_normalize
from sklearn.preprocessing import MultiLabelBinarizer
from sqlalchemy import create_engine
import dask.dataframe as dd

In [2]:
# Shared handles reused by the cells below: the encoder that builds the dummy
# columns, and the MovieRecommender database on the local MongoDB server.
mbl = MultiLabelBinarizer()
client = MongoClient(host='localhost', port=27017)
db = client['MovieRecommender']

In [3]:
# Pull every document of the `keywords` collection into a pandas DataFrame.
# The collection handle is used inline, so no temporary name is left behind.
keywords_df = pd.DataFrame(list(db['keywords'].find({})))

In [13]:
# Pull every document of the `movies_metadata` collection into a pandas DataFrame.
# The collection handle is used inline, so no temporary name is left behind.
metadata_df = pd.DataFrame(list(db['movies_metadata'].find({})))

In [5]:
# Create dummy (0/1 indicator) columns for the keywords.

# Reduce each row's list of keyword dicts to just the list of their 'name' values.
keywords_df['keyword_names_list'] = keywords_df['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries]
)

# Fit the binarizer on the name lists, then encode them: one column per keyword.
# `columns=` is evaluated after the fit, so `mbl.classes_` holds the keyword names.
keyword_dummies = pd.DataFrame(
    mbl.fit(keywords_df['keyword_names_list']).transform(keywords_df['keyword_names_list']),
    columns=mbl.classes_,
)

# Attach the indicator columns row-by-row via the shared index.
keywords_df_long = keywords_df.merge(keyword_dummies, left_index=True, right_index=True)

# The merged frame supersedes both inputs; drop them to release memory.
del keyword_dummies, keywords_df

In [9]:
# Remove the raw keyword columns (now encoded as dummies) and the `_id` column
# in a single in-place drop.
keywords_df_long.drop(columns=['keywords', 'keyword_names_list', '_id'], inplace=True)

In [14]:
# Create dummy (0/1 indicator) columns for the genres.

# Reduce each row's list of genre dicts to just the list of their 'name' values.
metadata_df['genres_list'] = metadata_df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries]
)

# Re-fit the shared binarizer on the genre lists, then encode them: one column per genre.
# `columns=` is evaluated after the fit, so `mbl.classes_` holds the genre names.
genres_binary_df = pd.DataFrame(
    mbl.fit(metadata_df['genres_list']).transform(metadata_df['genres_list']),
    columns=mbl.classes_,
)

# Attach the indicator columns row-by-row via the shared index.
metadata_df_long = metadata_df.merge(genres_binary_df, left_index=True, right_index=True)

# The merged frame supersedes both inputs; drop them to release memory.
del genres_binary_df, metadata_df

In [17]:
# The genre information now lives in the dummy columns; remove the raw versions in place.
metadata_df_long.drop(columns=['genres', 'genres_list'], inplace=True)

In [18]:
# Hand the wide keywords frame over to dask, partitioned with chunksize=8000.
keywords_dd = dd.from_pandas(keywords_df_long, chunksize=8000)

In [19]:
# Remove the pandas name; the cells below work with `keywords_dd` only.
del keywords_df_long

In [20]:
# Hand the wide metadata frame over to dask, partitioned with chunksize=8000.
metadata_dd = dd.from_pandas(metadata_df_long, chunksize=8000)

In [21]:
# Remove the pandas name; the cells below work with `metadata_dd` only.
del metadata_df_long

#### At this point both metadata and keywords are loaded into Dask

In [22]:
# Read the ratings straight from CSV into a dask DataFrame (this table does not come from MongoDB).
ratings_dd = dd.read_csv('../data/the-movies-dataset/ratings.csv')

In [25]:
# Preview the first rows of the ratings table.
ratings_dd.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [26]:
# Preview the first rows of the metadata table.
# NOTE(review): the saved output still lists a `genres` column even though an earlier
# cell drops it before `metadata_dd` is built, so this output appears stale; re-run to refresh.
metadata_dd.head()

Unnamed: 0,_id,_omdb_id,adult,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,5c83f01efc0c4b2eee120d07,862,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,tt0114709,en,Toy Story,...,0,0,0,0,0,0,0,0,0,0
1,5c83f01efc0c4b2eee120d08,8844,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,tt0113497,en,Jumanji,...,0,0,0,0,0,0,0,0,0,0
2,5c83f01efc0c4b2eee120d09,15602,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,tt0113228,en,Grumpier Old Men,...,1,0,0,0,0,0,0,0,0,0
3,5c83f01efc0c4b2eee120d0a,31357,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0114885,en,Waiting to Exhale,...,1,0,0,0,0,0,0,0,0,0
4,5c83f01efc0c4b2eee120d0b,11862,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,tt0113041,en,Father of the Bride Part II,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Preview the first rows of the keywords table.
# NOTE(review): the saved output still lists `_id`, `keywords` and `keyword_names_list`
# even though an earlier cell drops them before `keywords_dd` is built, so this output
# appears stale; re-run to refresh.
keywords_dd.head()

Unnamed: 0,_id,_omdb_id,keywords,keyword_names_list,'comfort women',077,10th century,1500s,15th birthday,15th century,...,부러진 화살,소원,알투비 : 리턴투베이스,오싹한 연애,위험한 소문,찌라시,찌라시 : 위험한 소문,카운트다운,하울링,형사 duelist
0,5c8300fdfc0c4b2eee10930d,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[jealousy, toy, boy, friendship, friends, riva...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5c8300fdfc0c4b2eee10930e,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[board game, disappearance, based on children'...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5c8300fdfc0c4b2eee10930f,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[fishing, best friend, duringcreditsstinger, o...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5c8300fdfc0c4b2eee109310,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[based on novel, interracial relationship, sin...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5c8300fdfc0c4b2eee109311,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[baby, midlife crisis, confidence, aging, daug...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Give the ratings table the same movie key (`_omdb_id`) as metadata and keywords.
#
# ratings.csv identifies movies by their MovieLens `movieId`, whereas `_omdb_id`
# in the metadata/keywords tables is the TMDB id (e.g. 862 for Toy Story).
# Renaming `movieId` alone would join ratings to unrelated movies, so the ids are
# first translated through links.csv, which ships with the dataset and maps
# `movieId` -> `tmdbId`.
links_dd = dd.read_csv(
    '../data/the-movies-dataset/links.csv',
    usecols=['movieId', 'tmdbId'],
    # Some movies have no TMDB id, so the column cannot be parsed as an integer.
    dtype={'tmdbId': 'float64'},
).dropna(subset=['tmdbId'])
links_dd['tmdbId'] = links_dd['tmdbId'].astype('int64')

# Ratings for movies without a TMDB id are dropped by the inner join; they could
# never match a metadata row anyway.
ratings_df = (
    ratings_dd
    .merge(links_dd, on='movieId', how='inner')
    .drop('movieId', axis=1)
    .rename(columns={'tmdbId': '_omdb_id'})
)
print(type(ratings_df))

# The three tables are sampled and merged in the cells that follow.

<class 'dask.dataframe.core.DataFrame'>


In [24]:
# Get a sample from each Table
ratings_sample = ratings_df.get_partition(1)
metadata_sample = metadata_dd.get_partition(1)
keywords_sample = keywords_dd.get_partition(1)

In [25]:
# Make the join key an int64 so it matches the key dtype of the other tables.
ratings_sample = ratings_sample.assign(_omdb_id=ratings_sample['_omdb_id'].astype('int64'))

In [26]:
# Make the join key an int64 so it matches the key dtype of the ratings sample.
metadata_sample = metadata_sample.assign(_omdb_id=metadata_sample['_omdb_id'].astype('int64'))

In [27]:
# Keep only ratings whose movie also appears in the metadata sample (inner join on the movie key).
ratings_and_meta_samples = ratings_sample.merge(metadata_sample, on='_omdb_id', how='inner')

In [28]:
# Add the keyword dummy columns for the same movies (inner join on the movie key).
# NOTE(review): the resulting column list contains `adult_x` and `homepage_x`, which suggests
# some keyword dummy columns share a name with metadata columns and get the default
# `_x`/`_y` suffixes — confirm, and consider prefixing the keyword columns upstream.
rating_meta_keyword_sample = ratings_and_meta_samples.merge(keywords_sample, on='_omdb_id', how='inner')

In [29]:
# Shape of the merged sample. As the output below shows, only the column count is a
# concrete number here; the row count is a lazy Delayed object that has not been computed.
shape = rating_meta_keyword_sample.shape

In [30]:
# Display the (lazy row count, column count) pair.
shape

(Delayed('int-a663e199-6697-4433-aa32-9ec763c99c8f'), 20015)

In [36]:
# List the merged column labels to check which id columns and suffixes the joins produced.
rating_meta_keyword_sample.columns

Index(['userId', '_omdb_id', 'rating', 'timestamp', '_id', 'adult_x',
       'belongs_to_collection', 'budget', 'homepage_x', 'imdb_id',
       ...
       '부러진 화살', '소원', '알투비 : 리턴투베이스', '오싹한 연애', '위험한 소문', '찌라시',
       '찌라시 : 위험한 소문', '카운트다운', '하울링', '형사 duelist'],
      dtype='object', length=20015)

In [38]:
# Drop the `_id` column from the merged sample. The column listing above shows it as
# plain `_id` (no `_x` suffix), so that is the label to remove.
rating_meta_keyword_sample_dropped = rating_meta_keyword_sample.drop('_id', axis=1)


In [None]:
# Persist the merged sample as Parquet.
# NOTE(review): this cell has no execution count in the saved notebook, so the write
# appears not to have been run — confirm the target directory exists before relying on it.
rating_meta_keyword_sample_dropped.to_parquet('../data/parquets/ratings_metadata_keywords_sample.pq')

In [None]:
# Persist the merged sample as CSV.
# NOTE(review): confirm how the installed dask version treats a path without a `*`
# placeholder — it may create a directory of part files rather than a single CSV file.
rating_meta_keyword_sample_dropped.to_csv('../data/ratings_metadata_keywords_sample.csv')