In [253]:
import pandas as pd
import numpy as np

In [254]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
links_df = pd.read_csv('ml-latest-small/links.csv', dtype={'imdbId': str, 'tmdbId': str})
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,0114709,862
1,2,0113497,8844
2,3,0113228,15602
3,4,0114885,31357
4,5,0113041,11862
...,...,...,...
9737,193581,5476944,432131
9738,193583,5914996,445030
9739,193585,6397426,479308
9740,193587,8391976,483455


In [255]:
ratings_df = pd.read_csv("ml-latest-small/ratings.csv", sep=",")
ratings_df.columns=["UID","OID","rating","timestamp"]
ratings_df

Unnamed: 0,UID,OID,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [256]:
def munge_title(title):
    i = title.rfind(' (')
    if i != -1:
        title = title[:i]
    for suff_word in ['The', 'A', 'An']:
        suffix = ', {}'.format(suff_word)
        if title.endswith(suffix):
            title = suff_word + ' ' + title[:-len(suffix)]
    return title

def get_unique_genres(movies_df):
    genreList = []
    for index, row in movies_df.iterrows():
        genres = row["genres"]
        
        for genre in genres:
            if genre not in genreList:
                genreList.append(genre)
    return pd.Series(genreList)
  
def get_links(movies_df: pd.DataFrame, links_df):
    movies_df = movies_df.merge(links_df, on='movieId')
    return movies_df["tmdbId"]

In [257]:
movies_df.movieId = movies_df.movieId.astype(int)

movies_df["ratingCount"] = ratings_df.groupby("OID")["UID"].count()
movies_df["ratingCount"] = movies_df["ratingCount"].fillna(0)
movies_df["ratingCount"] = movies_df["ratingCount"].astype(int)

movies_df["year"] = movies_df.title.str.extract(r'\(([0-9]+)\)')
movies_df["year"] = movies_df.year.fillna(0)
movies_df["year"] = movies_df.year.astype(int)

movies_df["genres"] = movies_df["genres"].apply(lambda x: x.lower() if x != "(no genres listed)" else "unknown").str.split("|")

movies_df['title'] = movies_df['title'].map(munge_title)

movies_df["tmdbId"] = get_links(movies_df, links_df)

movies_df

Unnamed: 0,movieId,title,genres,ratingCount,year,tmdbId
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]",0,1995,862
1,2,Jumanji,"[adventure, children, fantasy]",215,1995,8844
2,3,Grumpier Old Men,"[comedy, romance]",110,1995,15602
3,4,Waiting to Exhale,"[comedy, drama, romance]",52,1995,31357
4,5,Father of the Bride Part II,[comedy],7,1995,11862
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[action, animation, comedy, fantasy]",0,2017,432131
9738,193583,No Game No Life: Zero,"[animation, comedy, fantasy]",0,2017,445030
9739,193585,Flint,[drama],0,2017,479308
9740,193587,Bungo Stray Dogs: Dead Apple,"[action, animation]",0,2018,483455


In [258]:
unique_genres = get_unique_genres(movies_df)
unique_genres

0       adventure
1       animation
2        children
3          comedy
4         fantasy
5         romance
6           drama
7          action
8           crime
9        thriller
10         horror
11        mystery
12         sci-fi
13            war
14        musical
15    documentary
16           imax
17        western
18      film-noir
19        unknown
dtype: object

In [259]:
movies_df.to_json("movies.json", orient="records")
unique_genres.to_json("genres.json", orient="records")