In [14]:
import pandas as pd
import seaborn as sns
import json
import numpy as np
import os
from datetime import datetime

# Use seaborn's 'whitegrid' style for any plots drawn in this notebook.
sns.set(style='whitegrid')
# Load the movie data. Judging by the file name this is an already-cleaned
# TMDB 5000 export produced by an earlier step -- TODO confirm.
movies_df = pd.read_csv('output/tmdb_5000_cleaned.csv')

In [2]:
# Extract the identifiers from a JSON-encoded cell before computing similarity; returns a list.
def extract_json(df, label, row):
    """Return the identifiers stored in the JSON-encoded cell ``df.loc[row, label]``.

    The cell is expected to hold a JSON array of objects. For most columns
    the value under ``'id'`` is collected from each object; for
    ``'production_countries'`` and ``'spoken_languages'`` the ISO code
    (``'iso_3166_1'`` / ``'iso_639_1'``) is collected instead, because those
    columns are not identified by numeric ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the JSON-encoded column.
    label : str
        Name of the column to read.
    row : index label
        Row label, looked up with ``df.loc``.

    Returns
    -------
    list
        One identifier per object in the JSON array, in order. An empty list
        when the cell is missing (``None`` or NaN).

    Raises
    ------
    ValueError
        If the cell is a string that is not valid JSON (raised by ``json.loads``).
    KeyError
        If an object in the array lacks the expected identifier key.
    """
    cell = df.loc[row, label]
    # pandas represents missing values as NaN rather than None, so a plain
    # ``!= None`` test lets NaN through to json.loads, which then raises
    # TypeError. Strings are checked first so pd.isna only sees scalars.
    if cell is None or (not isinstance(cell, str) and pd.isna(cell)):
        return []

    # Columns whose objects are identified by an ISO code instead of 'id'.
    key_by_label = {
        'production_countries': 'iso_3166_1',
        'spoken_languages': 'iso_639_1',
    }
    key = key_by_label.get(label, 'id')
    return [item[key] for item in json.loads(cell)]

##### Discard the text columns (title, original_title, original_language, overview)

In [17]:
# Drop the free-text columns, then keep only the first 100 rows as a small
# test sample.
movies_df = (
    movies_df
    .drop(columns=['title', 'original_title', 'original_language', 'overview'])
    .head(100)
)

##### 4 types of data in this dataset:
1. ID
2. Numeric: col = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
3. Json: col = ['cast', 'crew', 'genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']
4. Date: col = ['release_date']
##### How to compute the similarity?
1. Numeric: by the absolute difference between the two values
2. JSON: by counting the number of elements the two cells have in common
3. Release date: by subtracting the dates (difference in days)

After computing these raw values, each one needs to be normalized.

In [4]:
# Feature columns grouped by the kind of data they hold.
num_col = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
json_col = ['cast', 'crew', 'genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']
date_col = ['release_date']

# {column name: type tag}, in the order numeric -> json -> date.
columns = {
    **dict.fromkeys(num_col, 'numeric'),
    **dict.fromkeys(json_col, 'json'),
    **dict.fromkeys(date_col, 'date'),
}

# Show the column names that will take part in the similarity computation.
for col in columns:
    print(col)

budget
popularity
revenue
runtime
vote_average
vote_count
cast
crew
genres
keywords
production_companies
production_countries
spoken_languages
release_date


##### How to normalize the similarity?

In [5]:
# Build the similarity matrix.
# The similarity of each pair is normalized as 1 - |A - B| / range(col) for
# numeric and date columns, or len(A intersection B) / range(len(cell)) for
# JSON columns.
# {column name: type of column}
columns = {
    'budget': 'numeric',
    'popularity': 'numeric',
    'revenue': 'numeric',
    'runtime': 'numeric',
    'vote_average': 'numeric',
    'vote_count': 'numeric',
    'cast': 'json',
    'crew': 'json',
    'genres': 'json',
    'keywords': 'json',
    'production_companies': 'json',
    'production_countries': 'json',
    'spoken_languages': 'json',
    'release_date': 'date',
}

# Square movies-by-movies matrix, pre-filled with None.
n_movies = len(movies_df)
simatrix = np.full((n_movies, n_movies), None)

# Normalization base (range) for each column: {column: denom}
denom = {}
for col, tag in columns.items():
    if tag == 'numeric':
        # Range of the values in the column.
        denom[col] = movies_df[col].max() - movies_df[col].min()
    elif tag == 'json':
        # Range of the number of ids held by each cell of the column.
        json_len = [len(extract_json(movies_df, col, i)) for i in range(n_movies)]
        denom[col] = max(json_len) - min(json_len)
    elif tag == 'date':
        # Range of the dates, in days.
        max_date = datetime.strptime(movies_df[col].max(), '%Y-%m-%d').date()
        min_date = datetime.strptime(movies_df[col].min(), '%Y-%m-%d').date()
        denom[col] = (max_date - min_date).days
denom

{'budget': 365000000,
 'popularity': 721.829249,
 'revenue': 2787965087,
 'runtime': 103.0,
 'vote_average': 3.3999999999999995,
 'vote_count': 13718,
 'cast': 149,
 'crew': 429,
 'genres': 5,
 'keywords': 31,
 'production_companies': 10,
 'production_countries': 3,
 'spoken_languages': 8,
 'release_date': 7676}

In [6]:
# Compute the normalized similarity of two rows for a single feature column.
def comp_similarity(df, row_i, row_j, col):
    """Return the normalized similarity between two rows of ``df`` for ``col``.

    The computation depends on the column's type tag in the module-level
    ``columns`` mapping and is scaled by the column's entry in the
    module-level ``denom`` mapping:

    * ``'numeric'``: ``1 - |a - b| / denom[col]``
    * ``'json'``: ``len(ids_a & ids_b) / denom[col]``, where the ids come
      from ``extract_json``
    * ``'date'``: ``1 - |a - b in days| / denom[col]``, with dates parsed
      from ``'%Y-%m-%d'`` strings

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read both rows from.
    row_i, row_j : index labels
        Labels of the two rows, looked up with ``df.loc``.
    col : str
        Column name; must be a key of ``columns`` and ``denom``.

    Returns
    -------
    float or None
        The normalized similarity, or ``None`` when the column's tag is not
        one of the three handled kinds.
    """
    tag = columns[col]
    span = denom[col]

    # for numeric columns
    if tag == 'numeric':
        # denom holds max - min of the column, so a zero range means all
        # values are equal: report full similarity instead of dividing by zero.
        if span == 0:
            return 1.0
        # Read from the frame that was passed in, not from a global.
        num_i = df.loc[row_i, col]
        num_j = df.loc[row_j, col]
        return 1 - abs(num_i - num_j) / span

    # for json columns
    if tag == 'json':
        ids_i = set(extract_json(df, col, row_i))
        ids_j = set(extract_json(df, col, row_j))
        # NOTE(review): the scale is the range of cell lengths, so the result
        # is not guaranteed to stay within [0, 1], and a zero range still
        # raises ZeroDivisionError -- confirm the intended normalization.
        return len(ids_i & ids_j) / span

    # for date columns
    if tag == 'date':
        # Same reasoning as for numeric columns: zero range, identical dates.
        if span == 0:
            return 1.0
        date_i = datetime.strptime(df.loc[row_i, col], '%Y-%m-%d').date()
        date_j = datetime.strptime(df.loc[row_j, col], '%Y-%m-%d').date()
        return 1 - abs((date_i - date_j).days) / span
In [25]:
# Fill the upper triangle of the similarity matrix: one entry per unordered
# pair of rows (i < j). The diagonal and lower triangle keep their initial value.
n_movies = len(movies_df)
for i in range(n_movies - 1):
    for j in range(i + 1, n_movies):
        # Combine the per-feature normalized similarities by plain addition
        # (more elaborate combinations could be considered later).
        simatrix[i, j] = sum(
            comp_similarity(movies_df, i, j, col) for col in columns
        )

# Dump the matrix into a file.
pd.DataFrame(simatrix).to_csv("output/simatrix100.csv")
