In [25]:
import os 
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support

In [26]:
data_path = os.path.join(os.getcwd(),'data')

## Load original data 

In [97]:
# import data
data = pd.read_json(os.path.join(data_path, 'supreme_movie_dataset.json'))

## - Or load clean_data

In [70]:
clean_data = pd.read_json(os.path.join(data_path, 'clean_data.json'))

## - Or load clean binarized data

In [148]:
clean_data = pd.read_json(os.path.join(data_path, 'clean_data_binarized.json'))

## Ananlysing Original Data

In [98]:
# quick look at the movies dataset
data.head()

Unnamed: 0,imdb.actors,imdb.budget,imdb.budgetCurrency,imdb.color,imdb.country,imdb.coverLink,imdb.directors,imdb.genres,imdb.imdbLink,imdb.originalLanguage,...,tmdb.reviews,tmdb.runtime,tmdb.similar,tmdb.spoken_languages,tmdb.status,tmdb.tagline,tmdb.title,tmdb.video,tmdb.vote_average,tmdb.vote_count
0,"[Stars: Peter Scanavino, Jason Robards III, An...",0,,Color,USA,http://ia.media-imdb.com/images/M/MV5BMjI3NTE5...,[Vladan Nikolic],"[Sci-Fi, Thriller]",http://www.imdb.com/title/tt1363468/,English,...,[],93.0,"[17654, 51828, 41630, 200727, 38, 18239, 388, ...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,The Film They Don't Want You To See,Zenith,0.0,5.1,9.0
1,"[Mark Albrow, Stefan Alt, Chris Carter]",0,,Color,France,,"[Travis Collins, Amélie Ravalec]","[Documentary, History, Music]",http://www.imdb.com/title/tt3606298/,English,...,[],52.0,"[450794, 20575, 786, 525, 468775, 468769, 1584...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,,Industrial Soundtrack for the Urban Decay,0.0,7.7,3.0
10,"[Crispin Glover, Allisyn Ashley Arm, Terry Moore]",0,,Color,USA,,[Hooroo Jackson],"[Drama, Fantasy, Sci-Fi, Thriller]",http://www.imdb.com/title/tt3550078/,English,...,[],75.0,"[376290, 15472, 293768, 297762, 390582, 70, 50...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,,Aimy in a Cage,0.0,5.1,9.0
100,"[Stars: Grete Havnesköld, Linn Gloppestad, Mar...",0,,Color,Sweden,,[Johanna Hald],[Family],http://www.imdb.com/title/tt0107444/,Swedish,...,[],79.0,"[394117, 49519, 93456, 258480, 17130, 5123, 39...","[{'name': 'svenska', 'iso_639_1': 'sv'}]",Released,,Lotta Leaves Home,0.0,5.8,5.0
1000,"[Dario Yazbek Bernal, Marimar Vega, José María...",0,,Color,Mexico,https://ia.media-imdb.com/images/M/MV5BMjIzMjU...,[Michel Franco],"[Drama, Thriller]",http://www.imdb.com/title/tt1392888/,Spanish,...,[],90.0,"[8067, 4771, 984, 2149, 11826, 80752, 562, 284...","[{'name': 'English', 'iso_639_1': 'en'}, {'nam...",Released,,Daniel & Ana,0.0,5.9,11.0


#### All columns

In [99]:
list(data)

['imdb.actors',
 'imdb.budget',
 'imdb.budgetCurrency',
 'imdb.color',
 'imdb.country',
 'imdb.coverLink',
 'imdb.directors',
 'imdb.genres',
 'imdb.imdbLink',
 'imdb.originalLanguage',
 'imdb.productionCompanies',
 'imdb.reviews',
 'imdb.runtime',
 'imdb.summaries',
 'imdb.synopsis',
 'imdb.writers',
 'movielens.actors',
 'movielens.avgRating',
 'movielens.directors',
 'movielens.dvdReleaseDate',
 'movielens.genres',
 'movielens.imdbMovieId',
 'movielens.languages',
 'movielens.movieId',
 'movielens.mpaa',
 'movielens.numRatings',
 'movielens.originalTitle',
 'movielens.plotSummary',
 'movielens.posterPath',
 'movielens.releaseDate',
 'movielens.releaseYear',
 'movielens.runtime',
 'movielens.title',
 'movielens.tmdbMovieId',
 'movielens.youtubeTrailerIds',
 'movielensId',
 'tmdb.adult',
 'tmdb.backdrop_path',
 'tmdb.belongs_to_collection',
 'tmdb.belongs_to_collection.backdrop_path',
 'tmdb.belongs_to_collection.id',
 'tmdb.belongs_to_collection.name',
 'tmdb.belongs_to_collection.po

#### Drop unimportant columns

In [100]:
clean_data = data.drop(['imdb.budget', 'imdb.budgetCurrency', 'imdb.color', 'imdb.coverLink',
                        'imdb.imdbLink', 'imdb.reviews','movielens.imdbMovieId', 'imdb.synopsis',
                        'movielens.posterPath', 'tmdb.reviews', 'tmdb.status', 'tmdb.tagline', 
                        'tmdb.video', 'tmdb.backdrop_path', 'tmdb.belongs_to_collection', 
                        'tmdb.belongs_to_collection.backdrop_path','tmdb.belongs_to_collection.id', 
                        'tmdb.belongs_to_collection.name','tmdb.belongs_to_collection.poster_path', 
                        'movielens.dvdReleaseDate', 'movielens.mpaa','tmdb.poster_path', 'tmdb.homepage', 
                        'tmdb.revenue', 'tmdb.runtime', 'imdb.directors', 'movielens.releaseDate',
                        'movielens.runtime', 'movielens.originalTitle','imdb.originalLanguage',  
                        'imdb.summaries', 'tmdb.genres', 'tmdb.credits.crew','tmdb.credits.cast',
                        'tmdb.overview','movielens.youtubeTrailerIds','imdb.country','movielens.genres',
                        'movielens.plotSummary','tmdb.production_companies','tmdb.release_date',
                        'movielens.actors',
                        
                       ], axis=1)

#### Remove rows 403 rows with missing tmdb values

In [101]:
clean_data = clean_data[clean_data['tmdb.adult'].notnull()]

#### Store data in new json file

In [102]:
clean_data.to_json(os.path.join(data_path, 'clean_data.json'))

#### Number of remaining rows 

In [149]:
clean_data.shape[0]

24151

#### Count null fields for each column (Now there are no null fields anymore)

In [150]:
null_columns=clean_data.columns[clean_data.isnull().any()]
clean_data[null_columns].isnull().sum()

Series([], dtype: float64)

#### Remaining features

In [151]:
list(clean_data)

['imdb.actors',
 'imdb.genres',
 'imdb.productionCompanies',
 'imdb.runtime',
 'imdb.writers',
 'movielens.avgRating',
 'movielens.directors',
 'movielens.languages',
 'movielens.movieId',
 'movielens.numRatings',
 'movielens.releaseYear',
 'movielens.title',
 'movielens.tmdbMovieId',
 'movielensId',
 'tmdb.adult',
 'tmdb.budget',
 'tmdb.id',
 'tmdb.imdb_id',
 'tmdb.keywords',
 'tmdb.original_language',
 'tmdb.original_title',
 'tmdb.popularity',
 'tmdb.production_countries',
 'tmdb.recommendations',
 'tmdb.similar',
 'tmdb.spoken_languages',
 'tmdb.title',
 'tmdb.vote_average',
 'tmdb.vote_count',
 'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 'Adam Sandler',
 'Alexandra Daddario',
 'Amber Heard',
 'Amy Adams',
 'Anna

#### Binarize nominal values

In [106]:
def genre_one_hot(movie_dataset, column): 
    'Returns the given movie dataset with a transformed to one-hot encoded genre column.'

    # MultiLabelBinarizer is a very fast solution for one hot encoding on large dataframes
    mlb = MultiLabelBinarizer()
    one_hot_genre = pd.DataFrame(mlb.fit_transform(movie_dataset[column]),
                                 columns=mlb.classes_, 
                                 index=movie_dataset.index)

    movie_dataset = pd.concat([movie_dataset, one_hot_genre], sort=False, axis=1)
    movie_dataset.drop(column, axis=1)
    
    return movie_dataset

#### Drop unpopoluar columns

In [107]:
def drop_unpopular_columns(movies, start_index, popular_file, top):   
    popular_list = pd.read_json(os.path.join(data_path, popular_file))
    popular_list = popular_list.head(top)
    
    unpop_columns = movies.drop(movies.columns[:(1+start_index)], axis=1)
    
    #drop popular columns 
    for k, v in popular_list.iterrows():
        if v['name'] in movies.columns: 
            unpop_columns.drop([v['name']], axis=1, inplace=True, errors="ignore")
    
    return movies.drop(unpop_columns, axis=1)

#### Start with binarization and removing unpopular columns

In [108]:
#reset index
clean_data = clean_data.reset_index(drop=True)
print('dimension of data: ', clean_data.shape[1])

print('binarizing imdb.genres...')
clean_data = genre_one_hot(clean_data, 'imdb.genres')
print('new dimension of data: ', clean_data.shape[1])

print('binarizing imdb.actors...')
clean_data = genre_one_hot(clean_data, 'imdb.actors')
print('new dimension of data: ', clean_data.shape[1])

print('droping unpopular actors...')
clean_data = drop_unpopular_columns(clean_data, clean_data.columns.get_loc('Western'), 'popular_actors.json', top=100)
print('new dimension of data: ', clean_data.shape[1])

print('binarizing imdb.directors')
clean_data = genre_one_hot(clean_data, 'movielens.directors')
print('new dimension of data: ', clean_data.shape[1])

print('droping unpopular directors...')
clean_data = drop_unpopular_columns(clean_data, clean_data.columns.get_loc('Zoe Saldana'), 'top_directors.json', top=10)
print('new dimension of data: ', clean_data.shape[1])

print('binarizing movielens.languages')
clean_data = genre_one_hot(clean_data, 'movielens.languages')
print('new dimension of data: ', clean_data.shape[1])

dimension of data:  29
binarizing imdb.genres...
new dimension of data:  56
binarizing imdb.actors...
new dimension of data:  37025
droping unpopular actors...
new dimension of data:  145
binarizing imdb.directors
new dimension of data:  15108
droping unpopular directors...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


new dimension of data:  165
binarizing movielens.languages
new dimension of data:  236


In [109]:
clean_data.head()

Unnamed: 0,imdb.actors,imdb.genres,imdb.productionCompanies,imdb.runtime,imdb.writers,movielens.avgRating,movielens.directors,movielens.languages,movielens.movieId,movielens.numRatings,...,বাংলা,ਪੰਜਾਬੀ,தமிழ்,తెలుగు,ภาษาไทย,ქართული,广州话 / 廣州話,日本語,普通话,한국어/조선말
0,"[Stars: Peter Scanavino, Jason Robards III, An...","[Sci-Fi, Thriller]","[Surla Films, Solo Films]",93,[Vladan Nikolic],2.5,[Vladan Nikolić],[English],127292,3,...,0,0,0,0,0,0,0,0,0,0
1,"[Mark Albrow, Stefan Alt, Chris Carter]","[Documentary, History, Music]","[Amelie Ravalec, Les Films du Garage]",52,[Amélie Ravalec],4.0,"[Travis Collins, Amélie Ravalec]",[English],141044,1,...,0,0,0,0,0,0,0,0,0,0
2,"[Crispin Glover, Allisyn Ashley Arm, Terry Moore]","[Drama, Fantasy, Sci-Fi, Thriller]",[Ankaboot Productions],79,"[Hooroo Jackson, Hooroo Jackson]",1.58333,[Hooroo Jackson],[English],150655,6,...,0,0,0,0,0,0,0,0,0,0
3,"[Stars: Grete Havnesköld, Linn Gloppestad, Mar...",[Family],"[Svensk Filmindustri (SF), Astrid Lindgrens Vä...",83,"[Astrid Lindgren, Johanna Hald]",3.25,[Johanna Hald],[svenska],126745,2,...,0,0,0,0,0,0,0,0,0,0
4,"[Dario Yazbek Bernal, Marimar Vega, José María...","[Drama, Thriller]","[Alameda Films, Blu Films, Fidecine]",90,[Michel Franco],3.33333,[Michel Franco],"[English, Español]",152717,6,...,0,0,0,0,0,0,0,0,0,0


#### Store binarized data in csv file

In [146]:
cols=pd.Series(clean_data.columns)

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
clean_data.columns=cols

clean_data.to_json(os.path.join(data_path, 'clean_data_binarized.json'))

## Estimate missing values with SVC Classifier (Not used for dataset just for trying)

In [40]:
#get rows where the given attribute is null
no_original_language = clean_data[clean_data['tmdb.original_language'].isnull()]
has_original_language = clean_data[clean_data['tmdb.original_language'].notnull()]

#remove rows with missing country value
has_original_language = has_original_language[has_original_language['tmdb.production_countries'].notnull()]

train_data = has_original_language[has_original_language.columns[40:]]
train_label = has_original_language['tmdb.original_language']

In [35]:
print(train_data.shape[0])
print(train_label.shape[0])

24151
24151


In [42]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.33, random_state=42)

clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [83]:
y_pred = clf.predict(X_test)
res = pd.DataFrame({'actual':y_test, 'predicted':y_predict.flatten()})

In [85]:
precision_recall_fscore_support(y_test, y_pred, average='weighted')

(0.8660421890417742, 0.8744040150564617, 0.8690980590292386, None)

#### Choose features

In [190]:
samples = data[['movielens.movieId', 'movielens.numRatings']]

#### Create Model

In [191]:
neigh = NearestNeighbors(10)
neigh.fit(samples)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [197]:
nearest_neighbors = neigh.kneighbors([[1, 2]], 10, return_distance=False)
nearest_neighbors

array([[20881,  4002, 21560,  3096,  8948, 17210, 19818,  3470, 17089,
        24168]], dtype=int64)