In [8]:
import os 
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [5]:
data_path = os.path.join(os.getcwd(),'data')

## Load original data 

In [9]:
# import data
data = pd.read_json(os.path.join(data_path, 'supreme_movie_dataset.json'))

## - Or load clean_data

In [70]:
clean_data = pd.read_json(os.path.join(data_path, 'clean_data.json'))

In [22]:
data.sort_values(by=['tmdb.popularity'], ascending=False).head(100)

Unnamed: 0,imdb.actors,imdb.budget,imdb.budgetCurrency,imdb.color,imdb.country,imdb.coverLink,imdb.directors,imdb.genres,imdb.imdbLink,imdb.originalLanguage,...,tmdb.runtime,tmdb.similar,tmdb.spoken_languages,tmdb.status,tmdb.tagline,tmdb.title,tmdb.video,tmdb.vote_average,tmdb.vote_count,rank
1083,"[Ginnifer Goodwin, Jason Bateman, Idris Elba]",150000000,$,Color,USA,https://ia.media-imdb.com/images/M/MV5BOTMyMjE...,"[Byron Howard, Rich Moore]","[Animation, Adventure, Comedy, Crime, Family, ...",http://www.imdb.com/title/tt2948356/,English,...,108.0,"[101299, 4771, 9654, 13310, 14869, 101, 137094...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,Welcome to the urban jungle.,Zootopia,0.0,7.7,6798.0,1.0
9897,"[Chris Hemsworth, Tom Hiddleston, Cate Blanchett]",180000000,$,Color,USA,https://ia.media-imdb.com/images/M/MV5BMjMyNDk...,[Taika Waititi],"[Action, Adventure, Comedy, Fantasy, Sci-Fi]",http://www.imdb.com/title/tt3501632/,English,...,130.0,"[1924, 1452, 99861, 141052, 8536, 9531, 40662,...","[{'name': 'English', 'iso_639_1': 'en'}, {'nam...",Released,No Hammer. No Problem.,Thor: Ragnarok,0.0,7.4,5351.0,1.0
20920,"[Emma Watson, Dan Stevens, Luke Evans]",160000000,$,Color,USA,https://ia.media-imdb.com/images/M/MV5BMTUwNjU...,[Bill Condon],"[Family, Fantasy, Musical, Romance]",http://www.imdb.com/title/tt2771200/,English,...,129.0,"[10882, 197796, 62213, 808, 812, 224141, 38757...","[{'name': 'Bahasa indonesia', 'iso_639_1': 'id'}]",Released,Be our guest.,Beauty and the Beast,0.0,6.8,7861.0,1.0
20104,"[Stars: Dylan O'Brien, Kaya Scodelario, Will P...",34000000,$,Color,USA,http://ia.media-imdb.com/images/M/MV5BMjUyNTA3...,[Wes Ball],"[Action, Mystery, Sci-Fi, Thriller]",http://www.imdb.com/title/tt1790864/,English,...,113.0,"[157350, 101299, 294254, 3509, 10803, 2048, 76...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,Remember. Survive. Run.,The Maze Runner,0.0,7.0,7507.0,1.0
21935,"[Ben Affleck, Gal Gadot, Jason Momoa]",300000000,$,Color,USA,https://ia.media-imdb.com/images/M/MV5BYWVhZjZ...,[Zack Snyder],"[Action, Adventure, Fantasy, Sci-Fi]",http://www.imdb.com/title/tt0974015/,English,...,120.0,"[1452, 364, 1924, 8536, 9531, 13640, 155, 414,...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,,Justice League,0.0,6.4,3973.0,1.0
22414,"[Stars: Chris Pratt, Vin Diesel, Bradley Coope...",170000000,$,Color,USA,http://ia.media-imdb.com/images/M/MV5BMTAwMjU5...,[James Gunn],"[Action, Adventure, Sci-Fi]",http://www.imdb.com/title/tt2015381/,English,...,121.0,"[283995, 299534, 299536, 10138, 10195, 1724, 1...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,All heroes start somewhere.,Guardians of the Galaxy,0.0,7.9,12383.0,1.0
1170,"[Jacob Latimore, Seychelle Gabriel, Storm Reid]",250000,$,Color,USA,https://ia.media-imdb.com/images/M/MV5BODRjMDR...,[J.D. Dillard],"[Action, Drama, Sci-Fi, Thriller]",http://www.imdb.com/title/tt4573516/,English,...,89.0,"[275, 562, 2163, 193893, 949, 2034, 281, 3580,...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,You can change the cards you're dealt,Sleight,0.0,5.2,291.0,1.0
16646,"[Stars: Keanu Reeves, Michael Nyqvist, Alfie A...",20000000,$,Color,USA,http://ia.media-imdb.com/images/M/MV5BMTU2NjA1...,"[Chad Stahelski, David Leitch]","[Action, Thriller]",http://www.imdb.com/title/tt2911666/,English,...,101.0,"[357096, 10651, 291356, 39918, 979, 281, 15602...","[{'name': 'Magyar', 'iso_639_1': 'hu'}, {'name...",Released,Don't set him off.,John Wick,0.0,7.0,7176.0,1.0
10548,"[Stars: Jennifer Lawrence, Josh Hutcherson, Li...",125000000,$,Color,USA,http://ia.media-imdb.com/images/M/MV5BMTcxNDI2...,[Francis Lawrence],"[Adventure, Sci-Fi, Thriller]",http://www.imdb.com/title/tt1951265/,English,...,123.0,"[70160, 101299, 12244, 281, 76341, 198663, 294...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,Fire burns brighter in the darkness,The Hunger Games: Mockingjay - Part 1,0.0,6.7,7309.0,1.0
18766,"[Anna Kendrick, Rebel Wilson, Brittany Snow]",45000000,$,Color,USA,https://ia.media-imdb.com/images/M/MV5BMTU5NDI...,[Trish Sie],"[Comedy, Music]",http://www.imdb.com/title/tt4765284/,English,...,93.0,"[254470, 353460, 109048, 360239, 114150, 703, ...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,Last Call Pitches,Pitch Perfect 3,0.0,6.4,727.0,1.0


## - Or load clean binarized data

In [7]:
clean_data = pd.read_json(os.path.join(data_path, 'clean_data_binarized.json'))

ValueError: Expected object or value

## Ananlysing Original Data

In [98]:
# quick look at the movies dataset
data.head()

Unnamed: 0,imdb.actors,imdb.budget,imdb.budgetCurrency,imdb.color,imdb.country,imdb.coverLink,imdb.directors,imdb.genres,imdb.imdbLink,imdb.originalLanguage,...,tmdb.reviews,tmdb.runtime,tmdb.similar,tmdb.spoken_languages,tmdb.status,tmdb.tagline,tmdb.title,tmdb.video,tmdb.vote_average,tmdb.vote_count
0,"[Stars: Peter Scanavino, Jason Robards III, An...",0,,Color,USA,http://ia.media-imdb.com/images/M/MV5BMjI3NTE5...,[Vladan Nikolic],"[Sci-Fi, Thriller]",http://www.imdb.com/title/tt1363468/,English,...,[],93.0,"[17654, 51828, 41630, 200727, 38, 18239, 388, ...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,The Film They Don't Want You To See,Zenith,0.0,5.1,9.0
1,"[Mark Albrow, Stefan Alt, Chris Carter]",0,,Color,France,,"[Travis Collins, Amélie Ravalec]","[Documentary, History, Music]",http://www.imdb.com/title/tt3606298/,English,...,[],52.0,"[450794, 20575, 786, 525, 468775, 468769, 1584...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,,Industrial Soundtrack for the Urban Decay,0.0,7.7,3.0
10,"[Crispin Glover, Allisyn Ashley Arm, Terry Moore]",0,,Color,USA,,[Hooroo Jackson],"[Drama, Fantasy, Sci-Fi, Thriller]",http://www.imdb.com/title/tt3550078/,English,...,[],75.0,"[376290, 15472, 293768, 297762, 390582, 70, 50...","[{'name': 'English', 'iso_639_1': 'en'}]",Released,,Aimy in a Cage,0.0,5.1,9.0
100,"[Stars: Grete Havnesköld, Linn Gloppestad, Mar...",0,,Color,Sweden,,[Johanna Hald],[Family],http://www.imdb.com/title/tt0107444/,Swedish,...,[],79.0,"[394117, 49519, 93456, 258480, 17130, 5123, 39...","[{'name': 'svenska', 'iso_639_1': 'sv'}]",Released,,Lotta Leaves Home,0.0,5.8,5.0
1000,"[Dario Yazbek Bernal, Marimar Vega, José María...",0,,Color,Mexico,https://ia.media-imdb.com/images/M/MV5BMjIzMjU...,[Michel Franco],"[Drama, Thriller]",http://www.imdb.com/title/tt1392888/,Spanish,...,[],90.0,"[8067, 4771, 984, 2149, 11826, 80752, 562, 284...","[{'name': 'English', 'iso_639_1': 'en'}, {'nam...",Released,,Daniel & Ana,0.0,5.9,11.0


#### All columns

In [99]:
list(data)

['imdb.actors',
 'imdb.budget',
 'imdb.budgetCurrency',
 'imdb.color',
 'imdb.country',
 'imdb.coverLink',
 'imdb.directors',
 'imdb.genres',
 'imdb.imdbLink',
 'imdb.originalLanguage',
 'imdb.productionCompanies',
 'imdb.reviews',
 'imdb.runtime',
 'imdb.summaries',
 'imdb.synopsis',
 'imdb.writers',
 'movielens.actors',
 'movielens.avgRating',
 'movielens.directors',
 'movielens.dvdReleaseDate',
 'movielens.genres',
 'movielens.imdbMovieId',
 'movielens.languages',
 'movielens.movieId',
 'movielens.mpaa',
 'movielens.numRatings',
 'movielens.originalTitle',
 'movielens.plotSummary',
 'movielens.posterPath',
 'movielens.releaseDate',
 'movielens.releaseYear',
 'movielens.runtime',
 'movielens.title',
 'movielens.tmdbMovieId',
 'movielens.youtubeTrailerIds',
 'movielensId',
 'tmdb.adult',
 'tmdb.backdrop_path',
 'tmdb.belongs_to_collection',
 'tmdb.belongs_to_collection.backdrop_path',
 'tmdb.belongs_to_collection.id',
 'tmdb.belongs_to_collection.name',
 'tmdb.belongs_to_collection.po

#### Drop unimportant columns

In [100]:
clean_data = data.drop(['imdb.budget', 'imdb.budgetCurrency', 'imdb.color', 'imdb.coverLink',
                        'imdb.imdbLink', 'imdb.reviews','movielens.imdbMovieId', 'imdb.synopsis',
                        'movielens.posterPath', 'tmdb.reviews', 'tmdb.status', 'tmdb.tagline', 
                        'tmdb.video', 'tmdb.backdrop_path', 'tmdb.belongs_to_collection', 
                        'tmdb.belongs_to_collection.backdrop_path','tmdb.belongs_to_collection.id', 
                        'tmdb.belongs_to_collection.name','tmdb.belongs_to_collection.poster_path', 
                        'movielens.dvdReleaseDate', 'movielens.mpaa','tmdb.poster_path', 'tmdb.homepage', 
                        'tmdb.revenue', 'tmdb.runtime', 'imdb.directors', 'movielens.releaseDate',
                        'movielens.runtime', 'movielens.originalTitle','imdb.originalLanguage',  
                        'imdb.summaries', 'tmdb.genres', 'tmdb.credits.crew','tmdb.credits.cast',
                        'tmdb.overview','movielens.youtubeTrailerIds','imdb.country','movielens.genres',
                        'movielens.plotSummary','tmdb.production_companies','tmdb.release_date',
                        'movielens.actors',
                        
                       ], axis=1)

#### Remove rows 403 rows with missing tmdb values

In [101]:
clean_data = clean_data[clean_data['tmdb.adult'].notnull()]

#### Store data in new json file

In [102]:
clean_data.to_json(os.path.join(data_path, 'clean_data.json'))

#### Number of remaining rows 

In [8]:
clean_data.shape

(24151, 236)

#### Count null fields for each column (Now there are no null fields anymore)

In [6]:
null_columns=clean_data.columns[clean_data.isnull().any()]
clean_data[null_columns].isnull().sum()

Series([], dtype: float64)

#### Remaining features

In [7]:
list(clean_data)

['imdb.actors',
 'imdb.genres',
 'imdb.productionCompanies',
 'imdb.runtime',
 'imdb.writers',
 'movielens.avgRating',
 'movielens.directors',
 'movielens.languages',
 'movielens.movieId',
 'movielens.numRatings',
 'movielens.releaseYear',
 'movielens.title',
 'movielens.tmdbMovieId',
 'movielensId',
 'tmdb.adult',
 'tmdb.budget',
 'tmdb.id',
 'tmdb.imdb_id',
 'tmdb.keywords',
 'tmdb.original_language',
 'tmdb.original_title',
 'tmdb.popularity',
 'tmdb.production_countries',
 'tmdb.recommendations',
 'tmdb.similar',
 'tmdb.spoken_languages',
 'tmdb.title',
 'tmdb.vote_average',
 'tmdb.vote_count',
 'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 'Adam Sandler',
 'Alexandra Daddario',
 'Amber Heard',
 'Amy Adams',
 'Anna

#### Binarize nominal values

In [106]:
def genre_one_hot(movie_dataset, column): 
    'Returns the given movie dataset with a transformed to one-hot encoded genre column.'

    # MultiLabelBinarizer is a very fast solution for one hot encoding on large dataframes
    mlb = MultiLabelBinarizer()
    one_hot_genre = pd.DataFrame(mlb.fit_transform(movie_dataset[column]),
                                 columns=mlb.classes_, 
                                 index=movie_dataset.index)

    movie_dataset = pd.concat([movie_dataset, one_hot_genre], sort=False, axis=1)
    movie_dataset.drop(column, axis=1)
    
    return movie_dataset

#### Drop unpopoluar columns

In [107]:
def drop_unpopular_columns(movies, start_index, popular_file, top):   
    popular_list = pd.read_json(os.path.join(data_path, popular_file))
    popular_list = popular_list.head(top)
    
    unpop_columns = movies.drop(movies.columns[:(1+start_index)], axis=1)
    
    #drop popular columns 
    for k, v in popular_list.iterrows():
        if v['name'] in movies.columns: 
            unpop_columns.drop([v['name']], axis=1, inplace=True, errors="ignore")
    
    return movies.drop(unpop_columns, axis=1)

#### Start with binarization and removing unpopular columns

In [108]:
#reset index
clean_data = clean_data.reset_index(drop=True)
print('dimension of data: ', clean_data.shape[1])

print('binarizing imdb.genres...')
clean_data = genre_one_hot(clean_data, 'imdb.genres')
print('new dimension of data: ', clean_data.shape[1])

print('binarizing imdb.actors...')
clean_data = genre_one_hot(clean_data, 'imdb.actors')
print('new dimension of data: ', clean_data.shape[1])

print('droping unpopular actors...')
clean_data = drop_unpopular_columns(clean_data, clean_data.columns.get_loc('Western'), 'popular_actors.json', top=100)
print('new dimension of data: ', clean_data.shape[1])

print('binarizing imdb.directors')
clean_data = genre_one_hot(clean_data, 'movielens.directors')
print('new dimension of data: ', clean_data.shape[1])

print('droping unpopular directors...')
clean_data = drop_unpopular_columns(clean_data, clean_data.columns.get_loc('Zoe Saldana'), 'top_directors.json', top=10)
print('new dimension of data: ', clean_data.shape[1])

print('binarizing movielens.languages')
clean_data = genre_one_hot(clean_data, 'movielens.languages')
print('new dimension of data: ', clean_data.shape[1])

dimension of data:  29
binarizing imdb.genres...
new dimension of data:  56
binarizing imdb.actors...
new dimension of data:  37025
droping unpopular actors...
new dimension of data:  145
binarizing imdb.directors
new dimension of data:  15108
droping unpopular directors...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


new dimension of data:  165
binarizing movielens.languages
new dimension of data:  236


In [13]:
clean_data.head()

Unnamed: 0,imdb.actors,imdb.genres,imdb.productionCompanies,imdb.runtime,imdb.writers,movielens.avgRating,movielens.directors,movielens.languages,movielens.movieId,movielens.numRatings,...,বাংলা,ਪੰਜਾਬੀ,தமிழ்,తెలుగు,ภาษาไทย,ქართული,广州话 / 廣州話,日本語,普通话,한국어/조선말
0,"[Stars: Peter Scanavino, Jason Robards III, An...","[Sci-Fi, Thriller]","[Surla Films, Solo Films]",93,[Vladan Nikolic],2.5,[Vladan Nikolić],[English],127292,3,...,0,0,0,0,0,0,0,0,0,0
1,"[Mark Albrow, Stefan Alt, Chris Carter]","[Documentary, History, Music]","[Amelie Ravalec, Les Films du Garage]",52,[Amélie Ravalec],4.0,"[Travis Collins, Amélie Ravalec]",[English],141044,1,...,0,0,0,0,0,0,0,0,0,0
10,"[Walter Koenig, Bruce Campbell, Leigh Lombardi]","[Horror, Sci-Fi]","[Magic Films, Shapiro-Glickenhaus Entertainment]",92,[Tex Ragsdale],2.5,[Robert Dyke],[English],171725,6,...,0,0,0,0,0,0,0,0,0,0
100,"[The Beatles, Paul McCartney, Ringo Starr]","[Documentary, Music]","[Apple Corps, Apple Corps, Diamond Docs]",106,"[Mark Monroe, P.G. Morgan]",3.77778,[Ron Howard],[English],163949,72,...,0,0,0,0,0,0,0,0,0,0
1000,"[José Garcia, Renée Le Calm, Isabelle Carré]",[Comedy],"[Fidélité Productions, StudioCanal, TF1 Films ...",106,"[Olivier Dazat, Christian Vincent]",3.5,[Christian Vincent],[Français],149516,3,...,0,0,0,0,0,0,0,0,0,0


#### Store binarized data in csv file

In [146]:
cols=pd.Series(clean_data.columns)
# some columns are duplicated because there are actors who are also directors 
# rename the dublicated column with "name.i" (i=1..n)
for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
clean_data.columns=cols

clean_data.to_json(os.path.join(data_path, 'clean_data_binarized.json'))

## Estimate missing values with SVC Classifier (Not used for dataset just for trying)

In [22]:
#get rows where the given attribute is null
no_original_language = clean_data[clean_data['tmdb.original_language'].isnull()]
has_original_language = clean_data[clean_data['tmdb.original_language'].notnull()]

#remove rows with missing country value
has_original_language = has_original_language[has_original_language['tmdb.production_countries'].notnull()]

train_data = has_original_language[has_original_language.columns[1+clean_data.columns.get_loc('Western'):]]
train_label = has_original_language['tmdb.original_language']

In [27]:
print(train_data.shape[0])
print(train_label.shape[0])

24151
24151


In [117]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.3, random_state=42)

clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [118]:
y_pred = clf.predict(X_test)
res = pd.DataFrame({'actual':y_test, 'predicted':y_pred.flatten()})

In [119]:
precision_recall_fscore_support(y_test, y_pred, average='weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.874003071904552, 0.8793817278498481, 0.8749349446340965, None)

In [188]:
y_pred = list(y_pred)
y_test = list(y_test)
len(set(y_pred ))

46

#### Plot confusion matrix

In [193]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    
    
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Only use the labels that appear in the data
    blub = y_true + y_pred
    classes = set(blub)
   
    
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')


    fig, ax = plt.subplots(2000, 2000)
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
from sklearn.utils.multiclass import unique_labels
np.set_printoptions(precision=2)


# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
#plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
#                      title='Normalized confusion matrix')

plt.show()

Confusion matrix, without normalization


#### Encode Labels

In [58]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

#list(le.classes_)

#le.transform(["tokyo", "tokyo", "paris"]) 

#list(le.inverse_transform([2, 2, 1]))

LabelEncoder()