In [18]:
import numpy as np
import pandas as pd
import re
import sys
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from math import log10
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from pandas.core import datetools
import seaborn
seaborn.set()

In [2]:
wiki = pd.read_json('wiki-company.json.gz', orient='record', lines=True)
rt = pd.read_json('rotten-tomatoes.json.gz', orient='record', lines=True)
omdb = pd.read_json('omdb-data.json.gz', orient='record', lines=True)
data_temp = wiki.merge(rt, on = 'rotten_tomatoes_id')
data = data_temp.merge(omdb, left_on = 'imdb_id_x', right_on = 'imdb_id')

In [3]:
def transfer_to_number(x): 
    # transfer a dataframe column to number matrix
    types =[]
    type_size = 0
    for i in range(len(x)):
        for j in x[i]:
            if j not in types:
                types.append(j)
                type_size = type_size + 1
                
    two_D_array = [x[:] for x in [[0] * len(types)] * len(x)] 
    
    for i in range(len(x)):
        for j in x[i]:
            index = 0
            while j != types[index]:
                index = index + 1
            two_D_array[i][index] = 1        
    
    print('There are ', len(two_D_array[0]), ' different types in total.')
    print('There are ', len(two_D_array), ' records collected.')
    
    return two_D_array

def remove_rare_data(x,y,z):
    # remove the elements in attribute 'y', which occurs fewer than 'z' times ,from dataframe 'x'
    x_size = len(x[y])
    types = []
    times = []
    x1 = []
    y1 =[]
    mark = [[] for i in range(len(x[y]))]
    for i in range(x_size):
        for j in range(len(x[y][i])):
                if x[y][i][j] not in types:
                    types.append(x[y][i][j])
                    times.append(1)
                    x1.append([i])
                    y1.append([j])
                else:
                    index = types.index(x[y][i][j])
                    times[index] = times[index] + 1 
                    x1[index].append(i)
                    y1[index].append(j)
    for i in range(len(times)):
        if times[i] <= z:
            for j in range(times[i]):
                mark[x1[i][j]].append(y1[i][j])
    for i in range(x_size):
        if len(mark[i]) != 0:
            for j in sorted(mark[i], reverse=True):
                del x[y][i][j] 
    for i in range(x_size):
        if len(x[y][i]) == 0:
            x = x.drop(i)
    x = x.reset_index(drop = True)
    print('Remove rare elements in ', y,  ' successfully')
    return x

def remove_rare_data_array(x, y):
    measure = y
    
    # basically the same process as transfer_to_number() function
    types =[]
    type_size = 0
    for i in range(len(x)):
        for j in x[i]:
            if j not in types:
                types.append(j)
                type_size = type_size + 1
                
    two_D_array = [x[:] for x in [[0] * len(types)] * len(x)] 
    
    for i in range(len(x)):
        for j in x[i]:
            index = 0
            while j != types[index]:
                index = index + 1
            two_D_array[i][index] = 1    

    sum = [0]* len(two_D_array[0])
    # print(sum)
    for i in range(len(two_D_array)):
        for j in range(len(two_D_array[0])):   
            if two_D_array[i][j] == 1:
                # if there is a score (record) on this genre, mark it
                sum[j] = sum[j] + 1
    
    x_more_than_y_times = len(types)
    
    # if that element (e.g. director) appears only once, we set the whole column to be -1 
    for i in range(len(sum)):
        if sum[i] <= measure:
            x_more_than_y_times = x_more_than_y_times - 1
            for j in range(len(two_D_array)):
                two_D_array[j][i] = -1
    
    # delete the whole -1 column
    for i in range(len(two_D_array)):
        two_D_array[i] = [e for e in two_D_array[i] if e not in (-2, -1)]
    
    # remove movies with all 0's in a row. (No director)
    two_D_array = [v for v in two_D_array if sum(v) != 0]
    return two_D_array

def extract_first_2(x):
    if len(x) <= 2:
        return x
    else:
        return x[:2]
    
def extract_first_3(x):
    if len(x) <= 3:
        return x
    else:
        return x[:3]

def extract_first_4(x):
    if len(x) <= 4:
        return x
    else:
        return x[:4]

def to_list(x):
    return [x]

def combine_list(x,y):
    #combine two 2d lists together
    for i in range(len(x)):
        x[i].extend(y[i])
    return x

def model_test(model,X_test,y_test,a):
    # Using model "model" and input data "X_test", "y_test"
    accept = 0
    real = np.asarray(y_test, dtype="float64")
    predict = np.asarray(model.predict(X_test), dtype="float64")
    difference = np.absolute(real - predict)
    for i in range(real.size):
        if difference[i] <= a:
            accept = accept + 1
    score = accept/real.size
    print ('The corrected score for testing data is: ', score)
    return 0

In [23]:
# data.info()

In [5]:
public = data[['director','cast_member','genre','production_company','audience_average','critic_average', 'audience_ratings']]
public = public.dropna(subset = ['director','cast_member','genre','production_company','audience_average','critic_average', 'audience_ratings'])
public = public[public['audience_ratings']>=40].reset_index(drop = True)


In [24]:
len(public)

2867

In [7]:
public['director'] = public['director'].apply(extract_first_2)
public['cast_member'] = public['cast_member'].apply(extract_first_4)
public['genre'] = public['genre'].apply(extract_first_3)

In [8]:
public = remove_rare_data(public, 'cast_member', 1)
public = remove_rare_data(public, 'director', 1)

Remove rare elements in  cast_member  successfully
Remove rare elements in  director  successfully


In [9]:
public['production_company'] = public['production_company'].apply(to_list)

company = transfer_to_number(public['production_company'])
director = transfer_to_number(public['director'])
cast_member= transfer_to_number(public['cast_member'])
genre = transfer_to_number(public['genre'])

There are  500  different types in total.
There are  2867  records collected.
There are  770  different types in total.
There are  2867  records collected.
There are  2387  different types in total.
There are  2867  records collected.
There are  161  different types in total.
There are  2867  records collected.


In [10]:
X = combine_list(director,cast_member)
X = combine_list(X, genre)
X = combine_list(X, company)
y_1 = np.asarray(public['audience_average'], dtype="|S6")

In [11]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_1)
model = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(20))
model.fit(X_train_1, y_train_1)
model.predict(X_test_1)
model_test(model, X_test_1,y_test_1,0.15)
model.score(X_train_1, y_train_1)

The corrected score for testing data is:  0.2928870292887029


0.9990697674418605

In [12]:
y_2 = np.asarray(public['critic_average'], dtype="|S6")
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y_2)
model1 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5))
model1.fit(X_train_2, y_train_2)
model1.predict(X_test_2)
model_test(model1, X_test_2,y_test_2,0.2)
model1.score(X_train_2, y_train_2)

The corrected score for testing data is:  0.05718270571827057


0.9455813953488372

In [13]:
new_X = combine_list(director, cast_member)
new_X = combine_list(new_X, genre)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(new_X, y_2)
model2 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10))
model2.fit(X_train_3, y_train_3)
model2.predict(X_test_3)
model_test(model2, X_test_3,y_test_3,0.2)
model2.score(X_train_3, y_train_3)

The corrected score for testing data is:  0.07670850767085077


0.9995348837209302

In [14]:
model_forest = RandomForestClassifier(n_estimators = 200, max_depth = 10, min_samples_leaf = 10)

In [15]:
model_forest.fit(X_train_1, y_train_1)
model_test(model_forest, X_test_1,y_test_1,0.25)
model_forest.score(X_train_1, y_train_1)

The corrected score for testing data is:  0.43235704323570434


0.14604651162790697

In [16]:
model_forest.fit(X_train_3, y_train_3)
model_test(model_forest, X_test_3,y_test_3,0.25)
model_forest.score(X_train_3, y_train_3)

The corrected score for testing data is:  0.13807531380753138


0.10093023255813953

In [22]:
# Wow wow wow this is so good.
model_tree = DecisionTreeClassifier(max_depth=50)
model_tree.fit(X_train_1, y_train_1)
model_test(model_tree, X_test_1,y_test_1,0.2)
model_tree.score(X_train_1, y_train_1)

The corrected score for testing data is:  0.3333333333333333


0.29255813953488374