In [37]:
import numpy as np
import pandas as pd
import re
import sys
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from math import log10
from nltk.tokenize import RegexpTokenizer
from pandas.core import datetools
import seaborn
seaborn.set()

In [8]:
def inflation_correct(x):
    year_difference = 2019 - business['year']
    money_value = (1.0297) ** year_difference
    return x * money_value

def getyear(x):
    return x[0:4]

def corr_points(x):
    oscar_win = re.search(r'Won (\d+) Oscar', x)
    oscar_nominated = re.search(r'Nominated for (\d+) Oscar', x)
    GoldenG_win = re.search(r'Won (\d+) Golden', x)
    GoldenG_nominated = re.search(r'Nominated for (\d+) Golden', x)
    minor_wins = re.search(r'(\d+) win', x)
    minor_nominations = re.search(r'(\d+) nomination', x)
    score = 0
    if oscar_win:
        score = score + 10 * int(oscar_win.group(1))
    if oscar_nominated:
        score = score + 5 * int(oscar_nominated.group(1))
    if GoldenG_win:
        score = score + 7 * int(GoldenG_win.group(1))
    if GoldenG_nominated:
        score = score + 3 * int(GoldenG_nominated.group(1))
    if minor_wins:
        score = score + int(minor_wins.group(1))
    if minor_nominations:
        score = score + 0.25 * int(minor_nominations.group(1))
    return score

def transfer_to_number(x): 
    # transfer a dataframe column to number matrix
    types =[]
    type_size = 0
    for i in range(len(x)):
        for j in x[i]:
            if j not in types:
                types.append(j)
                type_size = type_size + 1
                
    two_D_array = [x[:] for x in [[0] * len(types)] * len(x)] 
    
    for i in range(len(x)):
        for j in x[i]:
            index = 0
            while j != types[index]:
                index = index + 1
            two_D_array[i][index] = 1        
    
    print('There are ', len(two_D_array[0]), ' different types in total.')
    print('There are ', len(two_D_array), ' records collected.')
    
    return two_D_array

def remove_rare_data(x,y,z):
    # remove the elements in attribute 'y', which occurs fewer than 'z' times ,from dataframe 'x'
    x_size = len(x[y])
    types = []
    times = []
    x1 = []
    y1 =[]
    mark = [[] for i in range(len(x[y]))]
    for i in range(x_size):
        for j in range(len(x[y][i])):
                if x[y][i][j] not in types:
                    types.append(x[y][i][j])
                    times.append(1)
                    x1.append([i])
                    y1.append([j])
                else:
                    index = types.index(x[y][i][j])
                    times[index] = times[index] + 1 
                    x1[index].append(i)
                    y1[index].append(j)
    for i in range(len(times)):
        if times[i] <= z:
            for j in range(times[i]):
                mark[x1[i][j]].append(y1[i][j])
    for i in range(x_size):
        if len(mark[i]) != 0:
            for j in sorted(mark[i], reverse=True):
                del x[y][i][j] 
    for i in range(x_size):
        if len(x[y][i]) == 0:
            x = x.drop(i)
    x = x.reset_index(drop = True)
    print('Remove rare elements in ', y,  ' successfully')
    return x

def remove_rare_data_array(x, y):
    measure = y
    
    # basically the same process as transfer_to_number() function
    types =[]
    type_size = 0
    for i in range(len(x)):
        for j in x[i]:
            if j not in types:
                types.append(j)
                type_size = type_size + 1
                
    two_D_array = [x[:] for x in [[0] * len(types)] * len(x)] 
    
    for i in range(len(x)):
        for j in x[i]:
            index = 0
            while j != types[index]:
                index = index + 1
            two_D_array[i][index] = 1    

    sum = [0]* len(two_D_array[0])
    # print(sum)
    for i in range(len(two_D_array)):
        for j in range(len(two_D_array[0])):   
            if two_D_array[i][j] == 1:
                # if there is a score (record) on this genre, mark it
                sum[j] = sum[j] + 1
    
    x_more_than_y_times = len(types)
    
    # if that element (e.g. director) appears only once, we set the whole column to be -1 
    for i in range(len(sum)):
        if sum[i] <= measure:
            x_more_than_y_times = x_more_than_y_times - 1
            for j in range(len(two_D_array)):
                two_D_array[j][i] = -1
    
    # delete the whole -1 column
    for i in range(len(two_D_array)):
        two_D_array[i] = [e for e in two_D_array[i] if e not in (-2, -1)]
    
    # remove movies with all 0's in a row. (No director)
    two_D_array = [v for v in two_D_array if sum(v) != 0]
    return two_D_array

def extract_first_2(x):
    if len(x) <= 2:
        return x
    else:
        return x[:2]
    
def extract_first_3(x):
    if len(x) <= 3:
        return x
    else:
        return x[:3]

def extract_first_4(x):
    if len(x) <= 4:
        return x
    else:
        return x[:4]

def to_list(x):
    return [x]

def combine_list(x,y):
    #combine two 2d lists together
    for i in range(len(x)):
        x[i].extend(y[i])
    return x

def model_test(model,X_test,y_test,a):
    # Using model "model" and input data "X_test", "y_test"
    accept = 0
    real = np.asarray(y_test, dtype="float64")
    predict = np.asarray(model.predict(X_test), dtype="float64")
    difference = np.absolute(real - predict)
    for i in range(real.size):
        if difference[i] <= a:
            accept = accept + 1
    score = accept/real.size
    print ('The corrected score for testing data is: ', score)
    return 0

In [5]:
wiki = pd.read_json('wiki-company.json.gz', orient='record', lines=True)
rt = pd.read_json('rotten-tomatoes.json.gz', orient='record', lines=True)
omdb = pd.read_json('omdb-data.json.gz', orient='record', lines=True)
data_temp = wiki.merge(rt, on = 'rotten_tomatoes_id')
data = data_temp.merge(omdb, left_on = 'imdb_id_x', right_on = 'imdb_id')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9612 entries, 0 to 9611
Data columns (total 30 columns):
based_on              2076 non-null object
cast_member           8583 non-null object
country_of_origin     9586 non-null object
director              8886 non-null object
enwiki_title          9612 non-null object
filming_location      4254 non-null object
genre                 9612 non-null object
imdb_id_x             9612 non-null object
label                 9612 non-null object
made_profit           850 non-null float64
main_subject          2940 non-null object
metacritic_id         5223 non-null object
nbox                  985 non-null float64
ncost                 1071 non-null float64
original_language     9508 non-null object
production_company    5161 non-null object
publication_date      9610 non-null object
rotten_tomatoes_id    9612 non-null object
series                601 non-null object
wikidata_id           9612 non-null object
audience_average      8751 non-nu

In [15]:
business =  data[['director','cast_member','genre','production_company','ncost', 'nbox','publication_date']]
business = business.dropna(subset = ['director','cast_member','genre','production_company','ncost', 'nbox','publication_date']).reset_index(drop = True)

In [17]:
business['year'] = business['publication_date'].apply(getyear)
business['year'] = pd.to_numeric(business['year'])
business['profit'] = business['nbox'] - business['ncost']
business['year_difference'] = 2019 - business['year']
business['money_value'] = (1.0297) ** business['year_difference']
business['profit'] = business['profit']*business['money_value']

In [19]:
business['director'] = business['director'].apply(extract_first_2)
business['cast_member'] = business['cast_member'].apply(extract_first_4)
business['genre'] = business['genre'].apply(extract_first_3)

In [20]:
business = remove_rare_data(business, 'cast_member', 1)
business = remove_rare_data(business, 'director', 1)

Remove rare elements in  cast_member  successfully
Remove rare elements in  director  successfully


In [41]:
len(business)

392

In [22]:
business['production_company'] = business['production_company'].apply(to_list)

company = transfer_to_number(business['production_company'])
director = transfer_to_number(business['director'])
cast_member= transfer_to_number(business['cast_member'])
genre = transfer_to_number(business['genre'])

There are  135  different types in total.
There are  392  records collected.
There are  128  different types in total.
There are  392  records collected.
There are  473  different types in total.
There are  392  records collected.
There are  93  different types in total.
There are  392  records collected.


In [24]:
X = combine_list(director,cast_member)
X = combine_list(X, genre)
X = combine_list(X, company)
y_1 = np.asarray(business['profit'], dtype="|S6")

In [32]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_1)
model = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(20))
model.fit(X_train_1, y_train_1)
model.predict(X_test_1)
model_test(model, X_test_1,y_test_1,888888)
model.score(X_train_1, y_train_1)

The corrected score for testing data is:  0.9795918367346939


0.9965986394557823

In [35]:
model_forest = RandomForestClassifier(n_estimators = 100, max_depth = 5, min_samples_leaf = 5)

In [36]:
model_forest.fit(X_train_1, y_train_1)
model_test(model_forest, X_test_1,y_test_1,888888)
model_forest.score(X_train_1, y_train_1)

The corrected score for testing data is:  0.9897959183673469


0.40816326530612246

In [40]:
model_tree = DecisionTreeClassifier(max_depth=20)
model_tree.fit(X_train_1, y_train_1)
model_test(model_tree, X_test_1,y_test_1,888888)
model_tree.score(X_train_1, y_train_1)

The corrected score for testing data is:  0.9693877551020408


0.14965986394557823