In [1]:
import numpy as np
import pandas as pd
import re
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from math import log10
from nltk.tokenize import RegexpTokenizer
import seaborn
seaborn.set()

In [2]:
wiki = pd.read_json('wiki-company.json.gz', orient='record', lines=True)
rt = pd.read_json('rotten-tomatoes.json.gz', orient='record', lines=True)
omdb = pd.read_json('omdb-data.json.gz', orient='record', lines=True)
data_temp = wiki.merge(rt, on = 'rotten_tomatoes_id')
data = data_temp.merge(omdb, left_on = 'imdb_id_x', right_on = 'imdb_id')

In [3]:
# drop NaN columns in 'nbox' and 'ncost'
wiki_dropNA = wiki.dropna(subset=['nbox', 'ncost', 'publication_date'])
# We want to get Profit by 'nbox' and 'ncost'
wiki_dropNA_box = wiki_dropNA[['nbox','ncost', 'publication_date','rotten_tomatoes_id']]

# we want to get only reviews from audiences and critics
rt_reviews = rt[['audience_average', 'audience_percent', 'audience_ratings', 'critic_average', 'critic_percent', 'rotten_tomatoes_id']]

# Merge (join) these two tables by common Rotten Tomatoes ID
wiki_join_rt = wiki_dropNA_box.merge(rt_reviews, left_on='rotten_tomatoes_id', right_on='rotten_tomatoes_id', suffixes=('_wiki', '_rt'))



# Filter data a little bit. We just keep movies with more than 40 reviews (Discard very uncommon movies)
wiki_join_rt = wiki_join_rt[wiki_join_rt['audience_ratings']>=40]
wiki_join_rt = wiki_join_rt.dropna(subset=['critic_average', 'critic_percent'])

# Note that there are 817 movies remained, which is not bad though

# Inflation adjustment on the profit

def getyear(x):
    return x[0:4]

wiki_join_rt['year'] = wiki_join_rt['publication_date'].apply(getyear)
wiki_join_rt['year'] = pd.to_numeric(wiki_join_rt['year'])

# wiki_join_rt['year'].min()
# This shows the oldest movie comes from 1927
# Through the research on references, from 1927 to 2019, the value of US dollars has an inflation of 2.97% per year on average.
# We adjust this number according to year differences compared to 2019

def inflation_correct(x):
    year_difference = 2019 - wiki_join_rt['year']
    money_value = (1.0297) ** year_difference
    return x * money_value

# Calculate the profit (Or, loss if nbox < ncost)
wiki_join_rt['profit'] = wiki_join_rt['nbox'] - wiki_join_rt['ncost']

wiki_join_rt['year_difference'] = 2019 - wiki_join_rt['year']
wiki_join_rt['money_value'] = (1.0297) ** wiki_join_rt['year_difference']
# wiki_join_rt['profit'] = wiki_join_rt['profit'].apply(inflation_correct)
# Calculate the "real" profit for each movie, based on its publication year and real value of money.
wiki_join_rt['profit'] = wiki_join_rt['profit']*wiki_join_rt['money_value']

In [4]:
# wiki_join_rt[nbox	ncost	publication_date	rotten_tomatoes_id	audience_average	
# audience_percent	audience_ratings	
# critic_average	critic_percent	profit	year	year_difference, money_value]

In [5]:
# wiki

In [6]:
def corr_points(x):
    oscar_win = re.search(r'Won (\d+) Oscar', x)
    oscar_nominated = re.search(r'Nominated for (\d+) Oscar', x)
    GoldenG_win = re.search(r'Won (\d+) Golden', x)
    GoldenG_nominated = re.search(r'Nominated for (\d+) Golden', x)
    minor_wins = re.search(r'(\d+) win', x)
    minor_nominations = re.search(r'(\d+) nomination', x)
    score = 0
    if oscar_win:
        score = score + 5 * int(oscar_win.group(1))
    if oscar_nominated:
        score = score + 3 * int(oscar_nominated.group(1))
    if GoldenG_win:
        score = score + 4 * int(GoldenG_win.group(1))
    if GoldenG_nominated:
        score = score + 2 * int(GoldenG_nominated.group(1))
    if minor_wins:
        score = score + int(minor_wins.group(1))
    if minor_nominations:
        score = score + 0.25 * int(minor_nominations.group(1))
    return score

In [7]:
omdb['weighted_points'] = omdb['omdb_awards'].map(corr_points)

In [8]:
# omdb

In [9]:
# director = to_number(data['director'])

In [10]:
data['weighted_points'] = data['omdb_awards'].map(corr_points)
data = data[['director','cast_member','genre','omdb_awards','audience_average','weighted_points']]
data = data.dropna(subset = ['director','cast_member','genre','omdb_awards','audience_average','weighted_points']).reset_index(drop = True)

In [11]:
def extract_first_2(x):
    if len(x) <= 1:
        return x
    else:
        return x[:2]

def extract_first_4(x):
    if len(x) <= 4:
        return x
    else:
        return x[:4]

# This does not work. Probably because it is not the latest Pandas
# temp = data['director'].apply(extract_first_2, args = (1,))
data['director'] = data['director'].apply(extract_first_2)
data['cast_member'] = data['cast_member'].apply(extract_first_4)
print(len(data))
# data = small_feature_index(data, 'cast_member', 1)
# data = small_feature_index(data, 'director', 1)

7510


In [12]:
data

Unnamed: 0,director,cast_member,genre,omdb_awards,audience_average,weighted_points
0,[Q2593],"[Q440956, Q312712, Q42869, Q45923]","[Q188473, Q1146335, Q157443, Q1535153]",19 wins & 61 nominations.,3.9,34.25
1,"[Q51583, Q51472]","[Q34851, Q151973, Q181887, Q283988]","[Q645928, Q369747, Q1054574, Q130232, Q1433443]",Won 4 Oscars. Another 2 wins & 13 nominations.,3.4,25.25
2,[Q456008],"[Q42204, Q37876, Q313042, Q200405]","[Q2484376, Q20443008, Q3990883, Q622291, Q7210...",7 wins & 27 nominations.,3.9,13.75
3,[Q188137],"[Q108941, Q81328, Q210447, Q4678990]","[Q471839, Q319221, Q188473, Q157394]",Nominated for 5 Oscars. Another 57 wins & 123 ...,4.3,102.75
4,[Q8877],"[Q258064, Q354873, Q506198, Q676094]","[Q471839, Q2143665, Q130232]",Won 4 Oscars. Another 47 wins & 34 nominations.,3.5,75.50
5,[Q387412],"[Q7516, Q13909, Q461238, Q29086]","[Q1146335, Q580850, Q19367312, Q959790, Q2484376]",,3.2,0.00
6,[Q282033],"[Q81520, Q191104, Q230055, Q190794]","[Q3990883, Q2297927, Q188473]",Nominated for 2 BAFTA Film Awards. Another 2 w...,3.4,3.50
7,[Q51552],"[Q37175, Q228862, Q310944, Q234890]","[Q1200678, Q590103, Q2484376, Q2973181, Q52162...",1 win & 4 nominations.,3.2,2.00
8,"[Q25078, Q166159]","[Q24962, Q25014, Q25078, Q166159]","[Q157394, Q319221, Q542475, Q157443, Q622548, ...",2 wins & 2 nominations.,4.1,2.50
9,[Q122666],"[Q105158, Q255070, Q320093, Q432940]","[Q959790, Q1200678, Q590103, Q2484376]",1 win.,2.9,1.00


In [13]:
def transfer_to_number(x): 
    types =[]
    type_size = 0
    for i in range(len(x)):
        for j in x[i]:
            if j not in types:
                types.append(j)
                type_size = type_size + 1
                
    two_D_array = [x[:] for x in [[0] * len(types)] * len(x)] 
    
    for i in range(len(x)):
        for j in x[i]:
            index = 0
            while j != types[index]:
                index = index + 1
            two_D_array[i][index] = 1        
    
    #print(len(two_D_array))
    print('There are ', len(two_D_array[0]), ' different types in total.')
    print('There are ', len(two_D_array), ' records collected.')
    #print(len(two_D_array[0]))
    # print(types)
    # print(type_size)
    return two_D_array
                

In [14]:
# arr = transfer_to_number(data['director'])
# arr2 = transfer_to_number(data['cast_member'])
# arr3 = transfer_to_number(data['genre'])

In [15]:
def remove_rare_data(x, y):
    measure = y
    
    types =[]
    type_size = 0
    for i in range(len(x)):
        for j in x[i]:
            if j not in types:
                types.append(j)
                type_size = type_size + 1
                
    two_D_array = [x[:] for x in [[0] * len(types)] * len(x)] 
    
    for i in range(len(x)):
        for j in x[i]:
            index = 0
            while j != types[index]:
                index = index + 1
            two_D_array[i][index] = 1    
            
    # print(two_D_array[800])
    # print(len(two_D_array))
    # print(two_D_array.shape[0])
    sum = [0]* len(two_D_array[0])
    # print(sum)
    for i in range(len(two_D_array)):
        for j in range(len(two_D_array[0])):   
            if two_D_array[i][j] == 1:
            # there is a score (record) on this genre
                sum[j] = sum[j] + 1
    
    # two_D_array = np.array(two_D_array)
    # print(sum)
    x_more_than_y_times = len(types)
    
    for i in range(len(sum)):
        if sum[i] <= measure:
            x_more_than_y_times = x_more_than_y_times - 1
            for j in range(len(two_D_array)):
                two_D_array[j][i] = -1
    
    for i in range(len(two_D_array)):
        two_D_array[i] = [e for e in two_D_array[i] if e not in (-2, -1)]
    
    # print(two_D_array)
    # two_D_array = [v for v in two_D_array if sum(v) != 0]
    
    # print(x_more_than_y_times)
    #two_D_array = np.array(two_D_array)
    #two_D_array = two_D_array[~np.all(two_D_array == 0, axis=1)]
    
    return two_D_array

def remove_0_rows(x):
    return  [v for v in x if sum(v) != 0]

In [16]:
# # aaaaaaaaaa
# aarray = [[-1,-1,3,4], [0,0,0,0],[-1,-1,8,9],[0,0,0,0] ,[-1,-1, 10 ,11],[0,0,1,0],[0,0,0,0]]
# #for i in range(len(aarray)):
#     #aarray[i] = [e for e in aarray[i] if e not in (-100, -1)]
# aarray = [v for v in aarray if sum(v) != 0]
#     #for i in range(count):
#         #aarray.remove(1)
# type(aarray)

In [17]:
# aa = np.array([[1,2,3,10], [3,4,10,15], [7,8,4,22]])
# aa1111 = np.delete(aa, 3,1)

In [18]:
# aa1111

In [None]:
# aa = to_number(wiki['director'])

In [None]:
# data['cast_member','director', ] potentially: production_company, country_of_origin, 

In [None]:
# 1. delete extra number  (have done first)
# 2. remove small numbers
# 3. to_number  (no needed anymore)
# 4. remove 0 rows

In [33]:
director = remove_rare_data(data['director'], 1)
director = remove_0_rows(director)

In [35]:
cast_member = remove_rare_data(data['cast_member'], 1)
cast_member = remove_0_rows(cast_member)

In [36]:
len(cast_member)

6858

In [67]:
def small_feature_index(x,y,z):
    #delete elements that shows no more than z times
    x_size = x[y].size
    category = []
    times = []
    x1 = []
    y1 =[]
    position = [[] for i in range(x_size)]
    for i in range(x_size):
        for j in range(len(x[y][i])):
                if x[y][i][j] not in category:
                    category.append(x[y][i][j])
                    times.append(1)
                    x1.append([i])
                    y1.append([j])
                else:
                    index = category.index(x[y][i][j])
                    times[index] = times[index] + 1 
                    x1[index].append(i)
                    y1[index].append(j)
    for i in range(len(times)):
        if times[i] <= z:
            for j in range(times[i]):
                position[x1[i][j]].append(y1[i][j])
    for i in range(x_size):
        if len(position[i]) != 0:
            for j in sorted(position[i], reverse=True):
                del x[y][i][j] 
    for i in range(x_size):
        if len(x[y][i]) == 0:
            x = x.drop(i)
    x = x.reset_index(drop = True)
    print('delete', y, 'rare parameter done')
    return x

In [68]:
temp = small_feature_index(data, 'director', 1)

delete director rare parameter done


Unnamed: 0,director,cast_member,genre,omdb_awards,audience_average,weighted_points
0,[Q2593],"[Q440956, Q312712, Q42869, Q45923]","[Q188473, Q1146335, Q157443, Q1535153]",19 wins & 61 nominations.,3.9,34.25
1,"[Q51583, Q51472]","[Q34851, Q151973, Q181887, Q283988]","[Q645928, Q369747, Q1054574, Q130232, Q1433443]",Won 4 Oscars. Another 2 wins & 13 nominations.,3.4,25.25
2,[Q456008],"[Q42204, Q37876, Q313042, Q200405]","[Q2484376, Q20443008, Q3990883, Q622291, Q7210...",7 wins & 27 nominations.,3.9,13.75
3,[Q188137],"[Q108941, Q81328, Q210447, Q4678990]","[Q471839, Q319221, Q188473, Q157394]",Nominated for 5 Oscars. Another 57 wins & 123 ...,4.3,102.75
4,[Q8877],"[Q258064, Q354873, Q506198, Q676094]","[Q471839, Q2143665, Q130232]",Won 4 Oscars. Another 47 wins & 34 nominations.,3.5,75.50
5,[Q387412],"[Q7516, Q13909, Q461238, Q29086]","[Q1146335, Q580850, Q19367312, Q959790, Q2484376]",,3.2,0.00
6,[Q282033],"[Q81520, Q191104, Q230055, Q190794]","[Q3990883, Q2297927, Q188473]",Nominated for 2 BAFTA Film Awards. Another 2 w...,3.4,3.50
7,[Q51552],"[Q37175, Q228862, Q310944, Q234890]","[Q1200678, Q590103, Q2484376, Q2973181, Q52162...",1 win & 4 nominations.,3.2,2.00
8,"[Q25078, Q166159]","[Q24962, Q25014, Q25078, Q166159]","[Q157394, Q319221, Q542475, Q157443, Q622548, ...",2 wins & 2 nominations.,4.1,2.50
9,[Q1320930],"[Q234516, Q1378118, Q383930, Q238843]","[Q860626, Q1747837, Q1146335, Q224700, Q134105...",1 win & 10 nominations.,3.8,3.50


In [None]:
# director = to_number(data['director'])

In [None]:
# cast_member = to_number(data['cast_member'])

In [None]:
genre = to_number(data['genre'])
# this genre does not make much sense. It has 261 different types. We could use genres in OMDB instead
# Or..wait, this can be useful as well.

In [38]:
def combine_list(x,y):
    #combine two 2d lists together
    for i in range(len(x)):
        x[i].extend(y[i])
    return x
# xxxx = [[1,2]]
# yyyy = [[3,4]]
# aaaaa = combine_list(xxxx,yyyy)
# aaaaa

In [39]:
def array_to_list(x):
    # transfer array to list
    list = [[] for i in range(x.shape[0])]
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            list[i].append((x[i][j]))
    return list

In [48]:
len(X)

5288

In [49]:
len(X[0])

5829

In [52]:
len(director)

5288

In [53]:
len(cast_member)

6858

In [44]:
X = combine_list(director, cast_member)

In [None]:
X = combine_list(X, genre)

In [45]:
y = np.asarray(data['weighted_points'], dtype="|S6")

In [65]:
len(X)

5288

In [66]:
len(y)

7510

In [46]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y)

ValueError: Found input variables with inconsistent numbers of samples: [5288, 7510]

In [47]:
model_2 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(25,4))

In [None]:
model_2.fit(X_train_2, y_train_2)
model_2.predict(X_test_2)

In [None]:
def model_test(model,X_test,y_test,a):
    # Using model "model" and input data "X_test", "y_test"
    accept = 0
    real = np.asarray(y_test, dtype="float64")
    predict = np.asarray(model.predict(X_test), dtype="float64")
    difference = np.absolute(real - predict)
    for i in range(real.size):
        if difference[i] <= a:
            accept = accept + 1
    score = accept/difference.size
    print ('The test score is: ', score)
    return 0
model_test(model_2, X_test_2,y_test_2,3)