In [1]:
import numpy as np
import pandas as pd
import requests # to make TMDB API calls
import locale # to format currency as USD
locale.setlocale( locale.LC_ALL, '' )
api_key = '07c743da178e65cad9e43897e89fc202'
import ast
import json

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

In [4]:
final_cols = ['popularity','iscollect', 'budget']

train['iscollect'] = train['belongs_to_collection'].notnull().astype('int')
train['revenue'] = np.log(train['revenue'])

test['iscollect'] = test['belongs_to_collection'].notnull().astype('int')

In [5]:
genres = train['genres']
unique_genres = set()
for gen in genres:
    try:
        names = json.loads(gen.replace("'",'"'))
        name_list = [name['name'] for name in names]
        unique_genres.update(name_list)
    except:
        pass
    
train = train.reindex(list(unique_genres)+list(train.columns),fill_value=0,axis='columns')

for i, gen in enumerate(genres):
    try:
        names = json.loads(gen.replace("'",'"'))
        name_list = [name['name'] for name in names]
        for n in name_list:
            train.at[i,n] = 1
    except:
        pass

    
test = test.reindex(list(unique_genres)+list(test.columns),fill_value=0,axis='columns')

for i, gen in enumerate(genres):
    try:
        names = json.loads(gen.replace("'",'"'))
        name_list = [name['name'] for name in names]
        for n in name_list:
            test.at[i,n] = 1
    except:
        pass

In [6]:
sig_diff = []
for g in unique_genres:
    gen_rev = train.loc[train[g]==1,'revenue']
    nongen_rev = train.loc[train[g]==0,'revenue']
    diff = np.mean(gen_rev)-np.mean(nongen_rev)
    if abs(diff) > 0.4 and len(gen_rev) > 30:
        sig_diff.append(g)
final_cols.extend(sig_diff)

In [7]:
final_cols

['popularity',
 'iscollect',
 'Drama',
 'Adventure',
 'Fantasy',
 'Western',
 'Thriller',
 'Animation',
 'Action',
 'Mystery',
 'Foreign',
 'Science Fiction',
 'Documentary',
 'Family']

In [36]:
train['release_date'] = pd.to_datetime(train['release_date'],format='%m/%d/%y').astype('str')
test['release_date'] = pd.to_datetime(test['release_date'],format='%m/%d/%y').astype('str')

In [37]:
train = text_to_dict(train)
test = text_to_dict(test)

In [38]:
def getRev(row):
    response = requests.get('https://api.themoviedb.org/3/find/'+ row.imdb_id + '?api_key=' + api_key +'&language=en-US&external_source=imdb_id')
    if 'movie_results' in response.json().keys():
        if len(response.json()['movie_results']) >= 1:
            tmdbid = response.json()['movie_results'][0]['id']
        else:
            return 0
    else:
        return 0
    response = requests.get('https://api.themoviedb.org/3/movie/' + str(tmdbid) + '?api_key=' +  api_key + '&language=en-US')
    if 'revenue' in response.json().keys():
        return response.json()['revenue']
    else: 
        return 0

revenues = test.apply(lambda row: getRev(row), axis = 1)
test['revenue'] = np.log(revenues)



In [41]:
def actorReturn(row):
    if type(row.cast) == list:
        if len(row.cast) >= 2:
            actors = {
                "actor1": row.cast[0]['name'],
                "actor2": row.cast[1]['name']
            }
            return actors
        elif len(row.cast) == 1:
            actors = {
                "actor1": row.cast[0]['name'],
            }
            return actors
        else:
                    return {}
    else:
        return {}
    
def returnCrew(crew):
    crewDict = {
        'Director': None,
        'Producer': None
    }
    direc = 0
    prod = 0
    for x in crew:
        #print(crewDict)
        if x['job'] == 'Director':
            crewDict['Director'] = x['name']
            direc = 1
        elif x['job'] == 'Producer':
            crewDict['Producer'] = x['name']
            prod = 1
        if direc > 0 and prod > 0:
            return crewDict
    return crewDict

In [42]:
train['rel_actors'] = train.apply(lambda row: actorReturn(row), axis =1) 
train['rel_crew'] = train.apply(lambda row: returnCrew(row.crew) if type(row.crew) == list else {},  axis =1)
test['rel_actors'] = test.apply(lambda row: actorReturn(row), axis =1) 
test['rel_crew'] = test.apply(lambda row: returnCrew(row.crew) if type(row.crew) == list else {},  axis =1)

In [44]:
both = train.append(test, sort=False)
workerPerformance = pd.DataFrame(columns=['revenue', 'Director', 'date', 'Producer', 'actor1','actor2'])
workerPerformance['revenue'] = both.apply(lambda row: row.revenue if 'Director' in row.rel_crew.keys() else  None, axis =1) 
workerPerformance['Director'] = both.apply(lambda row: row.rel_crew['Director'] if 'Director' in row.rel_crew.keys() else  None, axis =1) 
workerPerformance['date'] = both.apply(lambda row: str(row.release_date)[:4] if 'Director' in row.rel_crew.keys() else  None, axis =1) 
workerPerformance['Producer'] = both.apply(lambda row: row.rel_crew['Producer'] if 'Producer' in row.rel_crew.keys() else  None, axis =1) 
workerPerformance['actor1'] = both.apply(lambda row: row.rel_actors['actor1'] if 'actor1' in row.rel_actors.keys() else  None, axis =1) 
workerPerformance['actor2'] = both.apply(lambda row: row.rel_actors['actor2'] if 'actor2' in row.rel_actors.keys() else  None, axis =1) 

frame = { 'Director': workerPerformance['Director'],
         'Producer': workerPerformance['Producer'],
         'actor1': workerPerformance['actor1'],
         'actor2': workerPerformance['actor2'],
         'revenue': workerPerformance['revenue'],
         'date': workerPerformance['date'] } 

workerResults = pd.DataFrame(frame) 

In [45]:
directorsAndMovies = {}
def directorLists(row):
    if row.Director == None:
        return 0
    if row.date == 'nan':
        return 0
    if row.Director in directorsAndMovies.keys(): 
        directorsAndMovies[row.Director].append([int(row.date), row.revenue])
    else:
        directorsAndMovies[row.Director] = [[int(row.date), row.revenue]]
    return 1
workerResults.apply(lambda row: directorLists(row), axis = 1);

In [46]:
producersAndMovies = {}
def producersLists(row):
    if row.Producer == None:
        return 0
    if row.date == 'nan':
        return 0
    if row.Producer in producersAndMovies.keys(): 
        producersAndMovies[row.Producer].append([int(row.date), row.revenue])
    else:
        producersAndMovies[row.Producer] = [[int(row.date), row.revenue]]
    return 1
workerResults.apply(lambda row: producersLists(row), axis = 1);

In [47]:
actorsAndMovies = {}
def actorsLists(row):
    boolOnlyOne = 0
    if row.actor2 == None and row.actor1 == None:
        return 0
    elif row.actor2 == None:
        boolOnlyOne = 1
    if row.date == 'nan':
        return 0
    if row.date == None:
        return 0
    if row.actor1 in actorsAndMovies.keys(): 
        actorsAndMovies[row.actor1].append([int(row.date), row.revenue])
    else:
        actorsAndMovies[row.actor1] = [[int(row.date), row.revenue]]
    if boolOnlyOne != 1:
        if row.actor2 in actorsAndMovies.keys(): 
            actorsAndMovies[row.actor2].append([int(row.date), row.revenue])
        else:
            actorsAndMovies[row.actor2] = [[int(row.date), row.revenue]]
    return 1
workerResults.apply(lambda row: actorsLists(row), axis = 1);

In [48]:
def Sort(sub_li): 
  
    # reverse = None (Sorts in Ascending order) 
    # key is set to sort using second element of  
    # sublist lambda has been used 
    return(sorted(sub_li, key = lambda x: x[0]))   

for key in directorsAndMovies:
    directorsAndMovies[key] = Sort(directorsAndMovies[key])
    
for key in producersAndMovies:
    producersAndMovies[key] = Sort(producersAndMovies[key])
    
for key in actorsAndMovies:
    actorsAndMovies[key] = Sort(actorsAndMovies[key])

In [49]:
def getSuccess(name, relDict, date):
    relevantList = []
    if name == None:
        return 0
    if name not in relDict.keys():
        print("error: " + name)
        return 0
    for movie in relDict[name]:
        if int(movie[0]) < date:
            relevantList.append(movie[1])
        else:
            break
        break
    if len(relevantList) == 0:
        return 0
    return np.mean(relevantList)

In [50]:
train['director_success'] = train.apply(lambda row: getSuccess(row.rel_crew['Director'], directorsAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'Director' in row.rel_crew.keys() and type(row.release_date) == str else 0, axis = 1)
train['producer_success'] = train.apply(lambda row: getSuccess(row.rel_crew['Producer'], producersAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'Producer' in row.rel_crew.keys() and type(row.release_date) == str else 0, axis = 1)
train['actor1_success'] = train.apply(lambda row: getSuccess(row.rel_actors['actor1'], actorsAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'actor1' in row.rel_actors.keys() and type(row.release_date) == str else 0, axis = 1)
train['actor2_success'] = train.apply(lambda row: getSuccess(row.rel_actors['actor2'], actorsAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'actor2' in row.rel_actors.keys() and type(row.release_date) == str else 0, axis = 1)

test['director_success'] = test.apply(lambda row: getSuccess(row.rel_crew['Director'], directorsAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'Director' in row.rel_crew.keys() and type(row.release_date) == str else 0, axis = 1)
test['producer_success'] = test.apply(lambda row: getSuccess(row.rel_crew['Producer'], producersAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'Producer' in row.rel_crew.keys() and type(row.release_date) == str else 0, axis = 1)
test['actor1_success'] = test.apply(lambda row: getSuccess(row.rel_actors['actor1'], actorsAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'actor1' in row.rel_actors.keys() and type(row.release_date) == str else 0, axis = 1)
test['actor2_success'] = test.apply(lambda row: getSuccess(row.rel_actors['actor2'], actorsAndMovies, int(row.release_date[:4])) if row.release_date != 'nan' and 'actor2' in row.rel_actors.keys() and type(row.release_date) == str else 0, axis = 1)

error: Kirby Heyborne
error: Erik Goertz
error: Michael Flynn
error: Anton Rattinger
error: –Æ–ª–∏—è –í–æ–ª–∫–æ–≤–∞
error: Javed Sheikh
error: Warren Miller
error: Mikhail Efremov
error: Fahad Mustafa
error: Kathleen Barr


In [None]:
train['budget'] = np.log(train['budget'])
test['budget'] = np.log(test['budget'])

In [51]:
final_cols.extend(['director_success','producer_success','actor1_success','actor2_success'])

In [52]:
train = train[final_cols + ['revenue']]
test = test[final_cols]

In [59]:
train = train.replace([np.inf, -np.inf], 0)
test = test.replace([np.inf, -np.inf], 0)

In [None]:
train.to_csv('preptrain.csv', index=False)
test.to_csv('preptest.csv', index=False)

In [13]:
tt = pd.read_csv('preptest.csv')

TypeError: parser_f() got an unexpected keyword argument 'index'

In [20]:
tt.drop(column='Unnamed: 0',axis=1)

TypeError: drop() got an unexpected keyword argument 'column'