In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

In [2]:
import requests
from bs4 import BeautifulSoup
import re
from pandas.io.json import json_normalize
import json
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_table

In [3]:
def get_page(url):
    """Fetch ``url`` with an HTTP GET and return the response on success.

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    requests.Response or None
        The response object when the status code is 200; ``None`` for any
        other status code or when the request itself fails.
    """
    try:
        # The request has to be issued inside the ``try``: connection errors,
        # timeouts and malformed URLs are raised by ``requests.get`` itself.
        response = requests.get(url)
    except requests.exceptions.RequestException as e:
        # Report the failure and fall through to the "no page" result.
        print('Requests Failed: '+str(e))
        return None

    if response.status_code == 200:  # code 200 indicates a successful request
        return response
    return None

In [4]:
def webscrapping(link, meta, year):
    """Scrape the nominations listed on one event page into a DataFrame.

    The page is fetched with ``get_page``; the award data is read from the
    JSON embedded in the second ``<script>`` tag of the ``div.article``
    element.

    Parameters
    ----------
    link : str
        URL of the event page.
    meta : list of str
        Columns (and column order) of the returned frame.
    year : int
        Value written to the ``Year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per nomination, restricted to ``meta``. ``tconst`` holds the
        ``const`` of the first primary nominee and ``nconst`` the ``const`` of
        the first secondary nominee (empty string when there is none). An
        empty frame with columns ``meta`` is returned when the page cannot be
        fetched or does not contain the expected embedded JSON.
    """
    dataset = pd.DataFrame(columns = meta)
    page = get_page(link)
    if page is None:
        return dataset

    soup = BeautifulSoup(page.text,'lxml')
    article = soup.find('div',attrs={'class':'article'})
    if article is None:
        return dataset
    scripts = article.find_all('script')
    if len(scripts) < 2:
        return dataset

    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences ("\{", "\,"), which Python reports as deprecated.
    text = re.findall(r'(\{"awards".*?),"alwaysDisplayAwardNames"',scripts[1].text)
    if not text:
        return dataset
    data = json.loads(text[0])['awards']

    frames = []
    for award in data:
        newdata = json_normalize(award['categories'],record_path='nominations')
        newdf = pd.DataFrame.from_dict(newdata)
        newdf.insert(0,'nconst','')
        newdf.insert(0,'tconst','')
        for j in newdf.index:
            # Replace each nominee list by the name of its first entry and
            # keep that entry's identifier in tconst / nconst.
            if len(newdf.loc[j,'primaryNominees'])>0:
                newdf.loc[j,'tconst']= newdf.loc[j,'primaryNominees'][0]['const']
                newdf.loc[j,'primaryNominees'] = newdf.loc[j,'primaryNominees'][0]['name']
            if len(newdf.loc[j,'secondaryNominees'])>0:
                newdf.loc[j,'nconst']= newdf.loc[j,'secondaryNominees'][0]['const']
                newdf.loc[j,'secondaryNominees'] = newdf.loc[j,'secondaryNominees'][0]['name']
        newdf.insert(0,'Year',[year]*len(newdf))
        frames.append(newdf[meta])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    if frames:
        dataset = pd.concat([dataset]+frames)

    return dataset

In [5]:
def newFeature(df, fName):
    """Collapse nominations for awards outside ``fName`` into ``otherAwards``.

    ``df`` is modified in place and also returned. For every row whose
    ``awardName`` is not in ``fName``:

    * if the row's ``tconst`` occurs more than once in the original frame,
      the row is dropped (its ``tconst`` is remembered when it was a winner);
    * otherwise the row is kept with ``isWinner`` set to False and
      ``otherAwards`` set to True.

    Afterwards every remaining row whose ``tconst`` was remembered gets
    ``otherAwards`` set to True.
    """
    snapshot = df.copy()          # decisions are taken on the untouched data
    winning_films = []

    for label in snapshot.index:
        if snapshot.loc[label,'awardName'] in fName:
            continue
        film = snapshot.loc[label,'tconst']
        same_film = snapshot[snapshot['tconst']==film]
        if len(same_film)>1:
            if df.loc[label,'isWinner']==True:
                winning_films.append(film)
            df.drop(label,axis=0,inplace=True)
        else:
            df.loc[label,'isWinner']=False
            df.loc[label,'otherAwards']=True

    for label in df.index:
        if df.loc[label,'tconst'] in winning_films:
            df.loc[label,'otherAwards'] = True

    return df

In [6]:
def ManualGenreFix(dataset,index,newgenre):
    """Overwrite the ``Genre1`` value of row(s) ``index`` in place with ``newgenre``."""
    genre_column = 'Genre1'
    dataset.loc[index,genre_column] = newgenre

def checkDuplicates(dataset,bigAward):
    """Reduce the nominations to one row per film and aggregate its wins.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Nominations with at least ``tconst``, ``awardName`` and ``isWinner``
        columns. NOTE: it is modified in place -- an ``otherAwards`` column is
        inserted at position 4 and ``tconst`` becomes the index.
    bigAward : list of str
        Award names whose wins count as the main win (``isWinner``).

    Returns
    -------
    pandas.DataFrame
        The first row of every ``tconst`` (indexed by ``tconst``).
        ``isWinner`` is True when any of the film's rows won an award in
        ``bigAward``; ``otherAwards`` is True when any of its rows won an
        award that is not in ``bigAward``.
    """
    dataset.insert(4,'otherAwards',False)

    # Collect the winning films before the index is replaced.
    winners = dataset[dataset['isWinner']==True]
    is_major = winners['awardName'].isin(bigAward)
    major = winners.loc[is_major,'tconst']
    other = winners.loc[~is_major,'tconst']

    dataset.set_index('tconst',inplace=True)
    # Explicit copy: the flags below are written to the de-duplicated frame,
    # not to a slice of ``dataset`` (avoids SettingWithCopyWarning).
    newdata = dataset[~dataset.index.duplicated(keep='first')].copy()
    newdata.loc[newdata.index.isin(major),'isWinner']=True
    newdata.loc[newdata.index.isin(other),'otherAwards']=True
    return newdata

In [7]:
def update_dataset(dataset,others):
    """Return a copy of ``dataset`` in which every ``Genre1`` value that is
    listed in ``others`` is replaced by ``'Others'``. The input is left
    untouched."""
    feature = 'Genre1'
    relabelled = dataset.copy()

    # First find the affected rows, then rewrite them.
    to_relabel = [label for label in relabelled.index
                  if relabelled.loc[label,feature] in others]
    for label in to_relabel:
        relabelled.loc[label,feature] = 'Others'
    return relabelled

def update_dataset_new(dataset,fdata,festival,otherGenre):
    """Return a copy of ``dataset`` with a ``'Genre_<festival>'`` column.

    The new column starts as an empty string. It is filled only for rows
    whose ``releaseYear`` is greater than ``fdata.Year.min() - 1`` and whose
    index label does not appear in ``fdata.index``:

    * with the row's ``Genre1`` when that genre occurs in ``fdata['Genre1']``;
    * with ``'Others'`` when the genre is listed in ``otherGenre[festival]``.
    """
    target = 'Genre_'+festival
    newdata = dataset.copy()
    newdata[target]=''
    cutoff = fdata.Year.min()-1

    for label in newdata.index:
        if not (newdata.loc[label,'releaseYear']>cutoff) or label in fdata.index:
            continue
        genre = newdata.loc[label,'Genre1']
        if genre in fdata['Genre1'].unique():
            newdata.loc[label,target] = genre
        elif genre in otherGenre[festival]:
            newdata.loc[label,target] = 'Others'
    return newdata

In [8]:
def splitDataset(file,genre):
    """Split the rows of one genre into all / winners / losers.

    Returns a tuple ``(s, swinner, sloser)``: the rows whose ``Genre1`` equals
    ``genre``, those of them with ``isWinner`` or ``otherAwards`` equal to
    True, and those with both flags equal to False.
    """
    genre_rows = file.loc[file['Genre1']==genre]

    won_main = genre_rows['isWinner']==True
    won_other = genre_rows['otherAwards']==True
    lost_main = genre_rows['isWinner']==False
    lost_other = genre_rows['otherAwards']==False

    return genre_rows, genre_rows[won_main | won_other], genre_rows[lost_main & lost_other]

In [9]:
def contenders(file,contenders):
    """Return a copy of ``file`` restricted to the rows whose index label
    also appears in ``contenders.index``. ``file`` itself is not modified."""
    shared = file.index.isin(contenders.index)
    return file.loc[shared].copy()

In [10]:
def outliers(file,feature):
    """Split ``file`` into outliers and non-outliers of ``feature``.

    A value is an outlier when it lies more than 1.5 * IQR above the third
    quartile or more than 1.5 * IQR below the first quartile.

    Returns
    -------
    (outlier, removed) : tuple of pandas.DataFrame
        ``outlier`` holds the rows outside the fences, ``removed`` the rows
        inside them (i.e. the data with the outliers removed). Rows whose
        ``feature`` is NaN satisfy neither comparison and appear in neither
        frame.
    """
    Q1 = file[feature].quantile(0.25)
    Q3 = file[feature].quantile(0.75)
    IQR = Q3-Q1
    upper = Q3+1.5*IQR
    lower = Q1-1.5*IQR

    # A value cannot be above the upper fence AND below the lower fence at
    # the same time, so the two tests have to be combined with OR.
    outlier = file[(file[feature]>upper)|(file[feature]<lower)]
    removed = file[(file[feature]<=upper)&(file[feature]>=lower)]
    return outlier,removed

In [11]:
def permutation(file1,file2,feature,n_iter=10000,seed=None):
    """One-sided permutation test on the difference of the means of ``feature``.

    Parameters
    ----------
    file1, file2 : pandas.DataFrame
        The two groups to compare.
    feature : str
        Numeric column to test. Missing values are ignored.
    n_iter : int, default 10000
        Number of random permutations.
    seed : int or None, default None
        When given, a dedicated ``numpy.random.RandomState(seed)`` is used so
        the result is reproducible; otherwise NumPy's global random state is
        used.

    Returns
    -------
    float
        Share of permutations whose mean difference is at least as extreme as
        the observed one, in the direction of the observed difference.
        ``nan`` when either group has no usable values.
    """
    # Drop missing values up front: the observed Series.mean() skips NaN,
    # whereas the ndarray means of the permuted halves would turn into NaN.
    first = file1[feature].dropna()
    second = file2[feature].dropna()
    n1 = len(first)
    if n1 == 0 or len(second) == 0:
        # No comparison is possible; report "undefined" rather than 0.
        return float('nan')

    observed = first.mean() - second.mean()
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    samples = pd.concat([first,second]).values

    rng = np.random if seed is None else np.random.RandomState(seed)
    rand_mean_diffs = np.empty(n_iter)
    for i in range(n_iter):
        permuted = rng.permutation(samples)
        rand_mean_diffs[i] = permuted[:n1].mean() - permuted[n1:].mean()

    if observed>0:
        p_value = np.mean(rand_mean_diffs>=observed)
    else:
        p_value = np.mean(rand_mean_diffs<=observed)

    return float(p_value)

In [12]:
def cal_mean(dataset,feature,r=0):
    """Return the NaN-ignoring mean of ``dataset[feature]`` rounded to ``r``
    decimal places (default 0)."""
    column_mean = np.nanmean(dataset[feature])
    return round(column_mean,r)

def to_dollar(num):
    """Format ``num`` as a dollar amount with thousands separators
    (fractional part truncated by ``int``); NaN is rendered as ``'N/A'``."""
    if np.isnan(num):
        return 'N/A'
    return '$'+format(int(num),',d')

def cal_change(data1,data2):
    """Return the change of ``data1`` relative to ``data2`` as a percent string.

    The change ``(data1 - data2) / data2 * 100`` is rounded to two decimals
    and formatted with ``'{:.6}%'``. ``'N/A'`` is returned when the result is
    undefined: either value is NaN, or the baseline ``data2`` is zero.
    """
    try:
        percent = round(((data1-data2)/data2)*100,2)
    except ZeroDivisionError:
        # Plain Python numbers raise on a zero baseline ...
        return 'N/A'
    # ... while NumPy floats yield inf/nan instead of raising.
    if not np.isfinite(percent):
        return 'N/A'
    return '{:.6}%'.format(percent)

In [13]:
def calculation(dataset,festival_name,options,newbo):
    """Compute the per-genre box-office statistics of one festival.

    For every genre in ``options`` the festival films in ``dataset`` (all,
    winners, losers) are compared with the films in ``newbo`` whose
    ``'Genre_<festival_name>'`` equals that genre, using the
    ``AdjustedTotalGross`` column after removing outliers.

    Returns a dict with the keys

    * ``boxoffice`` -- genre -> [all films, contenders, losers, winners] means
      as dollar strings;
    * ``percent``   -- genre -> [contenders, losers, winners] change relative
      to the all-films mean;
    * ``pvalue``    -- genre -> permutation-test p-values in the same order
      (``'N/A'`` wherever the matching percent is ``'N/A'``);
    * ``pos_sigs`` / ``neg_sigs`` -- ``[genre position, column id]`` pairs
      with p < 0.05, split by whether the percent string contains ``'-'``.
    """
    feature = 'AdjustedTotalGross'
    columnid = ['contendersdiff','losersdiff','winnersdiff']
    boxoffice, percent, pvalue = {}, {}, {}

    for genre in options:
        nominated, nominated_won, nominated_lost = splitDataset(dataset,genre)
        released = newbo[newbo['Genre_'+festival_name]==genre]

        # Only the outlier-free part of each group is analysed.
        released_trim = outliers(released,feature)[1]
        nominated_trim = outliers(nominated,feature)[1]
        won_trim = contenders(nominated_won,nominated_trim)
        lost_trim = contenders(nominated_lost,nominated_trim)

        allmean = cal_mean(released_trim,feature)
        smean = cal_mean(nominated_trim,feature)
        slmean = cal_mean(lost_trim,feature)
        swmean = cal_mean(won_trim,feature)

        # Call order (contenders, winners, losers) is kept so the random
        # stream is consumed exactly as before.
        p_contenders = permutation(nominated_trim,released_trim,feature)
        p_winners = permutation(won_trim,released_trim,feature)
        p_losers = permutation(lost_trim,released_trim,feature)

        boxoffice[genre] = [to_dollar(mean) for mean in (allmean,smean,slmean,swmean)]
        changes = [cal_change(mean,allmean) for mean in (smean,slmean,swmean)]
        percent[genre] = changes
        pvalue[genre] = [p if change != 'N/A' else 'N/A'
                         for p, change in zip((p_contenders,p_losers,p_winners),changes)]

    dic = {'boxoffice': boxoffice, 'percent': percent, 'pvalue': pvalue}

    pos_sigs, neg_sigs = [], []
    genre_rows = zip(pvalue.values(),percent.values())
    for position, (pvals, changes) in enumerate(genre_rows):
        for column, p, change in zip(columnid,pvals,changes):
            if p != 'N/A' and p<0.05:
                bucket = neg_sigs if '-' in change else pos_sigs
                bucket.append([position,column])
    dic['pos_sigs']=pos_sigs
    dic['neg_sigs']=neg_sigs

    return dic

In [14]:
def addSigs(json_path):
    """Recompute ``pos_sigs`` / ``neg_sigs`` for every festival in a JSON file.

    The file at ``json_path`` maps festival names to dicts with ``percent``
    and ``pvalue`` entries (genre -> list of three values). For each festival
    the ``[genre position, column id]`` pairs with a p-value below 0.05 are
    stored under ``neg_sigs`` when the matching percent string contains
    ``'-'`` and under ``pos_sigs`` otherwise. The file is rewritten in place.

    Returns
    -------
    dict
        The updated content of the file.
    """
    with open(json_path,'r') as json_file:
        combine = json.load(json_file)

    columnid = ['contendersdiff','losersdiff','winnersdiff']
    for festival in combine:
        neg_sigs=[]
        pos_sigs=[]
        values = list(combine[festival]['pvalue'].values())
        percentvalues = list(combine[festival]['percent'].values())
        for i in range(len(values)):
            for j in range(3):
                p = values[i][j]
                # Undefined p-values are stored as the string 'N/A' (or null);
                # comparing those with a float would raise TypeError.
                if not isinstance(p,(int,float)):
                    continue
                if p<0.05:
                    if '-' in percentvalues[i][j]:
                        neg_sigs.append([i,columnid[j]])
                    else:
                        pos_sigs.append([i,columnid[j]])
        combine[festival]['pos_sigs']=pos_sigs
        combine[festival]['neg_sigs']=neg_sigs

    with open(json_path,'w') as outfile:
        json.dump(combine,outfile)
    return combine

In [15]:
def updateJSON(json_path,newdata,name):
    """Store ``newdata`` under key ``name`` in the JSON object at ``json_path``.

    The file is read, the key is added or overwritten, and the whole object
    is written back. A file that does not exist yet is treated as an empty
    object, so the first call creates it.

    Returns
    -------
    dict
        The complete updated object.
    """
    try:
        with open(json_path,'r') as json_file:
            newjson = json.load(json_file)
    except FileNotFoundError:
        # First run: nothing has been stored yet.
        newjson = {}
    newjson[name]=newdata
    with open(json_path,'w') as outfile:
        json.dump(newjson,outfile)
    return newjson

In [16]:
def updateCombineData(table,name):
    """Store the statistics derived from ``table`` under key ``name`` in
    ``combinedata.json`` (read, update one key, write back). Returns None."""
    store_path = 'combinedata.json'

    with open(store_path,'r') as source:
        combined = json.load(source)

    combined[name] = generate_json(table,table.shape[0])

    with open(store_path, 'w') as sink:
        json.dump(combined, sink)

In [17]:
def preprocess(dataset,otherGenre,name):
    """Prepare one festival's dataset and refresh the shared dashboard files.

    Steps:

    1. drop a leftover ``'Unnamed: 0'`` column from ``dataset`` (in place) if
       it is present;
    2. relabel the genres listed in ``otherGenre[name]`` as ``'Others'`` and
       save the result to ``<name>_updated_new.csv``;
    3. add/refresh the ``'Genre_<name>'`` column of
       ``filmDataset_forDashBoard.csv`` (the file is read and overwritten);
    4. keep the dashboard columns and save them to ``<name>_data.csv``.

    Returns
    -------
    (dataset, newbo) : tuple of pandas.DataFrame
        The reduced festival dataset and the updated dashboard film table.
    """
    # errors='ignore' makes the drop a no-op when the column is missing; this
    # replaces a bare ``except: pass``. The old block also called
    # ``dataset.to_csv(url, ...)`` with an undefined name ``url``; that call
    # always failed inside the bare except and has been removed.
    dataset.drop('Unnamed: 0',axis=1,inplace=True,errors='ignore')

    dataset = update_dataset(dataset,otherGenre[name])
    dataset.to_csv(name+'_updated_new.csv',encoding='utf-8',index=True)
    newbo = pd.read_csv('filmDataset_forDashBoard.csv',index_col='tconst')
    newbo = update_dataset_new(newbo,dataset,name,otherGenre)
    newbo.to_csv('filmDataset_forDashBoard.csv',encoding='utf-8',index=True)
    dataset = dataset[['Year', 'awardName', 'categoryName', 'isWinner', 'otherAwards','primaryNominees', 'nconst', 
                       'secondaryNominees','Name', 'releaseYear','Genre1','AdjustedTotalGross']]
    dataset.to_csv(name+'_data.csv',encoding='utf-8',index=True)
    return dataset, newbo

In [18]:
def generate_json(dataset,n):
    """Convert rows ``1 .. n-1`` of a summary table into the statistics dict.

    Columns are addressed by position: column 0 is the key, columns
    1/2/5/8 are formatted with ``to_dollar``, columns 3/6/9 are taken as the
    percent entries and columns 4/7/10 are converted to float p-values.
    Row 0 is not read.

    Returns a dict with ``boxoffice``, ``percent``, ``pvalue``, ``pos_sigs``
    and ``neg_sigs``; the last two list ``[position, column id]`` pairs with
    p < 0.05, split by whether the percent entry contains ``'-'``.
    """
    columnid = ['contendersdiff','losersdiff','winnersdiff']
    boxoffice, percent, pvalue = {}, {}, {}

    for row in range(1,n):
        key = dataset.iloc[row,0]
        boxoffice[key] = [to_dollar(amount) for amount in dataset.iloc[row,[1,2,5,8]]]
        percent[key] = [change for change in dataset.iloc[row,[3,6,9]]]
        pvalue[key] = [float(p) for p in dataset.iloc[row,[4,7,10]]]

    c = {'boxoffice': boxoffice, 'percent': percent, 'pvalue': pvalue}

    pos_sigs, neg_sigs = [], []
    for position, (pvals, changes) in enumerate(zip(pvalue.values(),percent.values())):
        for column, p, change in zip(columnid,pvals,changes):
            if p != 'N/A' and p <0.05:
                bucket = neg_sigs if '-' in change else pos_sigs
                bucket.append([position,column])
    c['pos_sigs']=pos_sigs
    c['neg_sigs']=neg_sigs
    return c

In [19]:
def scraper_preprocess_pipeline(event_url,festival_name,start_year,end_year,big_award):
    """Scrape one festival, merge it with the box-office data and store its statistics.

    Parameters
    ----------
    event_url : str
        Event path segment (including the trailing slash) inserted into
        ``https://www.imdb.com/event/<event_url><year>/1/``.
    festival_name : str
        Key used for the output file names and the JSON entries.
    start_year, end_year : int
        Years to scrape; ``end_year`` is exclusive (``range`` semantics).
    big_award : list of str
        Award names whose wins are recorded as ``isWinner``.

    Files written: ``<festival_name>FestivalNominees.csv``,
    ``<festival_name>NomineesBoxOffice_updated.csv``, the files written by
    ``preprocess``, and the ``festival_name`` entries of ``allOptions.json``,
    ``otherGenre.json`` and ``combinedata.json``. Reads ``films_dataset.csv``.
    Returns None.
    """
    #scraper
    meta=['Year','awardName','categoryName','isWinner','tconst','primaryNominees','nconst','secondaryNominees']
    nominees = pd.DataFrame(columns=meta)

    yearly = []
    for i in range(start_year,end_year):
        link = 'https://www.imdb.com/event/'+event_url+str(i)+'/1/?ref_=ev_eh'
        yearly.append(webscrapping(link, meta, i))
    # DataFrame.append was removed in pandas 2.0; concatenate all years once.
    if yearly:
        nominees = pd.concat([nominees]+yearly)
    nominees.to_csv(festival_name+'FestivalNominees.csv', encoding='utf-8', index=False)
    
    #preprocessing
    newboxoffice = pd.read_csv('films_dataset.csv')
    dataset = pd.merge(nominees,newboxoffice,on='tconst')
    # Assign the result back: an in-place fillna on a column selection is not
    # guaranteed to reach the parent frame.
    dataset['Genre1'] = dataset['Genre1'].fillna(value='Unknown')
    dataset.to_csv(festival_name+'NomineesBoxOffice_updated.csv',encoding='utf-8', index=True)
    
    dataset = checkDuplicates(dataset,big_award)
    
    #process
    genre_counts = dataset.Genre1.value_counts()
    # Genres with more than 50 films get their own entry.
    options = list(genre_counts[genre_counts>50].index)
    if len(options)<2:
        # Too few large genres: fall back to the most frequent ones
        # (a single genre for 'Rio', two for every other festival).
        if festival_name=='Rio':
            options = list(genre_counts.index)[:1]
        else:
            options = list(genre_counts.index)[:2]
        print('data too small for each genre in '+festival_name)
    options = options+['Others']
    updateJSON('allOptions.json',options,festival_name)
    others = [i for i in list(dataset.Genre1.unique()) if i not in options]
    otherGenre = updateJSON('otherGenre.json',others,festival_name)
    dataset, newbo = preprocess(dataset,otherGenre,festival_name)
    statics = calculation(dataset,festival_name,options,newbo)
    updateJSON('combinedata.json',statics,festival_name)

In [20]:
def pipeline(dataset_path, table_path, festival_name):
    """Register a festival from a prepared dataset CSV and summary-table CSV.

    Reads both files, stores the table's genres (all but the first ``Genre``
    entry) in ``allOptions.json`` and the remaining dataset genres in
    ``otherGenre.json``, writes the table's statistics to ``combinedata.json``
    and finally runs ``preprocess`` on the dataset. Returns None.
    """
    table = pd.read_csv(table_path,index_col=False)
    dataset = pd.read_csv(dataset_path,index_col='tconst')

    options = list(table.Genre)[1:]
    others = [genre for genre in list(dataset.Genre1.unique()) if genre not in options]

    updateJSON('allOptions.json',options,festival_name)
    otherGenre = updateJSON('otherGenre.json',others,festival_name)
    updateCombineData(table,festival_name)
    preprocess(dataset,otherGenre,festival_name)

In [None]:
# Register Locarno and Sitges from prepared CSV files (no scraping): each call
# reads the box-office dataset and the summary table and updates
# allOptions.json, otherGenre.json and combinedata.json for that festival.
# NOTE(review): both festivals are also passed to scraper_preprocess_pipeline
# in a later cell, which writes the same JSON keys -- whichever runs last wins.
pipeline('locarno_boxoffice.csv','locarno_table.csv','Locarno')
pipeline('sitges_boxoffice.csv','sitges_table.csv','Sitges')

In [74]:
# Festivals processed in this cell, in run order:
# (event_url, festival_name, start_year, end_year, big_award)
_festival_runs = [
    ('ev0000631/', 'Sundance',     1982, 2020, ['Grand Jury Prize']),
    ('ev0000894/', 'Tribeca',      2002, 2020, ['Jury Award']),
    ('ev0000091/', 'Berlin',       1951, 2020, ['Golden Berlin Bear']),
    ('ev0000165/', 'Chicago',      1965, 2019, ['Gold Hugo']),
    ('ev0000569/', 'Rotterdam',    1978, 2020, ['Tiger Award']),
    ('ev0000681/', 'Venice',       1932, 2019, ['Golden Lion']),
    ('ev0000600/', 'Seattle',      1985, 2020, ['Golden Space Needle Award']),
    ('ev0000584/', 'SanFrancisco', 1957, 2020, ['Golden Gate Award']),
    ('ev0000612/', 'Slamdance',    1996, 2020, ['Grand Jury Prize']),
    ('ev0000400/', 'Locarno',      1946, 2020, ['Golden Leopard']),
    ('ev0000155/', 'Sitges',       1971, 2019, ['Best Film']),
    ('ev0000659/', 'Toronto',      1978, 2019, ["People's Choice Award"]),
    ('ev0000384/', 'KarlovyVary',  1948, 2020, ['Crystal Globe']),
    ('ev0000331/', 'HongKong',     1998, 2020, ['Golden Firebird Award']),
    ('ev0000057/', 'Austin',       1995, 2019, ['Feature Film Award']),
    ('ev0003555/', 'Torino',       1997, 2019, ['Prize of the City of Torino']),
    ('ev0000874/', 'Marrakech',    2001, 2019, ['Golden Star']),
]

for _run in _festival_runs:
    scraper_preprocess_pipeline(*_run)

# Keep the notebook namespace free of the helper names used above.
del _festival_runs, _run

data too small for each genre in Rotterdam
data too small for each genre in SanFrancisco
data too small for each genre in Slamdance
data too small for each genre in HongKong
data too small for each genre in Austin
data too small for each genre in Torino
data too small for each genre in Marrakech


In [72]:
# Tokyo: event years 1985-2018 (end_year is exclusive); main award: 'Tokyo Grand Prix'.
scraper_preprocess_pipeline('ev0000655/','Tokyo',1985,2019,['Tokyo Grand Prix'])

In [67]:
# GoldenHorse: event years 1962-2018 (end_year is exclusive); main award: 'Golden Horse Award'.
scraper_preprocess_pipeline('ev0000293/','GoldenHorse',1962,2019,['Golden Horse Award'])

In [68]:
# BuenosAires: event years 1999-2019 (end_year is exclusive); main award: 'Best Film'.
scraper_preprocess_pipeline('ev0000138/','BuenosAires',1999,2020,['Best Film'])

data too small for each genre in BuenosAires


In [69]:
# Gramado: event years 1973-2019 (end_year is exclusive); main award: 'Golden Kikito'.
scraper_preprocess_pipeline('ev0000300/','Gramado',1973,2020,['Golden Kikito'])

data too small for each genre in Gramado


In [75]:
# Cairo: event years 1977-2018 (end_year is exclusive); main award: 'Golden Pyramid'.
scraper_preprocess_pipeline('ev0000141/','Cairo',1977,2019,['Golden Pyramid'])

data too small for each genre in Cairo


In [71]:
# Havana: event years 1979-2018 (end_year is exclusive); main award: 'Grand Coral - First Prize'.
scraper_preprocess_pipeline('ev0000314/','Havana',1979,2019,['Grand Coral - First Prize'])

data too small for each genre in Havana


In [98]:
# Rio: event years 1999-2018 (end_year is exclusive). 'Rio' is special-cased in
# scraper_preprocess_pipeline: when fewer than two genres exceed 50 films,
# only the single most frequent genre is kept (other festivals keep two).
scraper_preprocess_pipeline('ev0001488/','Rio',1999,2019,['Première Brazil'])

data too small for each genre in Rio


In [96]:
# SaoPaulo: event years 1977-2018 (end_year is exclusive); main award: 'Audience Award'.
scraper_preprocess_pipeline('ev0000638/','SaoPaulo',1977,2019,['Audience Award'])

In [23]:
# Asia-Pacific (stored under the key 'AsiaPacific'): event years 1954-2018
# (end_year is exclusive); main award: 'APFF Award'.
scraper_preprocess_pipeline('ev0000045/','AsiaPacific',1954,2019,['APFF Award'])

data too small for each genre in AsiaPacific



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [24]:
# India: event years 1987-2018 (end_year is exclusive); main award: 'Golden Peacock'.
scraper_preprocess_pipeline('ev0001480/','India',1987,2019,['Golden Peacock'])

data too small for each genre in India


In [28]:
# Sydney: event years 1974-2019 (end_year is exclusive); main award: 'Sydney Film Prize'.
scraper_preprocess_pipeline('ev0000637/','Sydney',1974,2020,['Sydney Film Prize'])

data too small for each genre in Sydney



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [26]:
# Beijing: event years 2010-2019 (end_year is exclusive); main award: 'Tiantian Award'.
scraper_preprocess_pipeline('ev0002781/','Beijing',2010,2020,['Tiantian Award'])

data too small for each genre in Beijing


In [27]:
# TokyoF: event years 2000-2018 (end_year is exclusive); main award: 'Grand Prize'.
scraper_preprocess_pipeline('ev0001602/','TokyoF',2000,2019,['Grand Prize'])

data too small for each genre in TokyoF


In [45]:
# AAFCA: event years 2003-2018 (end_year is exclusive); main award: 'AAFCA Award'.
# The recorded output below shows "Mean of empty slice" warnings for this run.
scraper_preprocess_pipeline('ev0002165/','AAFCA',2003,2019,['AAFCA Award'])


Mean of empty slice.


invalid value encountered in less_equal



In [35]:
# Brisbane: event years 1993-2018 (end_year is exclusive); main award: 'Audience Award'.
# The recorded output below shows "Mean of empty slice" warnings for this run.
scraper_preprocess_pipeline('ev0000121/','Brisbane',1993,2019,['Audience Award'])

data too small for each genre in Brisbane



Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in less_equal



In [21]:
# Jerusalem: event years 1984-2019 (end_year is exclusive); main award: 'In Spirit for Freedom Award'.
scraper_preprocess_pipeline('ev0000375/','Jerusalem',1984,2020,['In Spirit for Freedom Award'])

data too small for each genre in Jerusalem



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [22]:
# Haifa: event years 1996-2019 (end_year is exclusive); main award: 'Golden Anchor Award'.
scraper_preprocess_pipeline('ev0000309/','Haifa',1996,2020,['Golden Anchor Award'])

data too small for each genre in Haifa


In [23]:
# GrandBell: event years 1962-2018 (end_year is exclusive); main award: 'Grand Bell Award'.
scraper_preprocess_pipeline('ev0000925/','GrandBell',1962,2019,['Grand Bell Award'])

In [21]:
# Fajr: event years 1985-2019 (end_year is exclusive); main award: 'Crystal Simorgh'.
scraper_preprocess_pipeline('ev0000747/','Fajr',1985,2020,['Crystal Simorgh'])

data too small for each genre in Fajr



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [22]:
# Singapore: event years 1990-2018 (end_year is exclusive); main award: 'Silver Screen Award'.
scraper_preprocess_pipeline('ev0000610/','Singapore',1990,2019,['Silver Screen Award'])

data too small for each genre in Singapore


In [23]:
# Yamagata: event years 1989-2015 (end_year is exclusive); main award: 'Robert and Frances Flaherty Prize'.
scraper_preprocess_pipeline('ev0000883/','Yamagata',1989,2016,['Robert and Frances Flaherty Prize'])

data too small for each genre in Yamagata


In [24]:
# Shanghai: event years 1993-2019 (end_year is exclusive); main award: 'Golden Goblet'.
scraper_preprocess_pipeline('ev0000605/','Shanghai',1993,2020,['Golden Goblet'])

In [101]:
# Recompute pos_sigs / neg_sigs for every festival stored in combinedata.json;
# addSigs rewrites the file and returns its updated content.
combine = addSigs('combinedata.json')

In [30]:
# Load the merged Toronto nominee/box-office file written by
# scraper_preprocess_pipeline. `toronto` is not used again in the cells shown here.
toronto = pd.read_csv('TorontoNomineesBoxOffice_updated.csv')

In [36]:
    # Load the full content of combinedata.json for manual inspection/editing.
    # Despite its name, `options` holds the combined per-festival statistics.
    with open('combinedata.json','r') as json_file:
        options = json.load(json_file)

In [90]:
    # Write the (manually edited) `options` dict back to combinedata.json.
    # Requires `options` to have been loaded by an earlier cell.
    with open('combinedata.json','w') as outfile:
        json.dump(options,outfile)

In [89]:
# NOTE(review): 'Cario' looks like a misspelling of 'Cairo' (the key used by the
# Cairo run); presumably this removes a stale entry left by an earlier run --
# confirm. Raises KeyError when the key is absent, so the cell cannot be re-run.
# The execution counts show it ran before the cell that writes `options` to disk.
del options['Cario']

In [32]:
# Scratch: reload the inputs that scraper_preprocess_pipeline merges for
# GoldenHorse (the film dataset and the scraped nominees).
new = pd.read_csv('films_dataset.csv')
nominees = pd.read_csv('GoldenHorseFestivalNominees.csv')

In [45]:
# Scratch: same merge as in scraper_preprocess_pipeline; requires `nominees`
# and `new` from the loading cell.
dataset = pd.merge(nominees,new,on='tconst')

In [31]:
# Scratch check of the rounding step used in cal_change.
round(1.508e+03,2)

1508.0

In [33]:
# Scratch check of the percent formatting used in cal_change.
'{:.6}%'.format(round(1.508e+03,2))

'1508.0%'

In [46]:
# Scratch: index the merged frame by tconst (in place). Not re-runnable without
# rebuilding `dataset`, because the column is consumed by the first run.
dataset.set_index('tconst',inplace=True)

In [37]:
# Display the whole combinedata.json content loaded into `options`.
options

{'Sundance': {'boxoffice': {'Drama': ['$659,491',
    '$754,727',
    '$659,484',
    '$905,996'],
   'Documentary': ['$78,168', '$279,523', '$267,866', '$295,371'],
   'Comedy': ['$2,206,181', '$2,820,191', '$1,984,729', '$3,951,767'],
   'Biography': ['$2,469,243', '$1,105,821', '$1,112,239', '$1,096,968'],
   'Crime': ['$2,546,654', '$2,133,397', '$1,737,939', '$2,561,810'],
   'Others': ['$6,979,926', '$2,268,814', '$1,597,432', '$2,982,157']},
  'percent': {'Drama': ['14.44%', '-0.001061%', '37.38%'],
   'Documentary': ['257.6%', '242.7%', '277.9%'],
   'Comedy': ['27.83%', '-10.04%', '79.12%'],
   'Biography': ['-55.22%', '-54.96%', '-55.57%'],
   'Crime': ['-16.23%', '-31.76%', '0.5951%'],
   'Others': ['-67.5%', '-77.11%', '-57.28%']},
  'pvalue': {'Drama': [0.0855, 0.5109, 0.0168],
   'Documentary': [0.0, 0.0, 0.0],
   'Comedy': [0.0126, 0.2759, 0.0002],
   'Biography': [0.0015, 0.0131, 0.0291],
   'Crime': [0.2753, 0.1871, 0.4598],
   'Others': [0.0064, 0.0209, 0.1021]},
  'p