In [None]:
import requests
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from nltk.tree import Tree
import xml.etree.ElementTree as ET
import itertools
import matplotlib.dates as mdates

from load_data import *
pd.options.mode.chained_assignment = None

In [None]:
# Optional steps, left disabled (presumably a one-off download and the plot-summary
# table, which none of the cells shown here use):
#   download_data()
#   plot_df = load_plot_df()

# Load the four tables used below. The loaders are presumably the names brought in by
# `from load_data import *`. In call order: movie metadata, character metadata,
# name clusters and, per the original note, TV tropes.
movie_df, char_df, names_df, cluster_df = (
    load_movie_df(),
    load_char_df(),
    load_names_df(),
    load_cluster_df(),
)

### First graph

In [None]:
# Derive the release year and decade for every movie that has a release date.
# The year is the first four characters of the date's string form; the decade is the
# year truncated to a multiple of ten.
movie_df = (
    movie_df[movie_df['Release date'].notna()]
    .assign(**{
        'Release date year': lambda frame: frame['Release date'].map(
            lambda date: int(str(date)[:4])
        )
    })
    .assign(**{
        'Release date decade': lambda frame: frame['Release date year'].map(
            lambda year: int(year / 10) * 10
        )
    })
)

# Number of movies per decade (displayed as the cell output)
movie_df['Release date decade'].value_counts()

In [None]:
# Genres whose evolution through time is studied (per the original note: the top 20 genres).
# The variable name is historical: earlier versions of this list only held romance
# sub-genres ('Romantic comedy', 'Romance Film', 'Romantic drama', ...).
romance_genres = [
    'Drama', 'Comedy', 'Romance Film', 'Black-and-white', 'Action',
    'Thriller', 'Short Film', 'World cinema', 'Crime Fiction', 'Indie',
    'Documentary', 'Horror', 'Silent film', 'Adventure', 'Family Film',
    'Action/Adventure', 'Comedy film', 'Musical', 'Animation', 'Romantic drama',
]


def is_romantic(i):
    """Build a predicate that checks a genre list against ``romance_genres[i]``.

    Args:
        i: Index applied to ``romance_genres``. With a slice the selection is a
            sub-list and membership is exact; with a plain int the selection is a
            single string, so ``in`` becomes a substring test.

    Returns:
        A one-argument function. It returns True when its argument is a ``list``
        holding at least one element found in ``romance_genres[i]``, and False
        otherwise, including for any argument that is not exactly a ``list``
        (e.g. NaN for a missing genre cell).
    """
    def has_selected_genre(x):
        # Anything that is not a list has no genres to test
        if type(x) is not list:
            return False
        for y in x:
            # romance_genres is looked up at call time, like the original lambda
            if y in romance_genres[i]:
                return True
        return False

    return has_selected_genre

# Keep the movies tagged with at least one of the first five entries of `romance_genres`
# and make sure every kept movie has a release date.
romance_movies = movie_df[movie_df['Genres'].apply(is_romantic(slice(0, 5)))]
romance_movies = romance_movies[~romance_movies['Release date'].isna()]

# Parse each release year once and build one membership mask per genre, instead of
# re-scanning every row for each of the (decade, genre) combinations.
release_years = romance_movies['Release date'].apply(lambda x: int(str(x)[:4]))
genre_masks = {
    genre: romance_movies['Genres'].apply(lambda x, genre=genre: genre in x)
    for genre in romance_genres
}

# genre_counts[decade][genre] -> rows of romance_movies released in [decade, decade + 10)
# and tagged with that genre
genre_counts = {}
for year in range(1890, 2021, 10):
    in_decade = (release_years >= year) & (release_years < year + 10)
    genre_counts[year] = {
        genre: romance_movies[in_decade & genre_masks[genre]]
        for genre in romance_genres
    }

# Row counts per decade and genre. Despite its name this dict holds counts, not proportions.
# The 2020 bucket computed above is not carried over into the chart data.
genre_counts_prop = {
    year: {genre: len(movies) for genre, movies in genre_counts[year].items()}
    for year in range(1890, 2020, 10)
}

# Long-format table: one row per (genre, decade)
genre_counts_prop_df = (
    pd.DataFrame(genre_counts_prop)
    .reset_index()
    .rename(columns={'index': 'Genre'})
    .melt(id_vars=['Genre'], var_name='Decade', value_name='Number of movies')
)

# Share of ALL dated movies of the decade (not only the filtered ones) carrying the genre.
# The per-decade totals are computed once; a decade absent from movie_df gives NaN
# instead of raising KeyError.
# NOTE: the totals read movie_df['Release date decade'], so they change if that column
# is re-bucketed elsewhere in the notebook before this cell is (re-)run.
movies_per_decade = movie_df['Release date decade'].value_counts().to_dict()
decade_totals = genre_counts_prop_df['Decade'].map(
    lambda decade: movies_per_decade.get(decade, float('nan'))
)
genre_counts_prop_df['Proportion of movies'] = (
    genre_counts_prop_df['Number of movies'] / decade_totals
)
genre_counts_prop_df['Proportion of movies in percentage'] = (
    genre_counts_prop_df['Proportion of movies'].mul(100).round(2)
)

# Turn the count into a display label (it is used as the hover name of the chart)
genre_counts_prop_df['Number of movies'] = (
    "Nbr movies: " + genre_counts_prop_df['Number of movies'].astype(str)
)

In [None]:
import plotly.express as px

# Animated bar chart: one frame per decade, one bar per genre
fig = px.bar(
    genre_counts_prop_df,
    x="Genre",
    y="Proportion of movies in percentage",
    animation_frame="Decade",
    animation_group="Genre",
    color="Genre",
    hover_name="Number of movies",
    range_y=[0, 80],
)

# Drop the default animation buttons so the custom Play/Pause buttons below replace
# them instead of being merged into them
fig["layout"].pop("updatemenus")

fig.update_layout(
    showlegend=False,
    # Centered title
    title={
        'text': "Distribution of movies genres across time",
        'y': 0.98,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    yaxis_title="Proportion of movies genre by decade in percentage",
    xaxis_title="",
    # Play/Pause buttons controlling the speed of the animation
    updatemenus=[
        dict(
            type="buttons",
            buttons=[
                dict(
                    args=[None, {"frame": {"duration": 500, "redraw": False},
                                 "fromcurrent": True,
                                 "transition": {"duration": 400, "easing": "quadratic-in-out"}}],
                    label="Play",
                    method="animate",
                ),
                dict(
                    args=[[None], {"frame": {"duration": 0, "redraw": False},
                                   "mode": "immediate",
                                   "transition": {"duration": 0}}],
                    label="Pause",
                    method="animate",
                ),
            ],
        )
    ],
)

# Display only once the figure is fully configured, so the inline output matches the
# exported HTML (previously it was shown before the title and buttons were applied)
fig.show()
fig.write_html("genre_distrib.html")

### Second graph

In [None]:
# Tab-separated character table. The cluster ids stay in 'labels' and are mirrored
# into a 'Cluster' column.
PATH = 'final_df.csv'
df = pd.read_csv(PATH, sep='\t').assign(Cluster=lambda frame: frame['labels'])

In [None]:
# Human-readable cluster names; label n is looked up at position n - 1
titles = [
    'Decision-makers', 'Heroes & anti-heroes', 'Femme fatale', 'Cunning evil',
    'Clumsy', 'Virtuous', 'Righteous warrior', 'Benevolent leader', 'Wise mentor',
    'Captain', 'Ingenuous', 'Tycoon', 'Ruthless commander', 'Arrogant leader',
    'Love interest', 'Reconciliator', 'Adventurous woman', 'Apprentice',
    'Young lover', 'Logistician', 'Lawyer', 'Stubborn fool', 'Eccentric villain',
    'Marksman', 'Goofy friend', 'Hardworking learner', 'Benevolent',
    'Sophisticated psycopath', 'Nemesis', 'Corrupt', 'Good cop', 'Musician',
    'Protector', 'Family-oriented',
]

# Replace the numeric label by its name.
# NOTE(review): a label of 0 would silently pick the last title (index -1) — confirm
# that labels start at 1.
df['Cluster'] = df['labels'].map(lambda label: titles[label - 1])

In [None]:
# Cast the 'Wikipedia ID' column to int in both frames (it is the key of the merge
# performed further down)
for frame in (df, movie_df):
    frame['Wikipedia ID'] = frame['Wikipedia ID'].astype(int)

In [None]:
# Get the release year and decade, merging every movie released before 1970 into a
# single 1960 bucket.
# Work on an explicit copy and assign through .loc: the former chained form
# `movie_df[col][mask] = value` only worked by accident and does nothing under
# pandas Copy-on-Write.
# NOTE: this overwrites 'Release date decade' in place; cells that need the plain
# per-decade values must recompute the column first.
movie_df = movie_df[movie_df['Release date'].notna()].copy()
movie_df['Release date year'] = movie_df['Release date'].apply(lambda x: int(str(x)[:4]))
movie_df['Release date decade'] = movie_df['Release date year'].apply(lambda x: int(x / 10) * 10)
movie_df.loc[movie_df['Release date year'] < 1970, 'Release date decade'] = 1960

In [None]:
# Attach the movie metadata to every character row. pandas' default is an inner join,
# so rows without a matching 'Wikipedia ID' on the other side are dropped; columns
# present in both frames get the default '_x' (df) and '_y' (movie_df) suffixes.
new_df = pd.merge(df, movie_df, on='Wikipedia ID')

In [None]:
# Rank the cluster labels by the number of rows of new_df carrying them and keep
# the ten largest, then translate them to their display names
cluster_sizes = new_df.groupby('labels').size()
most_common_clusters = cluster_sizes.sort_values(ascending=False).iloc[:10].index
most_common_clusters_name = [titles[label - 1] for label in most_common_clusters]
most_common_clusters_name

In [None]:
def make_cluster(df, i):
    """Return the rows of ``df`` labelled ``i`` with per-decade statistics attached.

    Args:
        df: DataFrame with at least the columns 'labels', 'Release date decade',
            'Box office revenue_y', 'Box office revenue_x' and
            'Actor age at release'. It is not modified.
        i: Value of 'labels' selecting the cluster.

    Returns:
        A new DataFrame holding the rows of ``df`` where ``labels == i`` with four
        columns appended, in this order:

        * 'Number of characters in cluster': rows of the cluster in the row's decade.
        * 'Ratio of characters in cluster by decade': 1 / (rows of ``df`` in the
          row's decade), i.e. the same value for every row of a decade.
        * 'Revenue by decade': sum of 'Box office revenue_y' over the cluster rows
          of the row's decade.
        * 'Ratio of revenue in cluster by decade': the previous value divided by the
          sum over all rows of ``df`` in that decade, rounded to 2 decimals; 0 when
          that total is 0.

        Missing values in 'Box office revenue_y', 'Box office revenue_x' and
        'Actor age at release' are replaced by the string 'Not available'.
    """
    # Explicit copy: the columns added below must never write into the caller's frame
    cluster = df[df['labels'] == i].copy()
    decades = cluster['Release date decade']

    # Per-decade aggregates, computed before any revenue value is replaced by text
    nbr_char_cluster = cluster.groupby('Release date decade').size()
    nbr_char_by_decade = df.groupby('Release date decade').size()
    total_revenue_by_decade = df.groupby('Release date decade')['Box office revenue_y'].sum()
    rev_cluster = cluster.groupby('Release date decade')['Box office revenue_y'].sum()

    # Column order matters: callers may address the result positionally
    cluster['Number of characters in cluster'] = decades.map(nbr_char_cluster)
    cluster['Ratio of characters in cluster by decade'] = 1 / decades.map(nbr_char_by_decade)
    cluster['Revenue by decade'] = decades.map(rev_cluster)

    revenue_total = decades.map(total_revenue_by_decade)
    revenue_ratio = (cluster['Revenue by decade'] / revenue_total).round(2)
    # A decade without any revenue yields 0 instead of the NaN produced by 0 / 0
    cluster['Ratio of revenue in cluster by decade'] = revenue_ratio.where(revenue_total != 0, 0)

    for column in ('Box office revenue_y', 'Box office revenue_x', 'Actor age at release'):
        cluster[column] = cluster[column].fillna('Not available')
    return cluster


In [None]:
import plotly.graph_objects as go

import pandas as pd

# Initialize figure
fig = go.Figure()

# One bar trace per most common cluster: one bar per character row, positioned at the
# row's decade and coloured by the cluster's share of the decade revenue.
# NOTE(review): every row carries 1 / (rows of new_df in its decade); the bars of a
# decade only add up to the cluster's share if they are stacked — confirm the bar mode.
for i in most_common_clusters:
    cluster = make_cluster(new_df, i)
    fig.add_trace(go.Bar(x=cluster["Release date decade"],
                         y=cluster["Ratio of characters in cluster by decade"] * 100,
                         marker=dict(
                             color=(cluster['Ratio of revenue in cluster by decade'] * 100),
                             colorscale='viridis',
                             showscale=True),
                         hovertext=cluster["Character name"],
                         # The customdata indices are column positions of `cluster`, so they
                         # depend on the column order of new_df — TODO switch to named columns
                         hovertemplate=
                         "<b>%{hovertext}</b><br><br>" +
                         "Movie title: %{customdata[29]}<br>" +
                         "Release date: %{customdata[37]}<br>" +
                         "Actor age at release: %{customdata[20]}<br>" +
                         "Box office revenue: %{customdata[31]:.2s}<br>",
                         customdata=cluster.values.tolist(),
                         name=titles[i-1]))

# One dropdown entry per cluster: show only that cluster's trace and use its name as
# title. Built from the list itself so it also works with fewer than ten clusters.
n_clusters = len(most_common_clusters_name)
cluster_buttons = [
    dict(label=cluster_name,
         method="update",
         args=[{"visible": [position == selected for position in range(n_clusters)]},
               {"title": cluster_name}])
    for selected, cluster_name in enumerate(most_common_clusters_name)
]

fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=cluster_buttons,
        )
    ],
    title_text="Characters cluster by decade",
    xaxis_title="Decade",
    yaxis_title="Ratio of characters in cluster by decade",
)
# Modify x labels: the 1960 bucket gathers everything released before 1970
fig.update_xaxes(tickvals=[1960, 1970, 1980, 1990, 2000, 2010],
                 ticktext=['Up to 1960s', '1970s', '1980s', '1990s', '2000s', '2010s'])

# Start with only the first trace visible, matching the active dropdown entry
fig.update_traces(visible=False)
if fig.data:
    fig.data[0].visible = True

fig.show()
# Save as html
fig.write_html("cluster.html")