In [73]:
import os
import pickle
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as offline
import plotly.graph_objs as go
from Cluster_analysis import *
# Silence pandas' SettingWithCopyWarning for the whole notebook.
pd.set_option('mode.chained_assignment', None)


# Hover text shared by every scatter trace. Each customdata slot is filled by
# the plotting helpers; slot 5 is not displayed.
Hovertemplate = (
    "<b>%{hovertext}</b><br><br>"
    "Movie: %{customdata[0]}<br>"
    "Release date: %{customdata[1]}<br>"
    "Actor name: %{customdata[2]}<br>"
    "Actor age at release: %{customdata[3]}<br>"
    "Gender: %{customdata[4]}<br>"
    "Character archetype: %{customdata[6]}<br>"
    "Description: %{customdata[7]}<br>"
    "Genres: %{customdata[8]}<br>"
    "Box office revenue: %{customdata[9]}<br>"
)



## Loading data 

In [74]:
PATH = 'Data/final_df_test.csv'
# Number of placeholder centroid titles and the seed that makes them
# reproducible across "Restart & Run All".
NUM_TITLES = 48
TITLE_SEED = 0


def _parse_partner(raw):
    """Turn the CSV text of a partner list into a list of character IDs.

    The 'No partner' sentinel is returned unchanged. Any other value is
    expected to look like "['id1', 'id2']": the surrounding brackets are
    dropped, the rest is split on commas, and spaces and single quotes are
    stripped from each piece. Empty pieces (e.g. from "[]") are discarded so
    that no row ends up with a bogus '' partner ID.
    """
    if raw == 'No partner':
        return raw
    pieces = (piece.strip(' ').strip('\'') for piece in raw[1:-1].split(','))
    return [piece for piece in pieces if piece]


# Load final_df from the tab-separated csv file
df = pd.read_csv(PATH, sep='\t')
# Fill missing values with 'Not Available' in every column ...
df = df.fillna('Not Available')
# ... then give the 'partner' column its own sentinel, 'No partner'
df['partner'] = df['partner'].replace('Not Available', 'No partner')
# Parse every remaining partner entry into a list of IDs (single pass)
df['partner'] = df['partner'].apply(_parse_partner)

# Make a list of NUM_TITLES random 10-letter strings, drawn from a seeded
# generator so the labels are the same on every run
rng = np.random.default_rng(TITLE_SEED)
alphabet = list('abcdefghijklmnopqrstuvwxyz')
titles = [''.join(rng.choice(alphabet, 10)) for _ in range(NUM_TITLES)]

# computes centroid of each cluster
centroids = compute_centroids(df, 'labels')


## Full graph

In [None]:
# Set to True to also export the figure as a standalone HTML file
store = False
# Build the scatter plot of all characters, overlay the cluster centroids,
# then apply the shared layout and hover template
fig = set_layout(
    add_centroids(make_full_graph(df), centroids, titles),
    df,
    Hovertemplate,
)
fig.show()
if store:
    fig.write_html("Plots/Full_plot.html")

## Romance vs Non-romance & Gender

In [None]:
store = True
fig_romgen = go.Figure()

# Row masks for the two dimensions we split on. 'Not Available' is the
# sentinel written by fillna() when the gender is missing.
is_romance = df['romance'] == True
is_nonromance = df['romance'] == False
is_male = df['Gender'] == 'M'
is_female = df['Gender'] == 'F'
is_unknown = df['Gender'] == 'Not Available'

# One dataframe per (gender, romance) combination
df_male_rom = df[is_male & is_romance]
df_female_rom = df[is_female & is_romance]
df_unknown_rom = df[is_unknown & is_romance]
df_male_nonrom = df[is_male & is_nonromance]
df_female_nonrom = df[is_female & is_nonromance]
df_unknown_nonrom = df[is_unknown & is_nonromance]

# Trace order in fig_romgen follows this list:
#   0 male/rom, 1 female/rom, 2 unknown/rom,
#   3 male/nonrom, 4 female/nonrom, 5 unknown/nonrom
df_list = [df_male_rom, df_female_rom, df_unknown_rom,
           df_male_nonrom, df_female_nonrom, df_unknown_nonrom]
for df_item in df_list:
    # Only the scatter trace (data[0]) of the temporary figure is kept, so
    # there is no point in adding centroids to it.
    fig = set_layout(make_full_graph(df_item), df_item, Hovertemplate)
    fig_romgen.add_trace(fig.data[0])

fig_romgen = set_layout(fig_romgen, df, Hovertemplate)
fig_romgen = add_centroids(fig_romgen, centroids, titles)
num_traces = len(fig_romgen.data)
num_romgen_comb = len(df_list)


def _visibility_mask(shown_groups):
    """Return one bool per trace: the selected group traces plus all centroids.

    Centroid traces are assumed to have been appended after the group traces,
    i.e. to occupy indices >= num_romgen_comb; they stay visible in every view.
    (The previous `i > num_romgen_comb + 1` test hid the first two of them.)
    """
    return [i in shown_groups or i >= num_romgen_comb for i in range(num_traces)]


boolean_male_rom = _visibility_mask({0})
boolean_female_rom = _visibility_mask({1})
boolean_male_nonrom = _visibility_mask({3})
boolean_female_nonrom = _visibility_mask({4})
boolean_romance = _visibility_mask({0, 1, 2})
boolean_nonromance = _visibility_mask({3, 4, 5})
boolean_all = [True for i in range(num_traces)]

# (button label, visibility mask) in the order the buttons are displayed
button_specs = [
    ("All romance", boolean_romance),
    ('All non romance', boolean_nonromance),
    ("Romantic males", boolean_male_rom),
    ('Romantic females', boolean_female_rom),
    ('Non-romantic males', boolean_male_nonrom),
    ('Non-romantic females', boolean_female_nonrom),
]

fig_romgen.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            # Buttons appear above the plot, anchored at their top-left corner
            x=0.1,
            y=1.12,
            xanchor='left',
            yanchor='top',
            # Black text on grey buttons
            font=dict(color="black"),
            bgcolor="grey",
            buttons=[
                dict(args=[{"visible": mask}], label=label, method="update")
                for label, mask in button_specs
            ],
        )
    ]
)

# Graph is initially set to show all romance, so set the traces like that
for trace, visible in zip(fig_romgen.data, boolean_romance):
    trace.visible = visible


fig_romgen.show()
if store:
    fig_romgen.write_html("Plots/Romancegen_plot.html")


## Relations

In [None]:
def add_relation(fig_total, df):
    """Add the characters of `df` and their partner links to `fig_total`.

    First the scatter trace produced by make_full_graph(df) (after set_layout
    with the notebook-level Hovertemplate) is appended. Then, for every row
    whose 'partner' entry is not the 'No partner' sentinel, one line trace is
    appended per partner ID, connecting the row's (X, Y, Z) to the partner's.

    Parameters
    ----------
    fig_total : figure the traces are appended to (modified in place).
    df : dataframe with columns 'partner', 'Freebase character ID', 'X', 'Y'
        and 'Z'. Rows are addressed by position, so any index is accepted.

    Returns
    -------
    (fig_total, num_traces_added) where num_traces_added counts the scatter
    trace plus every line trace appended by this call.

    Notes
    -----
    A partner ID that does not occur in `df` (for example the '' left over
    from an empty partner list) is skipped instead of raising IndexError.
    """
    fig = make_full_graph(df)
    fig = set_layout(fig, df, Hovertemplate)
    fig_total.add_trace(fig.data[0])
    num_traces_added = 1

    # Pull the columns out once; per-cell Series lookups inside the loop are
    # slow and depend on the index labels being 0..n-1.
    partner_lists = df['partner'].tolist()
    xs = df['X'].tolist()
    ys = df['Y'].tolist()
    zs = df['Z'].tolist()

    # Character ID -> position of its first occurrence, replacing a full
    # column scan per partner with a dictionary lookup.
    position_of = {}
    for pos, char_id in enumerate(df['Freebase character ID'].tolist()):
        position_of.setdefault(char_id, pos)

    line_traces = []
    for i, partner_ids in enumerate(partner_lists):
        if partner_ids == 'No partner':
            continue
        for partner_id in partner_ids:
            j = position_of.get(partner_id)
            if j is None:
                # Partner is not part of this dataframe: nothing to connect to
                continue
            # Thin white line between the two characters, drawn at 0.4 opacity
            line_traces.append(go.Scatter3d(
                x=[xs[i], xs[j]],
                y=[ys[i], ys[j]],
                z=[zs[i], zs[j]],
                mode='lines',
                hoverinfo='none',
                line=dict(color='white', width=1),
                opacity=0.4,
            ))

    if line_traces:
        # Append all lines in one call, in the order they were built
        fig_total.add_traces(line_traces)
        num_traces_added += len(line_traces)
    return fig_total, num_traces_added



In [None]:
store = True

# Split the characters by romance flag; each subset is re-indexed from 0 so
# its rows can be addressed as 0..n-1
df_romance = df.loc[df['romance'] == True].reset_index(drop=True)
df_nonromance = df.loc[df['romance'] == False].reset_index(drop=True)

# Trace layout of fig_relations:
#   [romance scatter + lines][non-romance scatter + lines][centroids]
fig_relations = go.Figure()
fig_relations, num_traces_romance = add_relation(fig_relations, df_romance)
fig_relations, num_traces_nonromance = add_relation(fig_relations, df_nonromance)
fig_relations = add_centroids(fig_relations, centroids, titles)
fig_relations = set_layout(fig_relations, df, Hovertemplate)

num_centroids = len(centroids)
first_centroid = num_traces_romance + num_traces_nonromance
num_traces_total = first_centroid + num_centroids

# Visibility masks, one bool per trace
boolean_all = [True] * num_traces_total
# Everything except the non-romance section
boolean_romance = [
    not (num_traces_romance <= i < first_centroid)
    for i in range(num_traces_total)
]
# Everything except the romance section
boolean_nonromance = [i >= num_traces_romance for i in range(num_traces_total)]

# (button label, visibility mask) in display order
relation_buttons = [
    ('All movies', boolean_all),
    ("Romantic movies", boolean_romance),
    ('Non-romantic movies', boolean_nonromance),
]

button_menu = dict(
    type="buttons",
    direction="left",
    # Placed above the plot, anchored at the menu's top-left corner
    x=0.3,
    y=1.12,
    xanchor='left',
    yanchor='top',
    # Black text on grey buttons
    font=dict(color="black"),
    bgcolor="grey",
    buttons=[
        dict(args=[{"visible": mask}], label=label, method="update")
        for label, mask in relation_buttons
    ],
)
fig_relations.update_layout(updatemenus=[button_menu])

# The graph starts on "All movies", so apply that mask to the traces
for i, visible in enumerate(boolean_all):
    fig_relations.data[i].visible = visible

fig_relations.show()

if store:
    fig_relations.write_html("Plots/Relations_plot.html")




## Slider for dates

In [75]:
store = True
df_list = split_by_date(df)

# Number of time periods shown on the slider (one scatter trace each).
# NOTE(review): split_by_date appears to report more than 5 frames; only the
# first 5 are plotted, as before - confirm whether the last one should be shown.
num_periods = 5

# Create a plotly graph with one trace per time period
fig_time = go.Figure()

for i in range(num_periods):
    period_fig = set_layout(make_full_graph(df_list[i]), df_list[i], Hovertemplate)
    fig_time.add_trace(period_fig.data[0])

# set layout fig_time
fig_time = set_layout(fig_time, df, Hovertemplate)
fig_time = add_centroids(fig_time, centroids, titles)

# Count the traces that are really in the figure instead of deriving the
# number from len(df_list), which need not match the number of period traces.
# Traces 0..num_periods-1 are the periods; everything after them is assumed to
# be the centroid traces appended by add_centroids.
num_traces = len(fig_time.data)

steps = []
for i in range(num_periods):
    # Show period i and every centroid, hide the other periods
    visible = [idx == i or idx >= num_periods for idx in range(num_traces)]
    steps.append(dict(
        method="restyle",
        args=["visible", visible],
        label='Time period ' + str(i+1),
    ))

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Time period: "},
    # Set text color for the slider to white
    font={"color": "white"},
    pad={"t": 50},
    steps=steps,
)]

# At the beginning, only the first period trace should be visible
for idx in range(num_periods):
    fig_time.data[idx].visible = (idx == 0)

fig_time.update_layout(
    sliders=sliders,
)

fig_time.show()

if store:
    # Export the slider figure itself (previously the leftover temporary
    # single-period figure `fig` was written instead)
    fig_time.write_html("Plots/plot_time.html")


Size of df_1:  168
Size of df_2:  412
Size of df_3:  1067
Size of df_4:  1491
Size of df_5:  4466
Size of df_6:  8148


In [None]:
# # For drawing clusters farther apart

# df['X'] = df['X'] + df['labels'] * 0.2
# df['Y'] = df['Y'] + df['labels'] * 0.2
# df['Z'] = df['Z'] + df['labels'] * 0.2

# # Bring the points in cluster 1 closer to the centroid of cluster 1, leave the other points unchanged
# for i in range(len(df)):
#     if df['labels'][i] == 1:
#         df['X'][i] = df['X'][i] - 0.2 * (df['X'][i] - centroids[1][0])
#         df['Y'][i] = df['Y'][i] - 0.2 * (df['Y'][i] - centroids[1][1])
#         df['Z'][i] = df['Z'][i] - 0.2 * (df['Z'][i] - centroids[1][2])
