### GENERAL

In [8]:
# Use plotly to visualize the face-recognition results

import plotly.express as px
import plotly.io as pio
import pandas as pd

# Load all csv files in csv/
import glob
import os

# Every recognition CSV under csv/; the loading loop below uses each file's base
# name (minus ".csv") as the key into FILM_FRAME_TOTALS.
csv_files = glob.glob("csv/*.csv")
# Each data row is read as: field 0 = frame number, field 1 = face box, field 5 = identity.
# The first two lines of each file are skipped.
# Rows whose identity is "N/A" are skipped.

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = False
# Read by get_name(): True selects the "Male"/"Female" labels, False selects "Protagonist"/"Love".
USE_GENDER_NAMES = False
# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Per-film denominator used to turn a frame number into a position on the shared timeline.
# presumably the number of sampled frames in each film — TODO confirm
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Role lists. In this cell only RIVALS is populated: the other four are deliberately
# emptied (their former contents are kept in the trailing comments), so with
# USE_GENERAL_NAMES = False and IGNORE_MINOR_CHARACTERS = True the filter below
# keeps rows for the names in RIVALS only.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = []#["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = []#["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = []#["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = []#["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def get_name(name):
	"""Return the generic label for a character name, or the name itself.

	With USE_GENDER_NAMES set, members of MALES / LADIES become "Male" /
	"Female"; otherwise members of PROTAGONISTS / LOVES become
	"Protagonist" / "Love". Names found in neither list are returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		groups = ((MALES, "Male"), (LADIES, "Female"))
	else:
		groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))
	# The first list that contains the name decides the label
	for members, label in groups:
		if name in members:
			return label
	return name

# Collect one [frame, face box, identity, timeline position] row per kept detection.
data = []
for file in csv_files:
	with open(file) as f:
		lines = f.readlines()[2:]  # the first two lines are not data rows
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this detection
		# The film is identified by the CSV file name without its ".csv" suffix
		total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
		# Drop the first and last character of the field (presumably its quotes)
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		if USE_GENERAL_NAMES:
			name = get_name("Rival" if name in RIVALS else name)
		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue
		# frame / total * 9 places every film on the same 0-9 timeline
		data.append([frame, line[1], name.replace(" ", "_"), frame / total * 9])

df = pd.DataFrame(data, columns=["Frame", "Face Box", "Identity", "Timeline"])
# Keep only the text before the first space of each identity
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Scatter plot: one marker per detection, one row per identity
fig = px.scatter(
	df,
	x="Timeline",
	y="Identity",
	color="Identity",
	title="Face Recognition Results in All Films",
	color_discrete_sequence=px.colors.qualitative.Vivid,
	height=640,
)
layout_options = dict(
	plot_bgcolor='rgb(240, 240, 230)',
	font=dict(family="Georgia", size=14),
	title_font=dict(size=22),
	legend_title_font=dict(size=16),
	margin=dict(t=50, b=40, l=40, r=40),
	height=560,  # applied after px.scatter, so this is the height that takes effect
)
fig.update_layout(**layout_options)
grid_style = dict(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)
fig.show()



### CHUNKED PRESENCE

Only use this for one film. This is an older implementation of the averaging method and does not count chunk totals as expected.

In [28]:
# Number of equal-width timeline sections each film is split into (passed to find_section()).
CHUNKING = 9

# Only use this cell for one film
# Use plotly to visualize the chunked presence counts

import plotly.express as px
import pandas as pd

# glob/os are used to locate the CSV and derive the film title from its path
import glob
import os

# The single film analysed by this cell; also interpolated into the plot title.
film = "Diary of Nurse"
csv_files = glob.glob(f"csv/{film}.csv")
# Each data row is read as: field 0 = frame number, field 1 = face box, field 5 = identity.
# The first two lines of each file are skipped.
# Rows whose identity is "N/A" are skipped.

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = False
# Read by get_name(): True selects the "Male"/"Female" labels, False selects "Protagonist"/"Love".
USE_GENDER_NAMES = False

# When True, each non-zero per-chunk count is divided by the number of kept
# detections in that chunk; when False the raw counts are plotted.
USE_AVERAGE = False

# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Per-film upper bound handed to find_section() when chunking frame numbers.
# presumably the number of sampled frames in each film — TODO confirm
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Role lists used for relabelling and for the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Split [range_start, range_end] into num_sections equal-width sections and
    return the 1-based index of the section that value falls into.

    Values at or below range_start map to section 1; values at or above
    range_end map to section num_sections. Inside the range the sections are
    half-open, so a value exactly on an interior boundary belongs to the
    section that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into.
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    """
    # Clamp out-of-range values. These guards also cover an empty or inverted
    # range, so the division below never sees a zero span.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply before dividing: dividing the span by num_sections first rounds
    # the section width, and floor-dividing by that rounded width can put a
    # boundary value one section too low (0.3 // 0.1 == 2.0).
    position = (value - range_start) * num_sections / (range_end - range_start)

    # position is in (0, num_sections); clamp in case rounding reaches the top
    return min(int(position) + 1, num_sections)

def get_name(name):
	"""Return the generic label for a character name, or the name itself.

	With USE_GENDER_NAMES set, members of MALES / LADIES become "Male" /
	"Female"; otherwise members of PROTAGONISTS / LOVES become
	"Protagonist" / "Love". Names found in neither list are returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		groups = ((MALES, "Male"), (LADIES, "Female"))
	else:
		groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))
	# The first list that contains the name decides the label
	for members, label in groups:
		if name in members:
			return label
	return name

data = []
# One accumulator per section index 0..CHUNKING. find_section() is 1-based,
# so slot 0 is never written to by the loading loop.
chunks = [{} for _ in range(CHUNKING + 1)]
chunk_totals = [0] * (CHUNKING + 1)
name_to_film = {}
all_names = []

for file in csv_files:
	ft = file[4:-4]  # path with the leading "csv/" and trailing ".csv" removed
	with open(file) as f:
		lines = f.readlines()[2:]  # the first two lines are not data rows
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this detection
		total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
		# Drop the first and last character of the field (presumably its quotes)
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		if USE_GENERAL_NAMES:
			name = get_name("Rival" if name in RIVALS else name)
		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue
		time = find_section(0, total, CHUNKING, frame)
		if name not in all_names:
			all_names.append(name)
		# Count the detection for the whole chunk and for this name within it
		chunk_totals[time] += 1
		chunks[time][name] = chunks[time].get(name, 0) + 1
		# Remember the first film each name was seen in
		name_to_film.setdefault(name, ft)

# Give every name an explicit zero in the chunks where it never appears,
# so each line in the plot has a point in every section.
for n in all_names:
	for i in range(1, CHUNKING + 1):
		chunks[i].setdefault(n, 0)

for i, counts in enumerate(chunks):
	for name in counts:
		gen = "Rival" if name in RIVALS else get_name(name)
		value = counts[name]
		if abs(value) < 0.00000001:
			value = 0
		elif USE_AVERAGE:
			# Share of the chunk's kept detections that belong to this name
			value = value / chunk_totals[i]
		data.append([i, name, value, gen, name_to_film[name]])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value", "Role", "Title"])
# Keep only the text before the first space of each identity
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Line plot: one line per identity across the timeline sections
fig = px.line(
	df,
	x="Timeline",
	y="Value",
	color="Identity",
	title=f"Chunked Average Temporal Presence in <i>{film}</i>",
	markers=True,
	color_discrete_sequence=px.colors.qualitative.Vivid,
)
layout_options = dict(
	plot_bgcolor='rgb(240, 240, 230)',
	font=dict(family="Georgia", size=14),
	title_font=dict(size=22),
	legend_title_font=dict(size=16),
	margin=dict(t=50, b=40, l=40, r=40),
	height=460,
)
fig.update_layout(**layout_options)
# Also export the plotted table
df.to_csv("All_Chunked_Presence.csv")
grid_style = dict(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)
fig.show()



### CHUNKED FACE BOX SIZE (AVERAGE OR TOTAL)
[DEPRECATED]

In [16]:
# Number of equal-width timeline sections each film is split into (passed to find_section()).
CHUNKING = 9

# Use plotly to visualize the chunked face-box sizes

import plotly.express as px
import pandas as pd

# glob/os are used to locate the CSVs and derive the film title from each path
import glob
import os

csv_files = glob.glob("csv/*.csv")
# Each data row is read as: field 0 = frame number, field 1 = face box, field 5 = identity.
# The first two lines of each file are skipped.
# Rows whose identity is "N/A" are skipped.

# When True, each summed face-box area is divided by the number of detections
# that contributed to it; when False the summed area is plotted.
USE_AVERAGE = True

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True selects the "Male"/"Female" labels, False selects "Protagonist"/"Love".
USE_GENDER_NAMES = False
# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Per-film upper bound handed to find_section() when chunking frame numbers.
# presumably the number of sampled frames in each film — TODO confirm
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Role lists used for relabelling and for the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Split [range_start, range_end] into num_sections equal-width sections and
    return the 1-based index of the section that value falls into.

    Values at or below range_start map to section 1; values at or above
    range_end map to section num_sections. Inside the range the sections are
    half-open, so a value exactly on an interior boundary belongs to the
    section that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into.
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    """
    # Clamp out-of-range values. These guards also cover an empty or inverted
    # range, so the division below never sees a zero span.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply before dividing: dividing the span by num_sections first rounds
    # the section width, and floor-dividing by that rounded width can put a
    # boundary value one section too low (0.3 // 0.1 == 2.0).
    position = (value - range_start) * num_sections / (range_end - range_start)

    # position is in (0, num_sections); clamp in case rounding reaches the top
    return min(int(position) + 1, num_sections)

def get_name(name):
	"""Return the generic label for a character name, or the name itself.

	With USE_GENDER_NAMES set, members of MALES / LADIES become "Male" /
	"Female"; otherwise members of PROTAGONISTS / LOVES become
	"Protagonist" / "Love". Names found in neither list are returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		groups = ((MALES, "Male"), (LADIES, "Female"))
	else:
		groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))
	# The first list that contains the name decides the label
	for members, label in groups:
		if name in members:
			return label
	return name

data = []
# Per-section accumulators, indexed 0..CHUNKING (find_section() is 1-based, so
# slot 0 stays empty): chunks holds summed face-box areas per name,
# chunkcounts the number of detections behind each sum.
chunks = [{} for _ in range(CHUNKING + 1)]
chunkcounts = [{} for _ in range(CHUNKING + 1)]

for file in csv_files:
	with open(file) as f:
		lines = f.readlines()[2:]  # the first two lines are not data rows
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this detection
		total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
		# Drop the first and last character of the field (presumably its quotes)
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		# The face-box field carries two wrapper characters on each side and
		# separates its numbers with backslash + fullwidth comma + space.
		# A raw string keeps that exact separator without relying on an
		# invalid escape sequence (a SyntaxWarning on recent Python versions).
		face_box_text = line[1][2:-2]
		face_box = [float(x) for x in face_box_text.split(r"\， ")]
		# presumably the box is stored as (x, y, width, height) — TODO confirm
		face_box_height = face_box[3]
		face_box_width = face_box[2]
		if USE_GENERAL_NAMES:
			name = get_name("Rival" if name in RIVALS else name)
		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue
		time = find_section(0, total, CHUNKING, frame)
		# Accumulate the box area and the detection count for this name and section
		chunks[time][name] = chunks[time].get(name, 0) + face_box_height * face_box_width
		chunkcounts[time][name] = chunkcounts[time].get(name, 0) + 1

for i in range(CHUNKING + 1):
	for name in chunks[i]:
		if USE_AVERAGE:
			# Turn the summed area into a mean area per detection
			chunks[i][name] /= chunkcounts[i][name]
		data.append([i, name, chunks[i][name]])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
# Keep only the text before the first space of each identity
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Line plot: one line per identity across the timeline sections
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Face Recognition")
fig.show()

### ACTION UNITS
[DEPRECATED]

In [17]:
# Use plotly to visualize action-unit values per detection

# Index of the action unit to plot. The loading loop reads emotion-row columns
# AU + AU_R + 1 (plotted value) and AU + AU_E + 1 (value compared with E_TH).
AU = 2
# Column offsets added to AU for those two lookups.
# presumably the intensity (_r) and presence (_c) column groups of the
# OpenFace output — TODO confirm against the openface_cleaned files
AU_R = 1
AU_E = 18

# Detections whose AU + AU_E + 1 column is below this threshold are skipped.
E_TH = 0.5

import plotly.express as px
import pandas as pd

# glob/os are used to locate the CSVs and derive the film title from each path
import glob
import os

csv_files = glob.glob("csv/*.csv")
# Each data row is read as: field 0 = frame number, field 4 = image path, field 5 = identity.
# The first two lines of each file are skipped.
# Rows whose identity is "N/A" are skipped.

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True selects the "Male"/"Female" labels, False selects the role labels.
USE_GENDER_NAMES = True
# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Only these six films are listed in this cell; the frame total is the
# denominator for each detection's timeline position.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
}

# Films processed by the loop below (a live view of the dict's keys).
FILMS_WITH_EMOTION = FILM_FRAME_TOTALS.keys()

# Role lists, restricted to characters of the six films above.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki"]

def get_name(name):
	"""Return the generic label for a character name, or the name itself.

	With USE_GENDER_NAMES set, members of MALES / LADIES become "Male" /
	"Female"; otherwise members of PROTAGONISTS / LOVES become
	"Protagonist" / "Love". Names found in neither list are returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		if name in MALES:
			name = "Male"
		elif name in LADIES:
			name = "Female"
	else:
		if name in PROTAGONISTS:
			# Was "Fan Jun" (a character name from LOVES). The minor-character
			# filter only recognises "Protagonist", so the old label caused
			# every protagonist to be dropped as a minor character.
			name = "Protagonist"
		elif name in LOVES:
			name = "Love"
	return name

data = []
# Emotion-row columns for the selected action unit (constant for the whole run)
log1 = int(int(AU) + int(AU_R) + 1)
log2 = int(int(AU) + int(AU_E) + 1)

for file in csv_files:
	film = os.path.basename(file)[:-4]
	if film not in FILMS_WITH_EMOTION:
		continue
	emotion_file = "openface_cleaned/" + film + ".csv"
	# Emotion rows keyed by their first field minus its last four characters
	# (presumably a file extension). Built on first use so the emotion CSV is
	# read once per film instead of once per recognition row.
	emotion_rows = None
	with open(file) as f:
		lines = f.readlines()[2:]  # the first two lines are not data rows
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this detection
		if emotion_rows is None:
			emotion_rows = {}
			with open(emotion_file) as ef:
				for ef_line in ef:
					ef_line = ef_line.split(",")
					# setdefault keeps the first row for a key, matching the
					# previous "first match wins" scan
					emotion_rows.setdefault(ef_line[0][:-4], ef_line)
		# Drop the field's first and last character, then the first six
		# characters of the path, to get the key used in the emotion file
		path = line[4][1:-1]
		emotion_row = emotion_rows.get(path[6:])
		if emotion_row is None:
			continue  # no emotion data for this image

		# Convert to float so the y axis is numeric rather than categorical
		r = float(emotion_row[log1])
		e = float(emotion_row[log2])
		if e < E_TH:
			continue

		total = FILM_FRAME_TOTALS[film]
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		if USE_GENERAL_NAMES:
			name = get_name("Rival" if name in RIVALS else name)
		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue
		# frame / total is the detection's relative position within its film
		data.append([frame, r, name, frame / total, film])

# NOTE: the "Face Box" column holds the action-unit value r, not a face box;
# the column name is kept because the plot below refers to it.
df = pd.DataFrame(data, columns=["Frame", "Face Box", "Identity", "Timeline", "Film"])
# Keep only the text before the first space of each identity
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Scatter plot of the action-unit value over each film's relative timeline
fig = px.scatter(df, x="Timeline", y="Face Box", color="Identity", hover_data=["Film"], title="Face Recognition")
fig.show()

In [18]:
# Number of equal-width timeline sections each film is split into (passed to find_section()).
CHUNKING = 7

# Index of the action unit used when ALL_AU is False.
AU = 0
# Action-unit codes; the all-AU loop appends AUS[i - 1] to the identity label.
AUS = ["01", "02", "04", "05", "06", "07", "09", "10", "12", "14", "15", "17", "20", "23", "25", "26", "28", "45"]
print(len(AUS))

ALL_AU = True # Use all AUs in the same graph
COUNTING = True # True: count qualifying detections; False: sum their intensity values

# Column offsets added to AU for the single-AU lookups (AU + AU_R + 1 and AU + AU_E + 1).
AU_R = 1
AU_E = 18

E_TH = 0.5 # Presence threshold (0.5 for present and -0.5 for all)
R_TH = 3 # Intensity threshold (1-5)

# Use plotly to visualize the chunked action-unit values

import plotly.express as px
import pandas as pd

# Json
import json

# glob/os are used to locate the CSVs and derive the film title from each path
import glob
import os

csv_files = glob.glob("csv/*.csv")
# Each data row is read as: field 0 = frame number, field 1 = face box,
# field 4 = image path, field 5 = identity.
# The first two lines of each file are skipped.
# Rows whose identity is "N/A" are skipped.

# When True, each accumulated value is divided by the per-identity count for its section.
USE_AVERAGE = True

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True selects the "Male"/"Female" labels, False selects "Protagonist"/"Love".
USE_GENDER_NAMES = True
# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Only these six films are listed in this cell; the frame total is the upper
# bound handed to find_section().
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
}

# Films processed by the loop below (a live view of the dict's keys).
FILMS_WITH_EMOTION = FILM_FRAME_TOTALS.keys()

# Role lists, restricted to characters of the six films above.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki"]

def find_section(range_start, range_end, num_sections, value):
    """
    Split [range_start, range_end] into num_sections equal-width sections and
    return the 1-based index of the section that value falls into.

    Values at or below range_start map to section 1; values at or above
    range_end map to section num_sections. Inside the range the sections are
    half-open, so a value exactly on an interior boundary belongs to the
    section that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into.
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    """
    # Clamp out-of-range values. These guards also cover an empty or inverted
    # range, so the division below never sees a zero span.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply before dividing: dividing the span by num_sections first rounds
    # the section width, and floor-dividing by that rounded width can put a
    # boundary value one section too low (0.3 // 0.1 == 2.0).
    position = (value - range_start) * num_sections / (range_end - range_start)

    # position is in (0, num_sections); clamp in case rounding reaches the top
    return min(int(position) + 1, num_sections)

def get_name(name):
	"""Return the generic label for a character name, or the name itself.

	With USE_GENDER_NAMES set, members of MALES / LADIES become "Male" /
	"Female"; otherwise members of PROTAGONISTS / LOVES become
	"Protagonist" / "Love". Names found in neither list are returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		groups = ((MALES, "Male"), (LADIES, "Female"))
	else:
		groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))
	# The first list that contains the name decides the label
	for members, label in groups:
		if name in members:
			return label
	return name

data = []
# Per-section accumulators, indexed 0..CHUNKING (find_section() is 1-based, so
# slot 0 stays empty): chunks holds the accumulated value per plotted label,
# chunkcounts the number of qualifying hits per base identity.
chunks = [{} for _ in range(CHUNKING + 1)]
chunkcounts = [{} for _ in range(CHUNKING + 1)]

# Emotion-row columns for the single action unit selected by AU
log1 = int(int(AU) + int(AU_R) + 1)
log2 = int(int(AU) + int(AU_E) + 1)

for file in csv_files:
	film = os.path.basename(file)[:-4]
	if film not in FILMS_WITH_EMOTION:
		continue
	emotion_file = "openface_cleaned/" + film + ".csv"
	# Emotion rows keyed by their first field minus its last four characters
	# (presumably a file extension). Built on first use so the emotion CSV is
	# read once per film instead of once per recognition row.
	emotion_rows = None
	with open(file) as f:
		lines = f.readlines()[2:]  # the first two lines are not data rows
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this detection
		total = FILM_FRAME_TOTALS[film]
		# Drop the first and last character of each field (presumably its quotes)
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		path = line[4][1:-1]

		if emotion_rows is None:
			emotion_rows = {}
			with open(emotion_file) as ef:
				for ef_line in ef:
					ef_line = ef_line.split(",")
					# setdefault keeps the first row for a key, matching the
					# previous "first match wins" scan
					emotion_rows.setdefault(ef_line[0][:-4], ef_line)
		# The emotion key is the image path without its first six characters
		emotion_row = emotion_rows.get(path[6:])
		if emotion_row is None:
			continue  # no emotion data for this image

		r = float(emotion_row[log1])
		e = float(emotion_row[log2])

		# The face-box field carries two wrapper characters on each side and
		# separates its numbers with backslash + fullwidth comma + space.
		# A raw string keeps that exact separator without relying on an
		# invalid escape sequence (a SyntaxWarning on recent Python versions).
		face_box_text = line[1][2:-2]
		face_box = [float(x) for x in face_box_text.split(r"\， ")]
		face_box_height = face_box[3]
		face_box_width = face_box[2]
		if USE_GENERAL_NAMES:
			name = get_name("Rival" if name in RIVALS else name)
		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue
		time = find_section(0, total, CHUNKING, frame)
		base_name = name
		if ALL_AU:
			# NOTE(review): this loop reads columns i and i + 17, whereas the
			# single-AU branch reads AU + AU_R + 1 and AU + AU_E + 1 (one
			# column further right for the same AUS entry), and range(1, 18)
			# never reaches the 18th AUS entry. Confirm against the
			# openface_cleaned column layout before relying on either.
			for i in range(1, 18):
				r_val = float(emotion_row[i])
				e_val = float(emotion_row[i + 17])
				if e_val > E_TH and r_val > R_TH:
					name = base_name + "-AU" + AUS[i - 1]
					chunks[time][name] = chunks[time].get(name, 0) + (1 if COUNTING else r_val)
					chunkcounts[time][base_name] = chunkcounts[time].get(base_name, 0) + 1
		elif e > E_TH and r > R_TH:
			chunks[time][name] = chunks[time].get(name, 0) + (1 if COUNTING else r)
			chunkcounts[time][base_name] = chunkcounts[time].get(base_name, 0) + 1

for i in range(CHUNKING + 1):
	for name in chunks[i]:
		# Labels look like "<identity>-AU<code>"; the count is kept per identity
		base_name = name.split("-")[0]
		if USE_AVERAGE:
			if chunkcounts[i][base_name] == 0:
				chunks[i][name] = 0
			else:
				chunks[i][name] /= chunkcounts[i][base_name]
		data.append([i, name, chunks[i][name]])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
# Keep only the text before the first space of each label
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Line plot: one line per label across the timeline sections
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Face Recognition")
fig.show()

18


### CHUNKED PRESENCE WITH FILM MEANS

In [22]:
# Number of equal-width timeline sections each film is split into (passed to find_section()).
CHUNKING = 9

# Use plotly to visualize the film-averaged chunked presence

import plotly.express as px
import pandas as pd

# glob/os are used to locate the CSVs and derive the film title from each path
import glob
import os

csv_files = glob.glob("csv/*.csv")

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = True # Keep true
# Read by get_name(): True selects the "Male"/"Female" labels, False selects "Protagonist"/"Love".
USE_GENDER_NAMES = False

# When True, each role's count in a section is first divided by that film's
# kept detections in the section; when False the raw counts are summed.
USE_FILM_AVERAGE = True # Better keep true

# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Per-film upper bound handed to find_section(); the number of entries is also
# the divisor used when averaging across films.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Role lists used for relabelling and for the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Split [range_start, range_end] into num_sections equal-width sections and
    return the 1-based index of the section that value falls into.

    Values at or below range_start map to section 1; values at or above
    range_end map to section num_sections. Inside the range the sections are
    half-open, so a value exactly on an interior boundary belongs to the
    section that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into.
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    """
    # Clamp out-of-range values. These guards also cover an empty or inverted
    # range, so the division below never sees a zero span.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply before dividing: dividing the span by num_sections first rounds
    # the section width, and floor-dividing by that rounded width can put a
    # boundary value one section too low (0.3 // 0.1 == 2.0).
    position = (value - range_start) * num_sections / (range_end - range_start)

    # position is in (0, num_sections); clamp in case rounding reaches the top
    return min(int(position) + 1, num_sections)

def get_name(name):
	"""Return the generic label for a character name, or the name itself.

	With USE_GENDER_NAMES set, members of MALES / LADIES become "Male" /
	"Female"; otherwise members of PROTAGONISTS / LOVES become
	"Protagonist" / "Love". Names found in neither list are returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		groups = ((MALES, "Male"), (LADIES, "Female"))
	else:
		groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))
	# The first list that contains the name decides the label
	for members, label in groups:
		if name in members:
			return label
	return name

data = []
# chunks[section] maps "<film>:::<role>" to its detection count in that section
# (slot 0 stays empty because find_section() is 1-based).
chunks = [{} for _ in range(CHUNKING + 1)]
# chunk_totals maps "<film>---<section>" to the film's kept detections in that section.
chunk_totals = {}
chunk_all_totals = [0] * (CHUNKING + 1)

for file in csv_files:
	film = file[4:-4]  # path with the leading "csv/" and trailing ".csv" removed
	with open(file) as f:
		lines = f.readlines()[2:]  # the first two lines are not data rows
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this detection
		total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
		# Drop the first and last character of the field (presumably its quotes)
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		if USE_GENERAL_NAMES:
			name = get_name("Rival" if name in RIVALS else name)
		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue

		time = find_section(0, total, CHUNKING, frame)

		# Qualify the role with its film so films are tallied separately
		name = film + ":::" + name
		chunk_all_totals[time] += 1

		chunk_key = film + "---" + f"{time}"
		chunk_totals[chunk_key] = chunk_totals.get(chunk_key, 0) + 1
		chunks[time][name] = chunks[time].get(name, 0) + 1

for i in range(CHUNKING + 1):
	# Per-role sum over films for section i
	names = {}
	for name in chunks[i]:
		films = name.split(":::")
		real_film, real_name = films[0], films[1]

		total = chunk_totals[real_film + "---" + f"{i}"]
		character_in_chunk = chunks[i][name]
		# Share of this film's kept detections in the section that belong to the role
		avg = character_in_chunk / total
		contribution = avg if USE_FILM_AVERAGE else character_in_chunk
		names[real_name] = names.get(real_name, 0) + contribution

	for name, chunk_total in names.items():
		# Always divided by the number of films listed in FILM_FRAME_TOTALS
		real_avg = chunk_total / len(FILM_FRAME_TOTALS)
		data.append([i, name, real_avg])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
# Keep only the text before the first space of each identity
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Line plot: one line per role across the timeline sections
fig = px.line(
	df,
	x="Timeline",
	y="Value",
	color="Identity",
	title=f"Film-Average Temporal Presence <br><sub>{CHUNKING} Chunks</sub>",
	markers=True,
	color_discrete_sequence=px.colors.qualitative.Vivid,
)
layout_options = dict(
	plot_bgcolor='rgb(240, 240, 230)',
	font=dict(family="Georgia", size=14),
	title_font=dict(size=22),
	legend_title_font=dict(size=16),
	margin=dict(t=80, b=40, l=40, r=40),
	height=460,
)
fig.update_layout(**layout_options)
grid_style = dict(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)
fig.show()

### CHUNKED SIZE WITH FILM MEANS

In [4]:
# Number of equal-width timeline sections each film is split into (passed to find_section()).
CHUNKING = 9

# Use plotly to visualize the film-averaged chunked face-box sizes

import plotly.express as px
import pandas as pd

# glob/os are used to locate the CSVs and derive the film title from each path
import glob
import os

csv_files = glob.glob("csv/*.csv")

# When True, names in RIVALS become "Rival" and all other names go through get_name().
USE_GENERAL_NAMES = True # Keep true
# Read by get_name(): True selects the "Male"/"Female" labels, False selects "Protagonist"/"Love".
USE_GENDER_NAMES = False

# Selects the key under which detections are counted in chunk_totals:
# True counts per "<film>:::<role>" and section, False per film and section.
USE_INDIVIDUAL_AVERAGE_NOT_THREE = True # Keep true

# NOTE(review): read by the aggregation loop that follows the accumulation
# loop in this cell; confirm its exact effect there.
USE_FILM_AVERAGE = True # Keep true

# When True, rows whose (possibly relabelled) name is not a major character are dropped.
IGNORE_MINOR_CHARACTERS = True

# Per-film upper bound handed to find_section() when chunking frame numbers.
# presumably the number of sampled frames in each film — TODO confirm
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Role lists used for relabelling and for the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Split [range_start, range_end] into num_sections equal-width sections and
    return the 1-based index of the section that value falls into.

    Values at or below range_start map to section 1; values at or above
    range_end map to section num_sections. Inside the range the sections are
    half-open, so a value exactly on an interior boundary belongs to the
    section that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into.
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    """
    # Clamp out-of-range values. These guards also cover an empty or inverted
    # range, so the division below never sees a zero span.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply before dividing: dividing the span by num_sections first rounds
    # the section width, and floor-dividing by that rounded width can put a
    # boundary value one section too low (0.3 // 0.1 == 2.0).
    position = (value - range_start) * num_sections / (range_end - range_start)

    # position is in (0, num_sections); clamp in case rounding reaches the top
    return min(int(position) + 1, num_sections)

def get_name(name):
    """Collapse a character name into its group label, if it has one.

    With ``USE_GENDER_NAMES`` set, the groups are "Male" / "Female" (looked
    up in ``MALES`` / ``LADIES``); otherwise they are "Protagonist" / "Love"
    (looked up in ``PROTAGONISTS`` / ``LOVES``). A name found in neither
    list of the active pair is returned unchanged.
    """
    if USE_GENDER_NAMES:
        groups = ((MALES, "Male"), (LADIES, "Female"))
    else:
        groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))

    # First matching group wins, mirroring an if/elif chain.
    for members, label in groups:
        if name in members:
            return label
    return name

# Accumulators for this cell. find_section() returns 1-based chunk numbers,
# so the per-chunk lists get CHUNKING + 1 slots and slot 0 stays empty.
data = []
chunks = [{} for _ in range(CHUNKING + 1)]
chunk_totals = {}
chunk_all_totals = [0] * (CHUNKING + 1)

for file in csv_files:
    film = file[4:-4]
    with open(file) as f:
        # The first two lines of every file are skipped.
        lines = f.readlines()[2:]
        for line in lines:
            line = line.split(",")
            # Column 5 holds the identity; rows without one are ignored.
            if line[5] == "\"N/A\"":
                continue

            # Total frame count of the film this csv file is named after.
            total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
            # Fields are sliced to drop their first and last character.
            frame = int(line[0][1:-1])
            name = line[5][1:-1]
            # The face box drops two characters on each side, then splits on
            # backslash + full-width comma + space. The backslash is written
            # escaped: "\，" is not a valid escape sequence and triggers a
            # SyntaxWarning on recent Python versions.
            face_box_text = line[1][2:-2]
            face_box = [float(x) for x in face_box_text.split("\\， ")]
            face_box_width = face_box[2]
            face_box_height = face_box[3]

            if USE_GENERAL_NAMES:
                if name in RIVALS:
                    name = "Rival"
                name = get_name(name)

            if IGNORE_MINOR_CHARACTERS:
                if USE_GENERAL_NAMES:
                    is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
                else:
                    is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
                if not is_major:
                    continue

            time = find_section(0, total, CHUNKING, frame)

            # Keys are film-qualified from here on.
            name = film + ":::" + name
            chunk_all_totals[time] += 1

            # Row counter used as the divisor later: per character and chunk,
            # or per film and chunk, depending on the switch.
            if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
                chunk_key = name + "---" + f"{time}"
            else:
                chunk_key = film + "---" + f"{time}"
            chunk_totals[chunk_key] = chunk_totals.get(chunk_key, 0) + 1

            # Sum of face-box areas for this character in this chunk.
            chunks[time][name] = chunks[time].get(name, 0) + face_box_height * face_box_width

for i in range(CHUNKING + 1):
    names = {}
    for name in chunks[i]:
        films = name.split(":::")
        real_name = films[1]
        real_film = films[0]

        if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
            total = chunk_totals[name + "---" + f"{i}"]
        else:
            total = chunk_totals[real_film + "---" + f"{i}"]
        character_in_chunk = chunks[i][name]
        avg = character_in_chunk / total

        # Identities with the same (possibly generalized) name are summed
        # across films.
        contribution = avg if USE_FILM_AVERAGE else character_in_chunk
        names[real_name] = names.get(real_name, 0) + contribution

    for name in names:
        # The divisor is the number of films listed in FILM_FRAME_TOTALS,
        # not the number of films that contributed to this chunk.
        real_avg = names[name] / len(FILM_FRAME_TOTALS)
        data.append([i, name, real_avg])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first whitespace-separated word of each identity.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title=f"Film-Average Face Box Size <br><sub>{CHUNKING} Chunks</sub>", markers=True, color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(
    plot_bgcolor='rgb(240, 240, 230)',
    font=dict(family="Georgia", size=14),
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    margin=dict(t=80, b=40, l=40, r=40), height=460
)
fig.update_xaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_yaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.show()

### DIFFERENCE FROM AVG

In [1]:
CHUNKING = 9

# Use plotly to visualize something

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")

USE_GENDER_NAMES = False

USE_FILM_AVERAGE = True # Better keep true

ONLY_LOOK_AT_SECONDLAST_AS_ENDING = False

FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Map ``value`` to one of ``num_sections`` equal-width sections of a range.

    The range [range_start, range_end] is cut into ``num_sections`` equal
    parts numbered from 1. Values at or before ``range_start`` land in
    section 1; values at or past ``range_end`` land in the last section.
    A value sitting exactly on an interior boundary belongs to the section
    that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into (>= 1).
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    :raises ValueError: If ``num_sections`` is smaller than 1.
    """
    if num_sections < 1:
        raise ValueError("num_sections must be at least 1")

    # Clamp values that sit on or outside the ends of the range.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply by num_sections *before* dividing by the span. Dividing by a
    # pre-computed float section size (e.g. 2751 / 9) rounds that size and
    # pushes exact boundary values one section too low; with integer inputs
    # this form is exact integer arithmetic.
    span = range_end - range_start
    section_index = int((value - range_start) * num_sections // span) + 1

    # Never report more sections than were requested.
    return min(section_index, num_sections)

def get_name(name):
    """Collapse a character name into its group label, if it has one.

    With ``USE_GENDER_NAMES`` set, the groups are "Male" / "Female" (looked
    up in ``MALES`` / ``LADIES``); otherwise they are "Protagonist" / "Love"
    (looked up in ``PROTAGONISTS`` / ``LOVES``). A name found in neither
    list of the active pair is returned unchanged.
    """
    if USE_GENDER_NAMES:
        groups = ((MALES, "Male"), (LADIES, "Female"))
    else:
        groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))

    # First matching group wins, mirroring an if/elif chain.
    for members, label in groups:
        if name in members:
            return label
    return name

# Accumulators for this cell.
#   data                 rows [chunk, role, averaged presence share]
#   chunks               per chunk: "film:::role" -> number of counted rows
#   individual_chunks    per chunk: "film:::character" -> number of counted rows
#   chunk_totals         "film---chunk" -> number of counted rows of that film in that chunk
#   name_totals          "film:::character:::role" -> summed |role mean - own share|
#   film_totals          film -> summed |role mean - own share| over its characters
data = []
chunks = []
chunk_totals = {}
chunk_all_totals = []
name_totals = {}
individual_chunks = []
film_totals = {}
most_significant_rivals_at_end = {}
all_names = []

# find_section() returns chunk numbers 1..CHUNKING, so the lists get
# CHUNKING + 1 slots and slot 0 is never filled by the reading loop.
for i in range(CHUNKING + 1):
	chunk_all_totals.append(0)
for i in range(CHUNKING + 1):
	chunks.append({})
	individual_chunks.append({})

for file in csv_files:
	# Drops the first and last four characters of the path ("csv/" and ".csv"
	# for paths produced by the glob pattern of this cell).
	film = file[4:-4]
	with open(file) as f:
		# The first two lines of each file are skipped.
		lines = f.readlines()[2:]
		for line in lines:
			line = line.split(",")
			# Column 5 is the identity; rows without one are ignored.
			if line[5] != "\"N/A\"":
				# Total frame count of the film (the film is the name of the current csv file)
				total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
				# Both fields are sliced to drop their first and last character.
				frame = int(line[0][1:][:-1])
				name = line[5][1:][:-1]
				genName = name
				
				# Map the character onto its role; rows whose role is not one of
				# the listed labels are skipped entirely.
				if name in RIVALS:
					genName = "Rival"
				genName = get_name(genName)
				if genName not in ["Rival", "Protagonist", "Female", "Male", "Love"]:
					continue

				time = find_section(0, total, CHUNKING, frame)

				# Keys are film-qualified from here on.
				name = film + ":::" + name
				if name not in all_names:
					all_names.append(name)
				genName = film + ":::" + genName
				chunk_all_totals[time] += 1

				# chunk_totals counts the rows kept for this film in this chunk
				chunk_key = film + "---" + f"{time}"
				if chunk_key not in chunk_totals:
					chunk_totals[chunk_key] = 0

				chunk_totals[chunk_key] += 1
				if genName not in chunks[time]:
					chunks[time][genName] = 0
				if name not in individual_chunks[time]:
					individual_chunks[time][name] = 0
				chunks[time][genName] += 1
				individual_chunks[time][name] += 1

				#data.append([frame, line[1], name, time])

	# Every film that was read gets a film_totals entry, starting at zero.
	if film not in film_totals:
		film_totals[film] = 0

# Per chunk: each role's share of its film's counted rows, summed over films.
for i in range(CHUNKING + 1):
	names = {}
	for name in chunks[i]:
		# print(name)
		
		films = name.split(":::")
		real_name = films[1]
		real_film = films[0]

		total = chunk_totals[real_film + "---" + f"{i}"]
		character_in_chunk = chunks[i][name]
		avg = character_in_chunk / total
		if real_name not in names:
			names[real_name] = 0
		if USE_FILM_AVERAGE:
			names[real_name] += avg
		else:
			names[real_name] += character_in_chunk
		
		# This also adds the "film:::role" keys to all_names; the zero-fill
		# loop further down skips them again via its membership check.
		if name not in all_names:
			all_names.append(name)

	for name in names:
		chunk_total = names[name]
		# The divisor is the number of films listed in FILM_FRAME_TOTALS, not
		# the number of films that contributed to this chunk.
		real_avg = chunk_total / len(FILM_FRAME_TOTALS)

		data.append([i, name, real_avg])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first whitespace-separated word of each identity.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Build a line plot of the role averages (not shown; fig is replaced below)
fig = px.line(df, x="Timeline", y="Value", color="Identity", title=f"Film-Average Temporal Presence <br><sub>{CHUNKING} Chunks</sub>")
#fig.show()

data2 = []

# Give every tracked character an explicit 0 in each chunk 1..CHUNKING in
# which it has no counted rows.
for c in range(1, CHUNKING+1):
	for name in all_names:
		if name.split(":::")[1] not in RIVALS + LOVES + PROTAGONISTS:
			continue
		n = name.split(":::")[0] + ":::" + name.split(":::")[1]
		if n not in individual_chunks[c]:
			individual_chunks[c][n] = 0

# Compare every character's own share with the average of its role.
for i in range(CHUNKING + 1):
	names = {}
	for name in individual_chunks[i]:
		# print(name)

		# Sentinel: stays below -998 if no matching role average is found.
		iv = -999
		
		films = name.split(":::")
		real_name = films[1]
		real_film = films[0]
		gen_name = real_name
		if gen_name in RIVALS:
			gen_name = "Rival"
		gen_name = get_name(gen_name)

		# Linear search for the role's average in this chunk.
		for d in data:
			if d[0] == i and d[1] == gen_name:
				mean = d[2]
				selfVal = individual_chunks[i][name]
				# NOTE(review): chunk_totals only has keys for film/chunk pairs
				# with at least one counted row, while the zero-fill above can
				# add characters for chunks without any; that combination would
				# raise KeyError here - confirm the data never hits it.
				selfMean = selfVal / chunk_totals[real_film + "---" + f"{i}"]
				iv = abs(mean - selfMean)
				data2.append([i, real_name, selfMean, gen_name, real_film])
				#print(f"{iv},{real_name} ({gen_name}),{real_film}")
				break
		
		if iv < -998:
			raise Exception("Difference not found")
		
		newName = name + ":::" + gen_name
		
		if newName not in name_totals:
			name_totals[newName] = 0
		name_totals[newName] += iv

		# i > CHUNKING-2 selects the last two chunks; with
		# ONLY_LOOK_AT_SECONDLAST_AS_ENDING only chunk CHUNKING - 1 counts.
		if gen_name == "Rival" and i > CHUNKING-2:
			if (ONLY_LOOK_AT_SECONDLAST_AS_ENDING and i == CHUNKING - 1) or not ONLY_LOOK_AT_SECONDLAST_AS_ENDING:
				if name not in most_significant_rivals_at_end:
					most_significant_rivals_at_end[name] = 0
				most_significant_rivals_at_end[name] += selfMean
		
		film_totals[real_film] += iv

# Add the role averages themselves as extra rows titled "FilmAverage";
# a role with no row in a chunk is recorded as 0.
for c in range(1, CHUNKING + 1):
	for name in ["Rival", "Protagonist", "Love"]:
		mean = 0
		for d in data:
			if d[0] == c and d[1] == name:
				mean = d[2]
				break
		data2.append([c, name, mean, name, "FilmAverage"])

# NOTE(review): name_totals and film_totals are sorted with reverse=True, so
# the largest summed deviation is printed first even though the headings
# below say "Most close-to-average" - confirm which order is intended.
name_totals = sorted(name_totals.items(), key = lambda x:x[1], reverse = True)
name_totals = dict(name_totals)
film_totals = sorted(film_totals.items(), key = lambda x:x[1], reverse = True)
film_totals = dict(film_totals)
# Rivals are sorted ascending by their summed end-of-film share.
most_significant_rivals_at_end = sorted(most_significant_rivals_at_end.items(), key = lambda x:x[1])
most_significant_rivals_at_end = dict(most_significant_rivals_at_end)
		
print("Most close-to-average characters:\n")
for p in name_totals:
	# Keys have the form "film:::character:::role".
	s = p.split(":::")
	print(f"{s[1]},{s[0]} ({s[2]}),{name_totals[p]}")

print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMost close-to-average films:\n")

for f in film_totals:
	print(f"{f},{film_totals[f]}")

print(f"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMost significant rivals at ending (Only 2nd last: {ONLY_LOOK_AT_SECONDLAST_AS_ENDING}):\n")

for i in most_significant_rivals_at_end:
	print(f"{i},{most_significant_rivals_at_end[i]}")
			
# Per-character shares plus the "FilmAverage" rows, written to csv and plotted.
df = pd.DataFrame(data2, columns=["Timeline", "Identity", "Value", "Role", "Title"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first whitespace-separated word of each identity.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])
df.to_csv("All_Chunked_Presence_Normalized_in_All_Movies.csv")


# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title=f"Film-Average Temporal Presence <br><sub>{CHUNKING} Chunks</sub>", markers=True)
fig.show()

#df.to_csv("All_Mean")

Most close-to-average characters:

Einar,Vikings (Rival),2.623784853881044
Morgana,Vikings (Love),2.410846219648448
Kesa,Jigoku Mon (Love),2.3394464900169227
Jin Zhang,Waves of Life (Protagonist),2.236377150040476
Eric,Vikings (Protagonist),2.2189298038284315
Will,The Man from Laramie (Protagonist),2.0746699482179225
Laurey,Oklahoma (Love),2.061160877542642
Fan Jun,Cliff (Love),1.9703149774409487
Moritoo,Jigoku Mon (Protagonist),1.8267264797631448
Vic,The Man from Laramie (Rival),1.8125375331051141
Pearl,Duel in the Sun (Protagonist),1.792102509452043
Jeff,Human Desire (Protagonist),1.7657400868895614
Osan,Chikamatsu Story (Love),1.747482856808809
Suhua,Diary of Nurse (Protagonist),1.7379857050104464
Gilda,Gilda (Love),1.707711723178086
Curly,Oklahoma (Protagonist),1.6874759817930725
Nishida,Black River (Protagonist),1.6460746834609232
Linus,Sabrina (Love),1.592133726475087
Fang Qing,Cliff (Protagonist),1.5684838683632085
Changping,Diary of Nurse (Love),1.5476461484147457
Wataru,Jigoku

In [16]:
CHUNKING = 9

# Use plotly to visualize something

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")
USE_GENDER_NAMES = False

USE_INDIVIDUAL_AVERAGE_NOT_THREE = True # Keep true

USE_FILM_AVERAGE = True # Keep true

IGNORE_MINOR_CHARACTERS = True

FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Map ``value`` to one of ``num_sections`` equal-width sections of a range.

    The range [range_start, range_end] is cut into ``num_sections`` equal
    parts numbered from 1. Values at or before ``range_start`` land in
    section 1; values at or past ``range_end`` land in the last section.
    A value sitting exactly on an interior boundary belongs to the section
    that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into (>= 1).
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    :raises ValueError: If ``num_sections`` is smaller than 1.
    """
    if num_sections < 1:
        raise ValueError("num_sections must be at least 1")

    # Clamp values that sit on or outside the ends of the range.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply by num_sections *before* dividing by the span. Dividing by a
    # pre-computed float section size (e.g. 2751 / 9) rounds that size and
    # pushes exact boundary values one section too low; with integer inputs
    # this form is exact integer arithmetic.
    span = range_end - range_start
    section_index = int((value - range_start) * num_sections // span) + 1

    # Never report more sections than were requested.
    return min(section_index, num_sections)

def get_name(name):
    """Collapse a character name into its group label, if it has one.

    With ``USE_GENDER_NAMES`` set, the groups are "Male" / "Female" (looked
    up in ``MALES`` / ``LADIES``); otherwise they are "Protagonist" / "Love"
    (looked up in ``PROTAGONISTS`` / ``LOVES``). A name found in neither
    list of the active pair is returned unchanged.
    """
    if USE_GENDER_NAMES:
        groups = ((MALES, "Male"), (LADIES, "Female"))
    else:
        groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))

    # First matching group wins, mirroring an if/elif chain.
    for members, label in groups:
        if name in members:
            return label
    return name

# Accumulators for this cell. find_section() returns 1-based chunk numbers,
# so the per-chunk lists get CHUNKING + 1 slots and slot 0 stays empty.
data = []
chunks = [{} for _ in range(CHUNKING + 1)]
chunk_totals = {}
chunk_all_totals = [0] * (CHUNKING + 1)
name_to_film = {}

for file in csv_files:
    film = file[4:-4]
    with open(file) as f:
        # The first two lines of every file are skipped.
        lines = f.readlines()[2:]
        for line in lines:
            line = line.split(",")
            # Column 5 holds the identity; rows without one are ignored.
            if line[5] == "\"N/A\"":
                continue

            # Total frame count of the film this csv file is named after.
            total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
            # Fields are sliced to drop their first and last character.
            frame = int(line[0][1:-1])
            name = line[5][1:-1]
            # The face box drops two characters on each side, then splits on
            # backslash + full-width comma + space. The backslash is written
            # escaped: "\，" is not a valid escape sequence and triggers a
            # SyntaxWarning on recent Python versions.
            face_box_text = line[1][2:-2]
            face_box = [float(x) for x in face_box_text.split("\\， ")]
            face_box_width = face_box[2]
            face_box_height = face_box[3]
            # One area value feeds both the chunked sums and the unchunked
            # rows. The unchunked rows previously stored height * height,
            # which disagreed with the chunked "size" computed from the same
            # box.
            face_box_area = face_box_height * face_box_width

            gen_name = "Rival" if name in RIVALS else name
            gen_name = get_name(gen_name)

            if IGNORE_MINOR_CHARACTERS:
                if name not in RIVALS and name not in PROTAGONISTS and name not in LOVES:
                    continue

            character_name = name
            time = find_section(0, total, CHUNKING, frame)

            # Remember the first film each character name was seen in.
            name_to_film.setdefault(name, film)

            # Keys are film-qualified from here on.
            name = film + ":::" + name
            chunk_all_totals[time] += 1

            # Row counter used as the divisor later: per character and chunk,
            # or per film and chunk, depending on the switch.
            if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
                chunk_key = name + "---" + f"{time}"
            else:
                chunk_key = film + "---" + f"{time}"
            chunk_totals[chunk_key] = chunk_totals.get(chunk_key, 0) + 1

            chunks[time][name] = chunks[time].get(name, 0) + face_box_area

            # Unchunked rows are only kept for chunks 4 through 7.
            normalized_time = frame / total
            if 4 <= time <= 7:
                data.append([normalized_time, character_name, face_box_area, gen_name, film])

data2 = []

for i in range(CHUNKING + 1):
    names = {}
    for name in chunks[i]:
        films = name.split(":::")
        real_name = films[1]
        real_film = films[0]

        if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
            total = chunk_totals[name + "---" + f"{i}"]
        else:
            total = chunk_totals[real_film + "---" + f"{i}"]
        character_in_chunk = chunks[i][name]
        avg = character_in_chunk / total

        contribution = avg if USE_FILM_AVERAGE else character_in_chunk
        names[real_name] = names.get(real_name, 0) + contribution

    for name in names:
        # The divisor is the number of films listed in FILM_FRAME_TOTALS,
        # not the number of films that contributed to this chunk.
        real_avg = names[name] / len(FILM_FRAME_TOTALS)

        gen_name = "Rival" if name in RIVALS else name
        gen_name = get_name(gen_name)
        data2.append([i, name, real_avg, gen_name, name_to_film[name]])

df = pd.DataFrame(data, columns=["TimeNormalized", "Identity", "Value", "Role", "Title"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first whitespace-separated word of each identity.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

df.to_csv("All_Unchunked_Size_in_All_Movies(4-7).csv")

df2 = pd.DataFrame(data2, columns=["Timeline", "Identity", "Value", "Role", "Title"])
df2.to_csv("All_Chunked_Size_in_All_Movies.csv")

# Create a scatter plot of the unchunked data
fig = px.scatter(df, x="TimeNormalized", y="Value", color="Identity", title=f"Film-Average Face Box Size <br><sub>{CHUNKING} Chunks</sub>", color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(
    plot_bgcolor='rgb(240, 240, 230)',
    font=dict(family="Georgia", size=14),
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    margin=dict(t=80, b=40, l=40, r=40), height=460
)
fig.update_xaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_yaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.show()

### STANDARD DEVIATION (PER FILM vs FILM AVERAGE)

In [15]:
import plotly.express as px
import pandas as pd
import glob
import os
import numpy as np  # Needed for standard deviation

CHUNKING = 9

csv_files = glob.glob("csv/*.csv")

USE_GENERAL_NAMES = True
USE_GENDER_NAMES = False
USE_FILM_AVERAGE = True  # Still used as a switch, now for SD across films
IGNORE_MINOR_CHARACTERS = True

FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Map ``value`` to one of ``num_sections`` equal-width sections of a range.

    The range [range_start, range_end] is cut into ``num_sections`` equal
    parts numbered from 1. Values at or before ``range_start`` land in
    section 1; values at or past ``range_end`` land in the last section.
    A value sitting exactly on an interior boundary belongs to the section
    that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into (>= 1).
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    :raises ValueError: If ``num_sections`` is smaller than 1.
    """
    if num_sections < 1:
        raise ValueError("num_sections must be at least 1")

    # Clamp values that sit on or outside the ends of the range.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply by num_sections *before* dividing by the span. Dividing by a
    # pre-computed float section size (e.g. 2751 / 9) rounds that size and
    # pushes exact boundary values one section too low; with integer inputs
    # this form is exact integer arithmetic.
    span = range_end - range_start
    section_index = int((value - range_start) * num_sections // span) + 1

    # Never report more sections than were requested.
    return min(section_index, num_sections)

def get_name(name):
    """Collapse a character name into its group label, if it has one.

    With ``USE_GENDER_NAMES`` set, the groups are "Male" / "Female" (looked
    up in ``MALES`` / ``LADIES``); otherwise they are "Protagonist" / "Love"
    (looked up in ``PROTAGONISTS`` / ``LOVES``). A name found in neither
    list of the active pair is returned unchanged.
    """
    if USE_GENDER_NAMES:
        groups = ((MALES, "Male"), (LADIES, "Female"))
    else:
        groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))

    # First matching group wins, mirroring an if/elif chain.
    for members, label in groups:
        if name in members:
            return label
    return name

# Accumulators for this cell. find_section() returns 1-based chunk numbers,
# so the per-chunk lists get CHUNKING + 1 slots and slot 0 stays empty.
data = []
chunks = [{} for _ in range(CHUNKING + 1)]
chunk_totals = {}
chunk_all_totals = [0] * (CHUNKING + 1)

for file in csv_files:
    film = file[4:-4]
    with open(file) as f:
        # The first two lines of every file are skipped.
        lines = f.readlines()[2:]
        for line in lines:
            line = line.split(",")
            # Column 5 holds the identity; rows without one are ignored.
            if line[5] == "\"N/A\"":
                continue

            total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
            # Both fields are sliced to drop their first and last character.
            frame = int(line[0][1:-1])
            name = line[5][1:-1]

            if USE_GENERAL_NAMES:
                if name in RIVALS:
                    name = "Rival"
                name = get_name(name)

            if IGNORE_MINOR_CHARACTERS:
                if USE_GENERAL_NAMES:
                    is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
                else:
                    is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
                if not is_major:
                    continue

            time = find_section(0, total, CHUNKING, frame)
            # Keys are film-qualified from here on.
            name = film + ":::" + name
            chunk_all_totals[time] += 1

            # Number of kept rows of this film in this chunk.
            chunk_key = film + "---" + f"{time}"
            chunk_totals[chunk_key] = chunk_totals.get(chunk_key, 0) + 1
            # Number of kept rows of this identity in this chunk.
            chunks[time][name] = chunks[time].get(name, 0) + 1

for i in range(CHUNKING + 1):
    # identity -> one value per film in which it appears in chunk i
    names = {}
    for name in chunks[i]:
        films = name.split(":::")
        real_name = films[1]
        real_film = films[0]
        total = chunk_totals[real_film + "---" + f"{i}"]
        character_in_chunk = chunks[i][name]
        avg = character_in_chunk / total
        names.setdefault(real_name, []).append(avg if USE_FILM_AVERAGE else character_in_chunk)

    for name, vals in names.items():
        # Population standard deviation across films.
        real_sd = np.std(vals, ddof=0) if vals else 0.0
        data.append([i, name, real_sd])

# "Average" line: mean of the per-identity standard deviations in each chunk.
# The divisor is the number of identities actually present in the chunk
# instead of a fixed 3, so the line stays a true mean when a role is missing
# from a chunk or when individual names are plotted.
identity_rows = list(data)
for i in range(1, CHUNKING + 1):
    chunk_values = [row[2] for row in identity_rows if row[0] == i]
    chunk_mean = sum(chunk_values) / len(chunk_values) if chunk_values else 0.0
    data.append([i, "Average", chunk_mean])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first whitespace-separated word of each identity.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

fig = px.line(df, x="Timeline", y="Value", color="Identity",
              title=f"Standard Deviation of Temporal Presence<br><sub>{CHUNKING} Chunks</sub>",
              markers=True,
              color_discrete_sequence=px.colors.qualitative.Vivid)

fig.update_layout(
    plot_bgcolor='rgb(240, 240, 230)',
    font=dict(family="Georgia", size=14),
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    margin=dict(t=80, b=40, l=40, r=40),
    height=460
)
fig.update_xaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_yaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.show()


### SIZE SD

In [16]:
import plotly.express as px
import pandas as pd
import numpy as np
import glob
import os

CHUNKING = 9

csv_files = glob.glob("csv/*.csv")

USE_GENERAL_NAMES = True
USE_GENDER_NAMES = False
USE_INDIVIDUAL_AVERAGE_NOT_THREE = True
USE_FILM_AVERAGE = True
IGNORE_MINOR_CHARACTERS = True

FILM_FRAME_TOTALS = {
    "Cliff": 2097, "Our Village": 2377, "Diary of Nurse": 2297, "Sabrina": 2751,
    "The Man from Laramie": 2477, "Human Desire": 2200, "Chikamatsu Story": 2460,
    "Black River": 2552, "Duel in the Sun": 3207, "Gilda": 2652, "Jigoku Mon": 2151,
    "Kurutta Kajitsu": 2076, "Notorious": 2433, "Oklahoma": 3577,
    "Vikings": 2740, "Waves of Life": 2465
}

RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]
PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Map ``value`` to one of ``num_sections`` equal-width sections of a range.

    The range [range_start, range_end] is cut into ``num_sections`` equal
    parts numbered from 1. Values at or before ``range_start`` land in
    section 1; values at or past ``range_end`` land in the last section.
    A value sitting exactly on an interior boundary belongs to the section
    that starts there.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into (>= 1).
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    :raises ValueError: If ``num_sections`` is smaller than 1.
    """
    if num_sections < 1:
        raise ValueError("num_sections must be at least 1")

    # Clamp values that sit on or outside the ends of the range.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply by num_sections *before* dividing by the span. Dividing by a
    # pre-computed float section size (e.g. 2751 / 9) rounds that size and
    # pushes exact boundary values one section too low; with integer inputs
    # this form is exact integer arithmetic.
    span = range_end - range_start
    section_index = int((value - range_start) * num_sections // span) + 1

    # Never report more sections than were requested.
    return min(section_index, num_sections)

def get_name(name):
    """Collapse a character name into its group label, if it has one.

    With ``USE_GENDER_NAMES`` set, the groups are "Male" / "Female" (looked
    up in ``MALES`` / ``LADIES``); otherwise they are "Protagonist" / "Love"
    (looked up in ``PROTAGONISTS`` / ``LOVES``). A name found in neither
    list of the active pair is returned unchanged.
    """
    if USE_GENDER_NAMES:
        groups = ((MALES, "Male"), (LADIES, "Female"))
    else:
        groups = ((PROTAGONISTS, "Protagonist"), (LOVES, "Love"))

    # Walk the active pair in order and keep the first label that matches.
    label = name
    for members, group_label in groups:
        if name in members:
            label = group_label
            break
    return label

# Accumulators for this cell. find_section() returns 1-based chunk numbers,
# so the per-chunk list gets CHUNKING + 1 slots and slot 0 stays empty.
data = []
chunks = [{} for _ in range(CHUNKING + 1)]
chunk_totals = {}

for file in csv_files:
    film = file[4:-4]
    with open(file) as f:
        # The first two lines of every file are skipped.
        lines = f.readlines()[2:]
        for line in lines:
            line = line.split(",")
            # Column 5 holds the identity; rows without one are ignored.
            if line[5] == "\"N/A\"":
                continue

            total = FILM_FRAME_TOTALS[film]
            # Fields are sliced to drop their first and last character.
            frame = int(line[0][1:-1])
            name = line[5][1:-1]
            # The face box drops two characters on each side, then splits on
            # backslash + full-width comma + space. The backslash is written
            # escaped: "\，" is not a valid escape sequence and triggers a
            # SyntaxWarning on recent Python versions.
            face_box_text = line[1][2:-2]
            face_box = [float(x) for x in face_box_text.split("\\， ")]
            face_box_width, face_box_height = face_box[2], face_box[3]

            if USE_GENERAL_NAMES:
                if name in RIVALS:
                    name = "Rival"
                name = get_name(name)

            # Without generalized names the role labels never match, so the
            # filter has to test the character lists instead; otherwise every
            # row would be dropped.
            if IGNORE_MINOR_CHARACTERS:
                if USE_GENERAL_NAMES:
                    is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
                else:
                    is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
                if not is_major:
                    continue

            time = find_section(0, total, CHUNKING, frame)
            full_name = film + ":::" + name

            # Row counter used as the divisor later: per identity and chunk,
            # or per film and chunk, depending on the switch.
            if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
                chunk_key = full_name + "---" + str(time)
            else:
                chunk_key = film + "---" + str(time)
            chunk_totals[chunk_key] = chunk_totals.get(chunk_key, 0) + 1

            # Sum of face-box areas for this identity in this chunk.
            chunks[time][full_name] = chunks[time].get(full_name, 0) + face_box_width * face_box_height

# Now compute standard deviation
for i in range(CHUNKING + 1):
    # identity -> one average area per film in which it appears in chunk i
    names = {}
    for full_name in chunks[i]:
        film, real_name = full_name.split(":::")
        if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
            total = chunk_totals[full_name + "---" + str(i)]
        else:
            total = chunk_totals[film + "---" + str(i)]

        avg_area = chunks[i][full_name] / total
        names.setdefault(real_name, []).append(avg_area)

    for name, vals in names.items():
        # Population standard deviation across films.
        real_sd = np.std(vals, ddof=0) if vals else 0.0
        data.append([i, name, real_sd])

# "Average" line: mean of the per-identity standard deviations in each chunk.
# The divisor is the number of identities actually present in the chunk
# instead of a fixed 3, so the line stays a true mean when a role is missing
# from a chunk or when individual names are plotted.
identity_rows = list(data)
for i in range(1, CHUNKING + 1):
    chunk_values = [row[2] for row in identity_rows if row[0] == i]
    chunk_mean = sum(chunk_values) / len(chunk_values) if chunk_values else 0.0
    data.append([i, "Average", chunk_mean])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
# Keep only the first whitespace-separated word of each identity.
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

fig = px.line(df, x="Timeline", y="Value", color="Identity",
              title=f"Standard Deviation of Face Box Size <br><sub>{CHUNKING} Chunks</sub>",
              markers=True, color_discrete_sequence=px.colors.qualitative.Vivid)
fig.update_layout(
    plot_bgcolor='rgb(240, 240, 230)',
    font=dict(family="Georgia", size=14),
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    margin=dict(t=80, b=40, l=40, r=40), height=460
)
fig.update_xaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.update_yaxes(showgrid=True, gridcolor='rgb(200, 190, 150)', zeroline=False)
fig.show()

### FILM-SENSITIVE PER FILM TIME

In [29]:
CHUNKING = 9

# Only use this for one film
# Use plotly to visualize something

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob(f"csv/fading-out/*.csv")
# Convert all entries to structured data with attribute 0 as frame number, 1 as Face Box, and 5 as Identity
# Skip first two lines of each file
# Skip rows where Identity is N/A

USE_GENERAL_NAMES = False
USE_GENDER_NAMES = False

USE_AVERAGE = False

IGNORE_MINOR_CHARACTERS = True

FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Return the 1-based index of the equal-width section that contains value.

    The range [range_start, range_end] is split into num_sections sections of
    equal width. Section k covers the half-open interval
    [range_start + (k - 1) * width, range_start + k * width), so a value that
    lies exactly on an interior boundary belongs to the section that starts
    there. Values at or outside either end of the range are clamped to the
    first or last section.

    :param range_start: The start of the range (inclusive).
    :param range_end: The end of the range (inclusive).
    :param num_sections: Number of sections to divide the range into (>= 1).
    :param value: The value for which we want to determine the section index.
    :return: The section index (1-based) where the value fits.
    :raises ValueError: If num_sections is less than 1.
    """
    if num_sections < 1:
        raise ValueError("num_sections must be at least 1")

    # Clamp values at or beyond the ends of the range
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Multiply before dividing: for integer inputs this stays in exact integer
    # arithmetic, so boundary values are not pushed into the previous section
    # by float rounding (e.g. 1 // (3 / 30) evaluates to 9.0, not 10.0).
    # range_end > range_start is guaranteed here by the two guards above.
    scaled_offset = (value - range_start) * num_sections
    section_index = int(scaled_offset // (range_end - range_start)) + 1

    # Guard against float rounding nudging the index past the last section
    return min(section_index, num_sections)

def get_name(name):
	"""
	Map a character name to a generic label, or return it unchanged.

	With USE_GENDER_NAMES on, names in MALES become "Male" and names in
	LADIES become "Female". Otherwise names in PROTAGONISTS become
	"Protagonist" and names in LOVES become "Love". A name that is in
	neither of the two active lists is returned as-is.
	"""
	if USE_GENDER_NAMES:
		if name in MALES:
			return "Male"
		if name in LADIES:
			return "Female"
		return name

	if name in PROTAGONISTS:
		return "Protagonist"
	if name in LOVES:
		return "Love"
	return name

# Per-section tallies built from the CSV rows.
# chunks[i] maps identity -> number of kept detections in timeline section i.
# find_section() is 1-based, so chunks[0] is allocated but never filled.
data = []
chunks = [{} for _ in range(CHUNKING + 1)]
# chunk_totals["<film>---<i>"] counts every kept detection of that film in section i
chunk_totals = {
	f"{film}---{i}": 0
	for i in range(CHUNKING + 1)
	for film in FILM_FRAME_TOTALS
}
name_to_film = {}
all_names = []

for file in csv_files:
	# The CSV's base name (without extension) is the film title and the
	# key into FILM_FRAME_TOTALS
	ft = os.path.splitext(os.path.basename(file))[0]
	with open(file) as f:
		lines = f.readlines()[2:]  # skip the first two lines of each file

	for line in lines:
		line = line.split(",")
		# Blank or truncated rows have no identity column; skip them rather
		# than failing with IndexError. Rows without an identity are skipped too.
		if len(line) < 6 or line[5] == "\"N/A\"":
			continue

		total = FILM_FRAME_TOTALS[ft]
		# Fields are wrapped in quotes; drop the first and last character
		frame = int(line[0][1:-1])
		name = line[5][1:-1]

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_major = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_major = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_major:
				continue

		# Bin the frame number into one of CHUNKING sections of the film
		time = find_section(0, total, CHUNKING, frame)
		chunk_flag = ft + "---" + f"{time}"
		if name not in all_names:
			all_names.append(name)

		chunk_totals[chunk_flag] += 1
		chunks[time][name] = chunks[time].get(name, 0) + 1

		# Remember the first film in which each identity was seen
		name_to_film.setdefault(name, ft)

# Give every identity an entry in every section (1..CHUNKING) so that each
# plotted line spans the whole timeline, with explicit zeros where needed.
for n in all_names:
	for i in range(1, CHUNKING + 1):
		chunks[i].setdefault(n, 0)

# Turn the tallies into rows of [section, identity, value, role, film title].
# Only identities listed in RIVALS are emitted; all others are skipped.
for i in range(CHUNKING + 1):
	for name in chunks[i]:
		if name not in RIVALS:
			continue
		gen = "Rival"
		count = chunks[i][name]
		film = name_to_film[name]
		if USE_AVERAGE and count != 0:
			# Share of the film's kept detections in this section
			value = count / chunk_totals[film + "---" + str(i)]
		else:
			value = count
		data.append([i, name, value, gen, film])

# Assemble the rows into a frame; identities are shortened to their first word
df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value", "Role", "Title"])
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Draw one line (with markers) per identity across the timeline sections
plot_title = "Temporal Presence in <i>Oklahoma!</i> (Jud) and <i>The Cliff</i> (Professor)<br><sub>Raw face count</sub>"
fig = px.line(
	df,
	x="Timeline",
	y="Value",
	color="Identity",
	title=plot_title,
	markers=True,
	color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig.update_layout(
	plot_bgcolor='rgb(240, 240, 230)',
	font={"family": "Georgia", "size": 14},
	title_font={"size": 22},
	legend_title_font={"size": 16},
	margin={"t": 80, "b": 40, "l": 40, "r": 40},
	height=460,
)

# Export the plotted table alongside the figure
df.to_csv("All_Chunked_Presence.csv")

# Both axes share the same grid styling
grid_style = {"showgrid": True, "gridcolor": 'rgb(200, 190, 150)', "zeroline": False}
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)
fig.show()

