### GENERAL

In [91]:
# Use plotly to visualize the face-recognition results

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

# The pattern has no wildcard, so at most the Gilda CSV is matched in this cell.
csv_files = glob.glob("csv/Gilda.csv")
# Convert all entries to structured data with attribute 0 as frame number, 1 as Face Box, and 5 as Identity
# Skip first two lines of each file
# Skip rows where Identity is N/A

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = False
# Read by get_name(): True -> "Male"/"Female" labels, False -> "Protagonist"/"Love" labels.
USE_GENDER_NAMES = True
# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv".
# The loading loop divides each frame number by this value to get the "Timeline" position.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
# NOTE(review): each list has 16 entries, presumably one per film in FILM_FRAME_TOTALS order — confirm.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		categories = (("Male", MALES), ("Female", LADIES))
	else:
		categories = (("Protagonist", PROTAGONISTS), ("Love", LOVES))
	# First matching list wins, in the order given above.
	for label, members in categories:
		if name in members:
			return label
	return name

# Collect one row per recognized face: [frame, face box text, identity, frame / film total].
data = []
for file in csv_files:
	with open(file) as f:
		# The first two lines of each file are not data rows.
		for raw_line in f.readlines()[2:]:
			fields = raw_line.split(",")
			if fields[5] == "\"N/A\"":
				continue  # no identity attached to this face

			# The film is identified by the CSV base name without its ".csv" suffix.
			total = FILM_FRAME_TOTALS[os.path.basename(file)[:-4]]
			# Frame number and identity are stored with one wrapping character on each side.
			frame = int(fields[0][1:-1])
			name = fields[5][1:-1]

			if USE_GENERAL_NAMES:
				name = get_name("Rival" if name in RIVALS else name)

			if IGNORE_MINOR_CHARACTERS:
				if USE_GENERAL_NAMES:
					is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
				else:
					is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
				if not is_main:
					continue

			data.append([frame, fields[1], name, frame / total])

df = pd.DataFrame(data, columns=["Frame", "Face Box", "Identity", "Timeline"])
# Keep only the first word of each identity so the axis labels stay short.
df["Identity"] = df["Identity"].astype(str).apply(lambda x: x.split(" ")[0])

# Scatter plot: one marker per recognized face, positioned by relative time in the film.
fig = px.scatter(df, height=360, x="Timeline", y="Identity", color="Identity", title="Face Recognition Results in Gilda")
fig.show()



### CHUNKED PRESENCE

In [93]:
# Number of equal timeline sections each film is divided into (passed to find_section()).
CHUNKING = 8

# Use plotly to visualize chunked character presence

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

# The pattern has no wildcard, so at most the Gilda CSV is matched in this cell.
csv_files = glob.glob("csv/Gilda.csv")
# Convert all entries to structured data with attribute 0 as frame number, 1 as Face Box, and 5 as Identity
# Skip first two lines of each file
# Skip rows where Identity is N/A

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True -> "Male"/"Female" labels, False -> "Protagonist"/"Love" labels.
USE_GENDER_NAMES = True

# True -> plot each identity's share of the section's detections; False -> plot raw counts.
USE_AVERAGE = True

# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv"; used as the range end for find_section().
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Locate ``value`` among ``num_sections`` equal-width slices of a range.

    The span from range_start to range_end is cut into num_sections slices
    of equal width, and the 1-based number of the slice holding value is
    returned.

    :param range_start: Lower bound of the range; values at or below it map to section 1.
    :param range_end: Upper bound of the range; values at or above it map to the last section.
    :param num_sections: How many equal-width sections the range is cut into.
    :param value: The value to place.
    :return: The 1-based section number, never larger than num_sections.
    """
    # Anything on or outside the bounds is pinned to the nearest end section.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Width of one section, then how many whole widths lie below the value.
    width = (range_end - range_start) / num_sections
    whole_widths = int((value - range_start) // width)

    # Shift to 1-based numbering and cap at the last section.
    return min(whole_widths + 1, num_sections)

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		categories = (("Male", MALES), ("Female", LADIES))
	else:
		categories = (("Protagonist", PROTAGONISTS), ("Love", LOVES))
	# First matching list wins, in the order given above.
	for label, members in categories:
		if name in members:
			return label
	return name

# Count, per timeline section, how often each identity is detected, then plot it.
data = []
# find_section() numbers sections 1..CHUNKING, so index 0 of both lists stays unused.
chunks = [{} for _ in range(CHUNKING + 1)]
chunk_totals = [0] * (CHUNKING + 1)

for file in csv_files:
	# The film is identified by the CSV base name without its ".csv" suffix.
	film = os.path.basename(file)[:-4]
	with open(file) as f:
		# The first two lines of each file are not data rows.
		lines = f.readlines()[2:]
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this face

		total = FILM_FRAME_TOTALS[film]
		# Frame number and identity are stored with one wrapping character on each side.
		frame = int(line[0][1:-1])
		name = line[5][1:-1]

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_main:
				continue

		section = find_section(0, total, CHUNKING, frame)
		chunk_totals[section] += 1
		chunks[section][name] = chunks[section].get(name, 0) + 1

# Sections are 1-based: iterate 1..CHUNKING so the last section is included
# (range(CHUNKING) would visit the empty slot 0 and drop the final section).
for i in range(1, CHUNKING + 1):
	for name in chunks[i]:
		if USE_AVERAGE:
			# Share of this section's counted detections that belong to this identity.
			data.append([i, name, chunks[i][name] / chunk_totals[i]])
		else:
			data.append([i, name, chunks[i][name]])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first word of each identity so the legend stays short.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Chunked Average Temporal Presence in Gilda")
fig.show()



### CHUNKED FACE BOX SIZE (AVERAGE OR TOTAL)
[DEPRECATED]

In [87]:
# Number of equal timeline sections each film is divided into (passed to find_section()).
CHUNKING = 8

# Use plotly to visualize chunked face-box size

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")
# Convert all entries to structured data with attribute 0 as frame number, 1 as Face Box, and 5 as Identity
# Skip first two lines of each file
# Skip rows where Identity is N/A

# True -> plot the summed face-box area divided by the number of samples; False -> plot the sum.
USE_AVERAGE = True

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True -> "Male"/"Female" labels, False -> "Protagonist"/"Love" labels.
USE_GENDER_NAMES = False
# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv"; used as the range end for find_section().
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Locate ``value`` among ``num_sections`` equal-width slices of a range.

    The span from range_start to range_end is cut into num_sections slices
    of equal width, and the 1-based number of the slice holding value is
    returned.

    :param range_start: Lower bound of the range; values at or below it map to section 1.
    :param range_end: Upper bound of the range; values at or above it map to the last section.
    :param num_sections: How many equal-width sections the range is cut into.
    :param value: The value to place.
    :return: The 1-based section number, never larger than num_sections.
    """
    # Anything on or outside the bounds is pinned to the nearest end section.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Width of one section, then how many whole widths lie below the value.
    width = (range_end - range_start) / num_sections
    whole_widths = int((value - range_start) // width)

    # Shift to 1-based numbering and cap at the last section.
    return min(whole_widths + 1, num_sections)

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		categories = (("Male", MALES), ("Female", LADIES))
	else:
		categories = (("Protagonist", PROTAGONISTS), ("Love", LOVES))
	# First matching list wins, in the order given above.
	for label, members in categories:
		if name in members:
			return label
	return name

# Accumulate, per timeline section and identity, the face-box area (width * height), then plot it.
data = []
# find_section() numbers sections 1..CHUNKING, so index 0 of both lists stays unused.
chunks = [{} for _ in range(CHUNKING + 1)]       # section -> identity -> summed area
chunkcounts = [{} for _ in range(CHUNKING + 1)]  # section -> identity -> number of samples

for file in csv_files:
	# The film is identified by the CSV base name without its ".csv" suffix.
	film = os.path.basename(file)[:-4]
	with open(file) as f:
		# The first two lines of each file are not data rows.
		lines = f.readlines()[2:]
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this face

		total = FILM_FRAME_TOTALS[film]
		# Frame number and identity are stored with one wrapping character on each side.
		frame = int(line[0][1:-1])
		name = line[5][1:-1]

		# The face box field carries two wrapping characters on each side; its numbers are
		# separated by a backslash, a full-width comma and a space. The backslash is written
		# as "\\" because "\，" is an invalid escape sequence (same string value, no warning).
		face_box_text = line[1][1:-1][1:-1]
		face_box = [float(x) for x in face_box_text.split("\\， ")]
		# NOTE(review): indices 2 and 3 are used directly as width and height — confirm the box layout.
		face_box_width = face_box[2]
		face_box_height = face_box[3]

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_main:
				continue

		section = find_section(0, total, CHUNKING, frame)
		chunks[section][name] = chunks[section].get(name, 0) + face_box_height * face_box_width
		chunkcounts[section][name] = chunkcounts[section].get(name, 0) + 1

# Sections are 1-based: iterate 1..CHUNKING so the last section is included
# (range(CHUNKING) would visit the empty slot 0 and drop the final section).
for i in range(1, CHUNKING + 1):
	for name in chunks[i]:
		if USE_AVERAGE:
			chunks[i][name] /= chunkcounts[i][name]
		data.append([i, name, chunks[i][name]])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first word of each identity so the legend stays short.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Face Recognition")
fig.show()

### ACTION UNITS
[DEPRECATED]

In [30]:
# Use plotly to visualize action-unit values per recognized face

# Column selectors for the emotion CSV: the script reads column AU + AU_R + 1 as `r`
# and column AU + AU_E + 1 as `e`.
AU = 2
AU_R = 1
AU_E = 18

# Rows whose `e` value is below this threshold are skipped.
E_TH = 0.5

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")
# Convert all entries to structured data with attribute 0 as frame number, 1 as Face Box, and 5 as Identity
# Skip first two lines of each file
# Skip rows where Identity is N/A

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True -> "Male"/"Female" labels, False -> the role-based labels.
USE_GENDER_NAMES = True
# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv"; only these six films are listed here.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
}

# CSV files whose film name is not among these keys are skipped by the loading loop.
FILMS_WITH_EMOTION = FILM_FRAME_TOTALS.keys()

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki"]

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		if name in MALES:
			name = "Male"
		elif name in LADIES:
			name = "Female"
	else:
		if name in PROTAGONISTS:
			# Was "Fan Jun" (a character name): that label is not accepted by the
			# minor-character filter, which expects "Protagonist".
			name = "Protagonist"
		elif name in LOVES:
			name = "Love"
	return name

def _load_emotion_rows(emotion_path):
	"""Index an emotion CSV by its first column with the last 4 characters removed.

	Returns a dict mapping that key to the comma-split row. When a key occurs
	more than once, the first row is kept (same row a top-down scan would find).
	"""
	rows = {}
	with open(emotion_path) as ef:
		for ef_line in ef:
			fields = ef_line.split(",")
			rows.setdefault(fields[0][:-4], fields)
	return rows


# Collect one row per recognized face that has a matching emotion row above the threshold.
data = []
for file in csv_files:
	# The film is identified by the CSV base name without its ".csv" suffix.
	film = os.path.basename(file)[:-4]
	if film not in FILMS_WITH_EMOTION:
		continue
	emotion_file = "openface_cleaned/" + film + ".csv"
	# Loaded on first use and reused for every row of this film, instead of
	# re-reading the whole emotion file for each face row.
	emotion_rows = None
	with open(file) as f:
		# The first two lines of each file are not data rows.
		lines = f.readlines()[2:]
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this face

		# Column 4 holds a path; it is matched, minus its first 6 characters,
		# against the emotion file's first column minus its last 4 characters.
		path = line[4][1:-1]
		if emotion_rows is None:
			emotion_rows = _load_emotion_rows(emotion_file)
		emotion_row = emotion_rows.get(path[6:])
		if emotion_row is None:
			continue

		log1 = int(AU) + int(AU_R) + 1
		log2 = int(AU) + int(AU_E) + 1
		# Parsed as a number so the y axis is numeric rather than categorical.
		r = float(emotion_row[log1])
		e = float(emotion_row[log2])

		if e < E_TH:
			continue

		total = FILM_FRAME_TOTALS[film]
		# Frame number and identity are stored with one wrapping character on each side.
		frame = int(line[0][1:-1])
		name = line[5][1:-1]

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_main:
				continue

		data.append([frame, r, name, frame / total, film])

# NOTE: the "Face Box" column holds the action-unit value `r` in this cell, not a face box.
df = pd.DataFrame(data, columns=["Frame", "Face Box", "Identity", "Timeline", "Film"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first word of each identity so the legend stays short.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a scatter plot of the data
fig = px.scatter(df, x="Timeline", y="Face Box", color="Identity", hover_data=["Film"], title="Face Recognition")
fig.show()

In [None]:
# Number of equal timeline sections each film is divided into (passed to find_section()).
CHUNKING = 7

# Column selectors for the single-AU mode: the script reads column AU + AU_R + 1 as `r`
# and column AU + AU_E + 1 as `e`.
AU = 0
# Labels appended to the identity ("<identity>-AU<label>") when ALL_AU is enabled.
AUS = ["01", "02", "04", "05", "06", "07", "09", "10", "12", "14", "15", "17", "20", "23", "25", "26", "28", "45"]
print(len(AUS))

ALL_AU = True # Use all AUs in the same graph
COUNTING = True # Count number or intensity

AU_R = 1
AU_E = 18

E_TH = 0.5 #  Presence threshold (0.5 for present and -0.5 for all)
R_TH = 3 # Intensity threshold (1-5)

# Use plotly to visualize chunked action-unit activity

import plotly.express as px
import pandas as pd

# Json
import json

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")
# Convert all entries to structured data with attribute 0 as frame number, 1 as Face Box, and 5 as Identity
# Skip first two lines of each file
# Skip rows where Identity is N/A

# True -> each accumulated value is divided by the identity's per-section count before plotting.
USE_AVERAGE = True

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = True
# Read by get_name(): True -> "Male"/"Female" labels, False -> "Protagonist"/"Love" labels.
USE_GENDER_NAMES = True
# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv"; only these six films are listed here.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
}

# CSV files whose film name is not among these keys are skipped by the loading loop.
FILMS_WITH_EMOTION = FILM_FRAME_TOTALS.keys()

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki"]

def find_section(range_start, range_end, num_sections, value):
    """
    Locate ``value`` among ``num_sections`` equal-width slices of a range.

    The span from range_start to range_end is cut into num_sections slices
    of equal width, and the 1-based number of the slice holding value is
    returned.

    :param range_start: Lower bound of the range; values at or below it map to section 1.
    :param range_end: Upper bound of the range; values at or above it map to the last section.
    :param num_sections: How many equal-width sections the range is cut into.
    :param value: The value to place.
    :return: The 1-based section number, never larger than num_sections.
    """
    # Anything on or outside the bounds is pinned to the nearest end section.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Width of one section, then how many whole widths lie below the value.
    width = (range_end - range_start) / num_sections
    whole_widths = int((value - range_start) // width)

    # Shift to 1-based numbering and cap at the last section.
    return min(whole_widths + 1, num_sections)

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		categories = (("Male", MALES), ("Female", LADIES))
	else:
		categories = (("Protagonist", PROTAGONISTS), ("Love", LOVES))
	# First matching list wins, in the order given above.
	for label, members in categories:
		if name in members:
			return label
	return name

def _load_emotion_rows(emotion_path):
	"""Index an emotion CSV by its first column with the last 4 characters removed.

	Returns a dict mapping that key to the comma-split row. When a key occurs
	more than once, the first row is kept (same row a top-down scan would find).
	"""
	rows = {}
	with open(emotion_path) as ef:
		for ef_line in ef:
			fields = ef_line.split(",")
			rows.setdefault(fields[0][:-4], fields)
	return rows


# Accumulate action-unit activity per timeline section and identity, then plot it.
data = []
# find_section() numbers sections 1..CHUNKING, so index 0 of both lists stays unused.
chunks = [{} for _ in range(CHUNKING + 1)]       # section -> series name -> accumulated value
chunkcounts = [{} for _ in range(CHUNKING + 1)]  # section -> identity -> number of accumulations

for file in csv_files:
	# The film is identified by the CSV base name without its ".csv" suffix.
	film = os.path.basename(file)[:-4]
	if film not in FILMS_WITH_EMOTION:
		continue
	emotion_file = "openface_cleaned/" + film + ".csv"
	# Loaded on first use and reused for every row of this film, instead of
	# re-reading the whole emotion file for each face row.
	emotion_rows = None
	with open(file) as f:
		# The first two lines of each file are not data rows.
		lines = f.readlines()[2:]
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this face

		total = FILM_FRAME_TOTALS[film]
		# Frame number, identity and path are stored with one wrapping character on each side.
		frame = int(line[0][1:-1])
		name = line[5][1:-1]
		path = line[4][1:-1]

		# The path, minus its first 6 characters, is matched against the emotion
		# file's first column minus its last 4 characters.
		if emotion_rows is None:
			emotion_rows = _load_emotion_rows(emotion_file)
		emotion_row = emotion_rows.get(path[6:])
		if emotion_row is None:
			continue

		log1 = int(AU) + int(AU_R) + 1
		log2 = int(AU) + int(AU_E) + 1
		r = float(emotion_row[log1])
		e = float(emotion_row[log2])

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_main:
				continue

		section = find_section(0, total, CHUNKING, frame)
		base_name = name
		if ALL_AU:
			# NOTE(review): this loop reads `r` from columns 1-17 and `e` from columns 18-34,
			# while the single-AU path reads columns AU + AU_R + 1 and AU + AU_E + 1 (2 and 19
			# for AU = 0). The two layouts are offset by one column, and AUS has 18 labels for
			# 17 iterations — confirm against the openface_cleaned CSV header before relying on
			# the AU labels.
			for au_index in range(1, 18):
				r_val = float(emotion_row[au_index])
				e_val = float(emotion_row[au_index + 17])
				if e_val > E_TH and r_val > R_TH:
					series = base_name + "-AU" + AUS[au_index - 1]
					increment = 1 if COUNTING else r_val
					chunks[section][series] = chunks[section].get(series, 0) + increment
					chunkcounts[section][base_name] = chunkcounts[section].get(base_name, 0) + 1
		elif e > E_TH and r > R_TH:
			increment = 1 if COUNTING else r
			chunks[section][name] = chunks[section].get(name, 0) + increment
			chunkcounts[section][base_name] = chunkcounts[section].get(base_name, 0) + 1

# Sections are 1-based: iterate 1..CHUNKING so the last section is included
# (range(CHUNKING) would visit the empty slot 0 and drop the final section).
for i in range(1, CHUNKING + 1):
	for name in chunks[i]:
		# Series names are "<identity>" or "<identity>-AU<label>".
		base_name = name.split("-")[0]
		if USE_AVERAGE:
			if chunkcounts[i][base_name] == 0:
				chunks[i][name] = 0
			else:
				chunks[i][name] /= chunkcounts[i][base_name]
		data.append([i, name, chunks[i][name]])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first word of each series name so the legend stays short.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Face Recognition")
fig.show()

18
Jigoku Mon
Waves of Life
Duel in the Sun
Black River
Notorious
Oklahoma
Kurutta Kajitsu
Chikamatsu Story
Gilda
Vikings


### CHUNKED PRESENCE WITH FILM MEANS

In [85]:
# Number of equal timeline sections each film is divided into (passed to find_section()).
CHUNKING = 8

# Use plotly to visualize chunked presence averaged over films

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = True # Keep true
# Read by get_name(): True -> "Male"/"Female" labels, False -> "Protagonist"/"Love" labels.
USE_GENDER_NAMES = False

# True -> sum each film's per-section share for an identity; False -> sum raw counts.
USE_FILM_AVERAGE = True # Better keep true

# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv"; used as the range end for
# find_section(), and its length is the divisor of the final per-section mean.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Locate ``value`` among ``num_sections`` equal-width slices of a range.

    The span from range_start to range_end is cut into num_sections slices
    of equal width, and the 1-based number of the slice holding value is
    returned.

    :param range_start: Lower bound of the range; values at or below it map to section 1.
    :param range_end: Upper bound of the range; values at or above it map to the last section.
    :param num_sections: How many equal-width sections the range is cut into.
    :param value: The value to place.
    :return: The 1-based section number, never larger than num_sections.
    """
    # Anything on or outside the bounds is pinned to the nearest end section.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Width of one section, then how many whole widths lie below the value.
    width = (range_end - range_start) / num_sections
    whole_widths = int((value - range_start) // width)

    # Shift to 1-based numbering and cap at the last section.
    return min(whole_widths + 1, num_sections)

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		categories = (("Male", MALES), ("Female", LADIES))
	else:
		categories = (("Protagonist", PROTAGONISTS), ("Love", LOVES))
	# First matching list wins, in the order given above.
	for label, members in categories:
		if name in members:
			return label
	return name

# Count detections per (film, identity) in each timeline section, convert them to
# per-film shares, and average those shares over the films.
data = []
# find_section() numbers sections 1..CHUNKING, so index 0 of both lists stays unused.
chunks = [{} for _ in range(CHUNKING + 1)]  # section -> (film, identity) -> detections
chunk_all_totals = [0] * (CHUNKING + 1)     # section -> detections across all films
chunk_totals = {}                           # (film, section) -> detections in that film

for file in csv_files:
	# The film is identified by the CSV base name without its ".csv" suffix.
	film = os.path.basename(file)[:-4]
	with open(file) as f:
		# The first two lines of each file are not data rows.
		lines = f.readlines()[2:]
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this face

		total = FILM_FRAME_TOTALS[film]
		# Frame number and identity are stored with one wrapping character on each side.
		frame = int(line[0][1:-1])
		name = line[5][1:-1]

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_main:
				continue

		section = find_section(0, total, CHUNKING, frame)
		chunk_all_totals[section] += 1

		# Tuple keys keep film and identity separate without relying on separator strings.
		film_key = (film, section)
		chunk_totals[film_key] = chunk_totals.get(film_key, 0) + 1
		character_key = (film, name)
		chunks[section][character_key] = chunks[section].get(character_key, 0) + 1

# Sections are 1-based: iterate 1..CHUNKING so the last section is included
# (range(CHUNKING) would visit the empty slot 0 and drop the final section).
for i in range(1, CHUNKING + 1):
	names = {}
	for (real_film, real_name), character_in_chunk in chunks[i].items():
		# Share of this film's counted detections in the section that belong to the identity.
		avg = character_in_chunk / chunk_totals[(real_film, i)]
		if USE_FILM_AVERAGE:
			names[real_name] = names.get(real_name, 0) + avg
		else:
			names[real_name] = names.get(real_name, 0) + character_in_chunk

	for name, chunk_total in names.items():
		# NOTE(review): the divisor is the number of films in FILM_FRAME_TOTALS, even if some
		# films contributed nothing to this section — confirm that is the intended mean.
		real_avg = chunk_total / len(FILM_FRAME_TOTALS)
		data.append([i, name, real_avg])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first word of each identity so the legend stays short.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Face Recognition")
fig.show()

### CHUNKED SIZE WITH FILM MEANS

In [84]:
# Number of equal timeline sections each film is divided into (passed to find_section()).
CHUNKING = 8

# Use plotly to visualize chunked face-box size averaged over films

import plotly.express as px
import pandas as pd

# Load all csv files in csv/
import glob
import os

csv_files = glob.glob("csv/*.csv")

# When True, the loading loop replaces rival names with "Rival" and passes every name through get_name().
USE_GENERAL_NAMES = True # Keep true
# Read by get_name(): True -> "Male"/"Female" labels, False -> "Protagonist"/"Love" labels.
USE_GENDER_NAMES = False

# True -> sample counts are kept per (film, identity, section); False -> per (film, section).
USE_INDIVIDUAL_AVERAGE_NOT_THREE = True # Keep true

# True -> sum each film's per-section average area for an identity; False -> sum raw area totals.
USE_FILM_AVERAGE = True # Keep true

# When True, rows whose identity is not one of the main characters (or generic labels) are skipped.
IGNORE_MINOR_CHARACTERS = True

# Frame total per film, keyed by the CSV base name without ".csv"; used as the range end for
# find_section(), and its length is the divisor of the final per-section mean.
FILM_FRAME_TOTALS = {
	"Cliff": 2097,
	"Our Village": 2377,
	"Diary of Nurse": 2297,
	"Sabrina": 2751,
	"The Man from Laramie": 2477,
	"Human Desire": 2200,
	"Chikamatsu Story": 2460,
	"Black River": 2552,
	"Duel in the Sun": 3207,
	"Gilda": 2652,
	"Jigoku Mon": 2151,
	"Kurutta Kajitsu": 2076,
	"Notorious": 2433,
	"Oklahoma": 3577,
	"Vikings": 2740,
	"Waves of Life": 2465
}

# Character names grouped by role / gender; membership drives the relabelling and the minor-character filter.
RIVALS = ["Keming", "Professor Yuan", "Haoru", "David", "Vic", "Carl", "Ishun", "Hitokiri Joe", "Natsuhisa", "Bo Kang", "Wataru", "Jud", "Alex Sebastian", "Ballin", "Einar", "Jesse"]
MALES = ["Zhanwu", "Fan Jun", "Changping", "Linus", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Devlin", "Johnny", "Eric", "Lewt"]
LADIES = ["Shuzhen", "Fang Qing", "Suhua", "Sabrina", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Alicia", "Gilda", "Morgana", "Pearl"]

PROTAGONISTS = ["Zhanwu", "Fang Qing", "Suhua", "Sabrina", "Will", "Jeff", "Mohei", "Nishida", "Haruji", "Jin Zhang", "Moritoo", "Curly", "Alicia", "Johnny", "Eric", "Pearl"]
LOVES = ["Shuzhen", "Fan Jun", "Changping", "Linus", "Barbara", "Vicki", "Osan", "Shizuko", "Eri", "Ye Suping", "Kesa", "Laurey", "Devlin", "Gilda", "Morgana", "Lewt"]

def find_section(range_start, range_end, num_sections, value):
    """
    Locate ``value`` among ``num_sections`` equal-width slices of a range.

    The span from range_start to range_end is cut into num_sections slices
    of equal width, and the 1-based number of the slice holding value is
    returned.

    :param range_start: Lower bound of the range; values at or below it map to section 1.
    :param range_end: Upper bound of the range; values at or above it map to the last section.
    :param num_sections: How many equal-width sections the range is cut into.
    :param value: The value to place.
    :return: The 1-based section number, never larger than num_sections.
    """
    # Anything on or outside the bounds is pinned to the nearest end section.
    if value <= range_start:
        return 1
    if value >= range_end:
        return num_sections

    # Width of one section, then how many whole widths lie below the value.
    width = (range_end - range_start) / num_sections
    whole_widths = int((value - range_start) // width)

    # Shift to 1-based numbering and cap at the last section.
    return min(whole_widths + 1, num_sections)

def get_name(name):
	"""Collapse a main character's name into a generic category label.

	With USE_GENDER_NAMES enabled, names listed in MALES become "Male" and
	names listed in LADIES become "Female". Otherwise names listed in
	PROTAGONISTS become "Protagonist" and names listed in LOVES become
	"Love". A name found in neither list of the active pair is returned
	unchanged.
	"""
	if USE_GENDER_NAMES:
		categories = (("Male", MALES), ("Female", LADIES))
	else:
		categories = (("Protagonist", PROTAGONISTS), ("Love", LOVES))
	# First matching list wins, in the order given above.
	for label, members in categories:
		if name in members:
			return label
	return name

# Sum face-box areas per (film, identity) in each timeline section, convert them to
# per-film averages, and average those over the films.
data = []
# find_section() numbers sections 1..CHUNKING, so index 0 of both lists stays unused.
chunks = [{} for _ in range(CHUNKING + 1)]  # section -> (film, identity) -> summed area
chunk_all_totals = [0] * (CHUNKING + 1)     # section -> samples across all films
# Sample counts used as divisors: keyed by (film, identity, section) when
# USE_INDIVIDUAL_AVERAGE_NOT_THREE is set, otherwise by (film, section).
chunk_totals = {}

for file in csv_files:
	# The film is identified by the CSV base name without its ".csv" suffix.
	film = os.path.basename(file)[:-4]
	with open(file) as f:
		# The first two lines of each file are not data rows.
		lines = f.readlines()[2:]
	for line in lines:
		line = line.split(",")
		if line[5] == "\"N/A\"":
			continue  # no identity attached to this face

		total = FILM_FRAME_TOTALS[film]
		# Frame number and identity are stored with one wrapping character on each side.
		frame = int(line[0][1:-1])
		name = line[5][1:-1]

		# The face box field carries two wrapping characters on each side; its numbers are
		# separated by a backslash, a full-width comma and a space. The backslash is written
		# as "\\" because "\，" is an invalid escape sequence (same string value, no warning).
		face_box_text = line[1][1:-1][1:-1]
		face_box = [float(x) for x in face_box_text.split("\\， ")]
		# NOTE(review): indices 2 and 3 are used directly as width and height — confirm the box layout.
		face_box_width = face_box[2]
		face_box_height = face_box[3]

		if USE_GENERAL_NAMES:
			if name in RIVALS:
				name = "Rival"
			name = get_name(name)

		if IGNORE_MINOR_CHARACTERS:
			if USE_GENERAL_NAMES:
				is_main = name in ["Rival", "Protagonist", "Female", "Male", "Love"]
			else:
				is_main = name in RIVALS or name in PROTAGONISTS or name in LOVES
			if not is_main:
				continue

		section = find_section(0, total, CHUNKING, frame)
		chunk_all_totals[section] += 1

		# Tuple keys keep film and identity separate without relying on separator strings.
		character_key = (film, name)
		if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
			count_key = (film, name, section)
		else:
			count_key = (film, section)
		chunk_totals[count_key] = chunk_totals.get(count_key, 0) + 1
		chunks[section][character_key] = (
			chunks[section].get(character_key, 0) + face_box_height * face_box_width
		)

# Sections are 1-based: iterate 1..CHUNKING so the last section is included
# (range(CHUNKING) would visit the empty slot 0 and drop the final section).
for i in range(1, CHUNKING + 1):
	names = {}
	for (real_film, real_name), character_in_chunk in chunks[i].items():
		if USE_INDIVIDUAL_AVERAGE_NOT_THREE:
			total = chunk_totals[(real_film, real_name, i)]
		else:
			total = chunk_totals[(real_film, i)]
		avg = character_in_chunk / total
		if USE_FILM_AVERAGE:
			names[real_name] = names.get(real_name, 0) + avg
		else:
			names[real_name] = names.get(real_name, 0) + character_in_chunk

	for name, chunk_total in names.items():
		# NOTE(review): the divisor is the number of films in FILM_FRAME_TOTALS, even if some
		# films contributed nothing to this section — confirm that is the intended mean.
		real_avg = chunk_total / len(FILM_FRAME_TOTALS)
		data.append([i, name, real_avg])

df = pd.DataFrame(data, columns=["Timeline", "Identity", "Value"])
df["Identity"] = df["Identity"].astype(str)
# Keep only the first word of each identity so the legend stays short.
df["Identity"] = df["Identity"].apply(lambda x: x.split(" ")[0])

# Create a line plot of the data
fig = px.line(df, x="Timeline", y="Value", color="Identity", title="Face Recognition")
fig.show()