In [176]:
import pandas as pd
import os
import glob

In [None]:
#Quick Rename Files
# glob.glob('./Unity Data/*/*.csv')
# for file_path in glob.glob('./Unity Data/*/*.csv'):
#     file_name = os.path.basename(file_path)
#     new_file_name = file_name.split('_')[-1].split('.')[0].upper() + '_' + file_name.split('_')[0].upper() + '.csv'
#     os.rename(file_path, os.path.join(os.path.dirname(file_path), new_file_name))

# Preparation

* Remove a table entry/exit pair when the time difference between the two events is less than half a second.
* Remove all entries for books that are placed within the first second.

In [299]:
# Every sub-directory of "Unity Data" is treated as one group. Sorting makes the
# group order (and so every downstream table) deterministic, and the isdir
# check skips stray non-directory entries that os.listdir would also return.
groups = sorted(
    entry
    for entry in os.listdir("Unity Data")
    if os.path.isdir(os.path.join("Unity Data", entry))
)
# Map each group name to the sorted list of CSV files inside its folder; only
# CSVs are picked up because every file is later handed to pd.read_csv.
data_file_paths = {group: sorted(glob.glob(f"Unity Data/{group}/*.csv")) for group in groups}

In [184]:
def load_and_clean_data(file_path):
    """Read one CSV log and strip the parts the analysis does not use.

    The "Score", "Penalty", "ID", "RollNumber" and "Group" columns are dropped,
    together with the row labelled 0 (the first row of a freshly read file).

    Args:
        file_path: Anything accepted by ``pd.read_csv``.

    Returns:
        The remaining rows and columns as a DataFrame.

    Raises:
        KeyError: If one of the dropped columns or the row label 0 is missing.
    """
    unused_columns = ["Score", "Penalty", "ID", "RollNumber", "Group"]
    raw = pd.read_csv(file_path)
    return raw.drop(columns=unused_columns).drop(index=0)

def filter_simulation_data(df):
    """Keep only the rows that belong to the simulation run.

    The window starts at the last "SimulationStarted" row and ends at the first
    "SimulationEnded" row (both inclusive). When the log has no
    "SimulationEnded" row, the window runs to the last row of ``df``.

    Args:
        df: Event log with an "EventType" column.

    Returns:
        An independent copy of the selected rows, so later cleaning steps can
        never modify the frame that was passed in.

    Raises:
        IndexError: If ``df`` contains no "SimulationStarted" row.
    """
    start_labels = df.index[(df["EventType"] == "SimulationStarted").to_numpy()]
    end_labels = df.index[(df["EventType"] == "SimulationEnded").to_numpy()]

    start_label = start_labels[-1]
    # Fall back to the final row label (not a column label) when the end marker
    # is missing, so the label slice below stays valid.
    end_label = end_labels[0] if len(end_labels) > 0 else df.index[-1]
    return df.loc[start_label:end_label].copy()

def remove_fake_sitting_indications(df, minimum_time_with_table=500):
    """Drop entry/exit pairs that were under the table for too short a time.

    "EntryUnderTable" and "ExitUnderTable" rows are walked in order of "Time".
    Each exit is paired with the most recent unmatched entry; when the gap
    between the two is below ``minimum_time_with_table`` both rows are removed.
    Events that have no partner (an exit with no preceding entry, or an entry
    that is never followed by an exit) are left in place.

    Args:
        df: Event log with "EventType" and "Time" columns.
        minimum_time_with_table: Shortest stay that counts as real, in the
            same unit as "Time" (milliseconds by default).

    Returns:
        A new DataFrame without the short pairs; ``df`` itself is not modified.
    """
    is_table_event = df["EventType"].isin(["EntryUnderTable", "ExitUnderTable"])
    # A stable sort keeps the logged order of events that share a timestamp.
    table_events = df.loc[is_table_event, ["EventType", "Time"]].sort_values(by="Time", kind="stable")

    labels_to_drop = []
    pending_entry = None  # (index label, time) of an entry still waiting for its exit
    for label, event_type, event_time in zip(
        table_events.index, table_events["EventType"], table_events["Time"]
    ):
        if event_type == "EntryUnderTable":
            pending_entry = (label, event_time)
        elif pending_entry is not None:
            entry_label, entry_time = pending_entry
            if event_time - entry_time < minimum_time_with_table:
                labels_to_drop.extend([entry_label, label])
            pending_entry = None

    return df.drop(index=labels_to_drop)

def remove_initial_books_placement(df, initial_window=1000):
    """Drop "BookPlaced" rows logged right after the simulation started.

    A "BookPlaced" row is removed when its "Time" is strictly less than the
    time of the first "SimulationStarted" row plus ``initial_window``.

    Args:
        df: Event log with "EventType" and "Time" columns.
        initial_window: Length of the ignored start-up window, in the same unit
            as "Time" (milliseconds by default).

    Returns:
        A new DataFrame without the early placements; ``df`` is not modified.

    Raises:
        IndexError: If ``df`` contains no "SimulationStarted" row.
    """
    start_time = df.loc[df["EventType"] == "SimulationStarted", "Time"].iloc[0]
    cutoff_time = start_time + initial_window
    is_early_book = (df["EventType"] == "BookPlaced") & (df["Time"] < cutoff_time)
    return df.drop(index=df.index[is_early_book.to_numpy()])

def get_cleaned_data(file_path, groups):
    """Load one participant file and run every pre-cleaning step on it.

    Args:
        file_path: Path of the CSV log to load.
        groups: Accepted for call-site compatibility; the body does not use it.

    Returns:
        The DataFrame produced by applying, in order, the simulation-window
        filter, the fake-sitting filter and the initial-book-placement filter.
    """
    cleaning_steps = (
        filter_simulation_data,
        remove_fake_sitting_indications,
        remove_initial_books_placement,
    )
    cleaned = load_and_clean_data(file_path)
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned


# Task Specific Analysis


### Book placement (Group 1, 2)

In [232]:
def get_books_placed_stats(df, id, group):
    """Count "BookPlaced" rows before, during and after the earthquake.

    The earthquake window runs from the first "EarthquakeStart" row to the last
    "EarthquakeEnd" row. Slices are label-based and inclusive of both marker
    rows.

    Args:
        df: Event log with an "EventType" column.
        id: Identifier copied into the result.
        group: Group name copied into the result.

    Returns:
        ``[id, group, placed_before, placed_during, placed_after]``.

    Raises:
        IndexError: If either earthquake marker is missing.
    """
    event_type = df["EventType"]
    quake_start = event_type[event_type == "EarthquakeStart"].index[0]
    quake_end = event_type[event_type == "EarthquakeEnd"].index[-1]

    # One boolean flag per row; summing a slice counts the placements in it.
    is_book_placed = event_type == "BookPlaced"
    placed_before = int(is_book_placed.loc[:quake_start].sum())
    placed_during = int(is_book_placed.loc[quake_start:quake_end].sum())
    placed_after = int(is_book_placed.loc[quake_end:].sum())

    return [id, group, placed_before, placed_during, placed_after]

def get_books_placed_stats_for_group(group, data_file_paths):
    """Build the per-participant book-placement table for one group.

    Each file listed under ``data_file_paths[group]`` is cleaned and summarised;
    the participant ID is the file's base name up to its first dot.

    Returns:
        A DataFrame with one row per file and the columns "ID", "Group" and the
        three book-placement counts (before / during / after the earthquake).
    """
    column_names = [
        "ID",
        "Group",
        "BooksPlacedBeforeEarthquake",
        "BooksPlacedDuringEarthquake",
        "BooksPlacedAfterEarthquake",
    ]
    rows = [
        get_books_placed_stats(
            get_cleaned_data(file_path, groups),  # `groups` is read from the notebook's globals
            os.path.basename(file_path).split(".")[0],
            group,
        )
        for file_path in data_file_paths[group]
    ]
    return pd.DataFrame(rows, columns=column_names)

def get_average_book_placed_stats_for_all_groups(data_file_paths):
    """Stack the per-participant book-placement tables of every group.

    Despite the name, nothing is averaged here: the result still has one row
    per participant, and any averaging is left to the caller.

    Args:
        data_file_paths: Mapping of group name to that group's file paths.

    Returns:
        The concatenation of the per-group tables, in mapping order.
    """
    per_group_tables = [
        get_books_placed_stats_for_group(group, data_file_paths)
        for group in data_file_paths
    ]
    return pd.concat(per_group_tables)
# Per-participant book-placement counts for every group; the function only
# concatenates, so the values are not averaged yet.
tt = get_average_book_placed_stats_for_all_groups(data_file_paths)

In [233]:
# Group-level means of the per-participant counts; the ID column is dropped
# first so that only the count columns are averaged.
t = tt.drop(columns='ID').groupby('Group').mean()
t

Unnamed: 0_level_0,BooksPlacedBeforeEarthquake,BooksPlacedDuringEarthquake,BooksPlacedAfterEarthquake
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Group 1,2.65,1.4,4.65
Group 2,3.35,2.5,3.8
Group 3,0.5,0.0,0.4375
Group 4,0.5,0.0,0.142857


### Item Observation Task (Group 3, 4)

In [234]:
def get_items_observed_stats(df, id, group):
    """Count "ItemObserved" rows before, during and after the earthquake.

    The earthquake window runs from the first "EarthquakeStart" row to the last
    "EarthquakeEnd" row; each slice is label-based and includes the marker rows.

    Args:
        df: Event log with an "EventType" column.
        id: Identifier copied into the result.
        group: Group name copied into the result.

    Returns:
        ``[id, group, observed_before, observed_during, observed_after]``.

    Raises:
        IndexError: If either earthquake marker is missing.
    """
    def count_observations(window):
        # Number of "ItemObserved" rows inside one slice of the log.
        return window[window["EventType"] == "ItemObserved"].shape[0]

    quake_start = df[df["EventType"] == "EarthquakeStart"].index[0]
    quake_end = df[df["EventType"] == "EarthquakeEnd"].index[-1]

    return [
        id,
        group,
        count_observations(df.loc[:quake_start]),
        count_observations(df.loc[quake_start:quake_end]),
        count_observations(df.loc[quake_end:]),
    ]

def get_items_observed_stats_for_group(group, data_file_paths):
    """Build the per-participant item-observation table for one group.

    Each file listed under ``data_file_paths[group]`` is cleaned and summarised;
    the participant ID is the file's base name up to its first dot.

    Returns:
        A DataFrame with one row per file and the columns "ID", "Group" and the
        three item-observation counts (before / during / after the earthquake).
    """
    column_names = [
        "ID",
        "Group",
        "ItemsObservedBeforeEarthquake",
        "ItemsObservedDuringEarthquake",
        "ItemsObservedAfterEarthquake",
    ]
    rows = [
        get_items_observed_stats(
            get_cleaned_data(file_path, groups),  # `groups` is read from the notebook's globals
            os.path.basename(file_path).split(".")[0],
            group,
        )
        for file_path in data_file_paths[group]
    ]
    return pd.DataFrame(rows, columns=column_names)

def get_average_items_observed_stats_for_all_groups(data_file_paths):
    """Stack the per-participant item-observation tables of every group.

    Despite the name, nothing is averaged here: the result still has one row
    per participant, and any averaging is left to the caller.

    Args:
        data_file_paths: Mapping of group name to that group's file paths.

    Returns:
        The concatenation of the per-group tables, in mapping order.
    """
    per_group_tables = [
        get_items_observed_stats_for_group(group, data_file_paths)
        for group in data_file_paths
    ]
    return pd.concat(per_group_tables)
# Per-participant item-observation counts for every group; the function only
# concatenates, so the values are not averaged yet.
tt = get_average_items_observed_stats_for_all_groups(data_file_paths)

In [235]:
# Group-level means of the per-participant counts; the ID column is dropped
# first so that only the count columns are averaged.
t = tt.drop(columns='ID').groupby('Group').mean()
t

Unnamed: 0_level_0,ItemsObservedBeforeEarthquake,ItemsObservedDuringEarthquake,ItemsObservedAfterEarthquake
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Group 1,0.0,0.0,0.0
Group 2,0.0,0.0,0.0
Group 3,2.75,0.625,4.8125
Group 4,2.571429,1.5,3.785714


# Participant Actions


### Number of items picked

In [None]:
def get_items_picked_stats(df, id, group):
    """Count "ItemPicked" rows before, during and after the earthquake.

    The earthquake window runs from the first "EarthquakeStart" row to the last
    "EarthquakeEnd" row; each slice is label-based and includes the marker rows.

    Args:
        df: Event log with an "EventType" column.
        id: Identifier copied into the result.
        group: Group name copied into the result.

    Returns:
        ``[id, group, picked_before, picked_during, picked_after]``.

    Raises:
        IndexError: If either earthquake marker is missing.
    """
    quake_start = df.loc[df["EventType"].eq("EarthquakeStart")].index[0]
    quake_end = df.loc[df["EventType"].eq("EarthquakeEnd")].index[-1]

    # Before, during and after windows, in the order they are reported.
    windows = (
        df.loc[:quake_start],
        df.loc[quake_start:quake_end],
        df.loc[quake_end:],
    )
    picked_counts = [len(window[window["EventType"].eq("ItemPicked")]) for window in windows]

    return [id, group, *picked_counts]

def get_items_picked_stats_for_group(group, data_file_paths, get_cleaned_data, groups):
    """Build the per-participant items-picked table for one group.

    Args:
        group: Key into ``data_file_paths``.
        data_file_paths: Mapping of group name to that group's file paths.
        get_cleaned_data: Callable invoked as ``get_cleaned_data(file_path, groups)``
            to obtain each participant's cleaned log.
        groups: Passed through unchanged to ``get_cleaned_data``.

    Returns:
        A DataFrame with one row per file and the columns "ID", "Group" and the
        three items-picked counts (before / during / after the earthquake).
    """
    column_names = [
        "ID",
        "Group",
        "ItemsPickedBeforeEarthquake",
        "ItemsPickedDuringEarthquake",
        "ItemsPickedAfterEarthquake",
    ]
    rows = [
        get_items_picked_stats(
            get_cleaned_data(file_path, groups),
            os.path.basename(file_path).split(".")[0],  # participant ID = base name up to the first dot
            group,
        )
        for file_path in data_file_paths[group]
    ]
    return pd.DataFrame(rows, columns=column_names)

def get_average_items_picked_stats_for_all_groups(data_file_paths, get_cleaned_data, groups):
    """Stack the per-participant items-picked tables of every group.

    Despite the name, nothing is averaged here: the result still has one row
    per participant, and any averaging is left to the caller.

    Args:
        data_file_paths: Mapping of group name to that group's file paths.
        get_cleaned_data: Cleaning callable forwarded to the per-group builder.
        groups: Forwarded unchanged to the per-group builder.

    Returns:
        The concatenation of the per-group tables, in mapping order.
    """
    per_group_tables = [
        get_items_picked_stats_for_group(group, data_file_paths, get_cleaned_data, groups)
        for group in data_file_paths
    ]
    return pd.concat(per_group_tables)

# Per-participant items-picked counts for every group; the function only
# concatenates, so the values are not averaged yet.
tt = get_average_items_picked_stats_for_all_groups(data_file_paths, get_cleaned_data, groups)

In [237]:
# Group-level means of the per-participant counts; the ID column is dropped
# first so that only the count columns are averaged.
t = tt.drop(columns='ID').groupby('Group').mean()
t

Unnamed: 0_level_0,ItemsPickedBeforeEarthquake,ItemsPickedDuringEarthquake,ItemsPickedAfterEarthquake
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Group 1,4.5,2.55,6.65
Group 2,4.8,4.1,6.2
Group 3,5.125,1.5625,10.3125
Group 4,4.214286,2.642857,8.214286


### Took table cover during earthquake
For every participant we collect:
* N_T: number of times the participant went under the table
* A_D: average duration of a stay under the table
* T_D: total duration spent under the table
* A_D_E: average duration under the table during the earthquake
* T_D_E: total duration under the table during the earthquake

In [369]:
def _get_earthquake_times(df: pd.DataFrame) -> tuple[float, float]:
    """Return the earthquake window as ``(start_time, end_time)``.

    The start is the "Time" of the first "EarthquakeStart" row and the end is
    the "Time" of the last "EarthquakeEnd" row.

    Raises:
        ValueError: If the log lacks either kind of earthquake event.
    """
    start_rows = df.loc[df["EventType"] == "EarthquakeStart"]
    end_rows = df.loc[df["EventType"] == "EarthquakeEnd"]

    # Guard clause: both markers are required to define the window.
    if start_rows.empty or end_rows.empty:
        raise ValueError("EarthquakeStart or EarthquakeEnd event not found in the DataFrame.")

    return start_rows.iloc[0]["Time"], end_rows.iloc[-1]["Time"]

def _prepare_table_cover_events(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the "EntryUnderTable" / "ExitUnderTable" rows, ordered by index."""
    event_names = ("EntryUnderTable", "ExitUnderTable")
    rows_per_event = [df[df["EventType"] == name] for name in event_names]
    # Entries and exits are stacked first, then interleaved again by index order.
    return pd.concat(rows_per_event).sort_index()

def _calculate_duration(start_time: float, end_time: float) -> float:
    """Return how much time elapsed from ``start_time`` to ``end_time``."""
    elapsed = end_time - start_time
    return elapsed

def _calculate_earthquake_overlap_duration(entry_time: float, exit_time: float, earthquake_start_time: float, earthquake_end_time: float) -> float:
    """Return the length of the overlap between a stay and the earthquake window.

    The stay is ``[entry_time, exit_time]`` and the earthquake is
    ``[earthquake_start_time, earthquake_end_time]``; disjoint intervals give 0.
    """
    overlap_start = max(entry_time, earthquake_start_time)
    overlap_end = min(exit_time, earthquake_end_time)
    overlap = overlap_end - overlap_start
    # A non-positive difference means the two intervals do not intersect.
    return overlap if overlap > 0 else 0

def get_table_cover_stats(df: pd.DataFrame, participant_id: str, group: str) -> list[str | float]:
    """Summarise one participant's stays under the table.

    Table events are taken in index order and paired positionally: rows 0 and 1
    form the first stay, rows 2 and 3 the second, and so on. A leftover
    unpaired row is reported with a printed message and otherwise ignored.

    Returns:
        ``[participant_id, group, cover_attempts, average_duration,
        total_duration, total_duration_during_earthquake,
        average_duration_during_earthquake]``. The four durations are the
        accumulated "Time" differences divided by 1000. Both averages divide by
        the total number of cover attempts. When the earthquake markers are
        missing, a message is printed and every statistic is 0.
    """
    try:
        quake_start, quake_end = _get_earthquake_times(df)
    except ValueError as error:
        print(f"Error for participant {participant_id} in group {group}: {error}")
        return [participant_id, group, 0, 0, 0, 0, 0]

    table_events = _prepare_table_cover_events(df)
    event_count = len(table_events)
    cover_attempts = event_count // 2

    total_time = 0
    total_time_during_quake = 0
    for pair_number in range(cover_attempts):
        entry_time = table_events.iloc[2 * pair_number]["Time"]
        exit_time = table_events.iloc[2 * pair_number + 1]["Time"]
        total_time += _calculate_duration(entry_time, exit_time)
        total_time_during_quake += _calculate_earthquake_overlap_duration(
            entry_time, exit_time, quake_start, quake_end
        )

    # An odd number of events leaves a final row with no partner.
    if event_count % 2 == 1:
        print(f"Error: Mismatched entry and exit under table events for participant {participant_id} in group {group}")

    average_time = total_time / cover_attempts if cover_attempts > 0 else 0
    average_time_during_quake = total_time_during_quake / cover_attempts if cover_attempts > 0 else 0

    return [
        participant_id,
        group,
        cover_attempts,
        average_time / 1000,
        total_time / 1000,
        total_time_during_quake / 1000,
        average_time_during_quake / 1000,
    ]



In [371]:
def get_table_cover_stats_for_group(group, data_file_paths, groups):
    """Build the per-participant table-cover statistics table for one group.

    Each file listed under ``data_file_paths[group]`` is cleaned and summarised;
    the participant ID is the file's base name up to its first dot.

    Returns:
        A DataFrame with one row per file, whose columns follow the order of
        the values returned by ``get_table_cover_stats``.
    """
    column_names = [
        "ID",
        "Group",
        "CoverAttempts",
        "AverageDurationInTableCover",
        "TotalDurationInTableCover",
        "TotalDurationInTableCoverDuringEarthquake",
        "AverageDurationInTableCoverDuringEarthquake",
    ]
    rows = []
    for file_path in data_file_paths[group]:
        cleaned = get_cleaned_data(file_path, groups)
        participant_id = os.path.basename(file_path).split(".")[0]
        rows.append(get_table_cover_stats(cleaned, participant_id, group))
    return pd.DataFrame(rows, columns=column_names)

def get_table_cover_stats_for_all_groups(data_file_paths, groups):
    """Stack the per-participant table-cover tables of every group into one DataFrame.

    Args:
        data_file_paths: Mapping of group name to that group's file paths.
        groups: Forwarded unchanged to the per-group builder.

    Returns:
        The concatenation of the per-group tables, in mapping order.
    """
    per_group_tables = [
        get_table_cover_stats_for_group(group, data_file_paths, groups)
        for group in data_file_paths
    ]
    return pd.concat(per_group_tables)

# Per-participant table-cover statistics for every group, one row per participant.
tt = get_table_cover_stats_for_all_groups(data_file_paths, groups)

In [372]:
# Per group, count the participants who never went under the table versus
# those who did at least once. size() counts rows directly, and fill_value=0
# keeps a group/category combination with no participants as an integer 0
# instead of NaN.
has_cover_attempt = tt["CoverAttempts"] > 0
tp = (
    tt.groupby(['Group', has_cover_attempt])
    .size()
    .unstack(fill_value=0)
    .rename(columns={False: 'No Cover Attempts', True: 'Cover Attempts'})
)
tp

CoverAttempts,No Cover Attempts,Cover Attempts
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
Group 1,8,12
Group 2,15,5
Group 3,2,14
Group 4,5,9


In [373]:
# Group-level means of the table-cover statistics; the ID column is dropped
# first so that only the statistic columns are averaged.
t = tt.drop(columns='ID')
t.groupby('Group').mean()

Unnamed: 0_level_0,CoverAttempts,AverageDurationInTableCover,TotalDurationInTableCover,TotalDurationInTableCoverDuringEarthquake,AverageDurationInTableCoverDuringEarthquake
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Group 1,1.35,6.080008,10.3758,9.19285,5.434333
Group 2,0.4,1.652567,4.2255,0.35135,0.1394
Group 3,1.4375,13.492419,18.141562,15.938375,11.910925
Group 4,0.928571,10.613143,14.280643,9.400071,6.467393


### Player sitting behaviour
* N_S: Number of times the player became seated (transition from False to True).
* N_U: Number of times the player became unseated (transition from True to False).
* N_Seated_Periods: Number of distinct periods the player was seated.
* A_D_Seated: Average duration of being seated.
* T_D_Seated: Total duration of being seated.
* A_D_Seated_E: Average duration of being seated during the Earthquake.
* T_D_Seated_E: Total duration of being seated during the Earthquake.

In [None]:
def get_seated_stats(df: pd.DataFrame, participant_id: str, group: str) -> list[str | float]:
    """Summarise how often and for how long one participant was seated.

    The log is walked in row order. A seated period opens on the first row
    whose "PlayerSeated" value is truthy and closes on the next row whose value
    is falsy; a period still open at the end is closed at the last row's time.

    Args:
        df: Event log with "EventType", "Time" and "PlayerSeated" columns.
        participant_id: Identifier copied into the result.
        group: Group name copied into the result.

    Returns:
        A 7-item list: ``[participant_id, group, seated_count,
        average_seated_duration, total_seated_duration,
        total_seated_duration_during_earthquake,
        average_seated_duration_during_earthquake]``. The four durations are
        "Time" differences divided by 1000 (seconds, assuming "Time" is logged
        in milliseconds). Both averages divide by the number of seated periods.
        When the earthquake markers are missing, a message is printed and every
        statistic is 0. The row keeps the same 7-item shape so it still fits
        the table built from these results.
    """
    try:
        earthquake_start_time, earthquake_end_time = _get_earthquake_times(df)
    except ValueError as e:
        print(f"Error for participant {participant_id} in group {group}: {e}")
        return [participant_id, group, 0, 0, 0, 0, 0]

    seated_periods = []  # (start_time, end_time) of every seated period
    period_start = None
    # NOTE(review): relies on PlayerSeated being parsed as booleans; a column of
    # strings would always be truthy. TODO confirm against the CSV files.
    for current_time, is_seated in zip(df["Time"], df["PlayerSeated"]):
        if is_seated and period_start is None:
            period_start = current_time
        elif not is_seated and period_start is not None:
            seated_periods.append((period_start, current_time))
            period_start = None

    # A participant still seated on the last row: close the period there.
    if period_start is not None:
        seated_periods.append((period_start, df["Time"].iloc[-1]))

    seated_count = len(seated_periods)
    total_seated_duration = sum(end - start for start, end in seated_periods)
    total_seated_duration_during_earthquake = sum(
        _calculate_earthquake_overlap_duration(start, end, earthquake_start_time, earthquake_end_time)
        for start, end in seated_periods
    )

    if seated_count > 0:
        average_seated_duration = total_seated_duration / seated_count
        average_seated_duration_during_earthquake = total_seated_duration_during_earthquake / seated_count
    else:
        average_seated_duration = 0
        average_seated_duration_during_earthquake = 0

    return [
        participant_id,
        group,
        seated_count,  # N_S
        average_seated_duration / 1000,
        total_seated_duration / 1000,
        total_seated_duration_during_earthquake / 1000,
        average_seated_duration_during_earthquake / 1000,
    ]

In [379]:
def get_seated_stats_for_group(group, data_file_paths, groups):
    """Build the per-participant seated-statistics table for one group.

    Each file listed under ``data_file_paths[group]`` is cleaned and summarised;
    the participant ID is the file's base name up to its first dot.

    Returns:
        A DataFrame with one row per file and the columns "ID", "Group",
        "SeatedCount" and the four seated-duration statistics.
    """
    column_names = [
        "ID",
        "Group",
        "SeatedCount",
        "AverageSeatedDuration",
        "TotalSeatedDuration",
        "TotalSeatedDurationDuringEarthquake",
        "AverageSeatedDurationDuringEarthquake",
    ]
    rows = []
    for file_path in data_file_paths[group]:
        cleaned = get_cleaned_data(file_path, groups)
        participant_id = os.path.basename(file_path).split(".")[0]
        rows.append(get_seated_stats(cleaned, participant_id, group))
    return pd.DataFrame(rows, columns=column_names)

def get_seated_stats_for_all_groups(data_file_paths, groups):
    """Stack the per-participant seated-statistics tables of every group into one DataFrame.

    Args:
        data_file_paths: Mapping of group name to that group's file paths.
        groups: Forwarded unchanged to the per-group builder.

    Returns:
        The concatenation of the per-group tables, in mapping order.
    """
    per_group_tables = [
        get_seated_stats_for_group(group, data_file_paths, groups)
        for group in data_file_paths
    ]
    return pd.concat(per_group_tables)

# Per-participant seated statistics for every group, one row per participant.
tt = get_seated_stats_for_all_groups(data_file_paths, groups)

In [380]:
# Group-level means of the seated statistics; the ID column is dropped first
# so that only the statistic columns are averaged.
t = tt.drop(columns='ID')
t.groupby('Group').mean()

Unnamed: 0_level_0,SeatedCount,AverageSeatedDuration,TotalSeatedDuration,TotalSeatedDurationDuringEarthquake,AverageSeatedDurationDuringEarthquake
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Group 1,10.55,3.949328,37.19725,28.5033,3.201339
Group 2,2.75,1.067111,8.9466,2.87925,0.351253
Group 3,11.875,6.98226,58.55925,42.483875,5.446376
Group 4,4.857143,4.160383,28.373286,12.714286,1.362099


In [382]:
# Show the per-participant seated statistics so individual rows can be checked
# against the group means.
tt

Unnamed: 0,ID,Group,SeatedCount,AverageSeatedDuration,TotalSeatedDuration,TotalSeatedDurationDuringEarthquake,AverageSeatedDurationDuringEarthquake
0,11_B22214,Group 1,0,0.000000,0.000,0.000,0.000000
1,13_B22112,Group 1,10,8.106100,81.061,50.649,5.064900
2,15_B22115,Group 1,8,4.997500,39.980,39.074,4.884250
3,17_B2294,Group 1,18,2.127944,38.303,27.274,1.515222
4,19_B22094,Group 1,7,6.498857,45.492,42.993,6.141857
...,...,...,...,...,...,...,...
9,66_B22114,Group 4,4,16.728500,66.914,0.000,0.000000
10,69_B24362,Group 4,14,4.720143,66.082,52.492,3.749429
11,71_B22121,Group 4,12,5.920083,71.041,58.090,4.840833
12,73_B20001,Group 4,1,7.609000,7.609,0.000,0.000000
