In [1]:
# Enable autoreload so edits to imported project modules (e.g. utils) are
# picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Process UCSD Data

In [4]:
from utils import Dataset

# Configure the UCSD dataset; "final_grade" is passed as the name of the
# column that holds the final grade.
ucsd_ds = Dataset(
    name="ucsd",
    final_grade_column="final_grade",
)

In [5]:
# Run the processing step and keep the result; later cells read its "label"
# column and write it out with to_csv.
ucsd_data = ucsd_ds.process()

  0%|          | 0/12 [00:00<?, ?it/s]

Total number of students in the dataset: 183
Excluded 20 students without final grade


In [6]:
# Inspect the class balance of the "label" column (labels 0 and 1 in the
# recorded run).
ucsd_data["label"].value_counts()

0    117
1     66
Name: label, dtype: int64

In [7]:
# Make sure the output directory for the processed CSV files exists.
# exist_ok=True makes the call a no-op when the directory is already there,
# so the cell is idempotent and has no check-then-create race.
import os

os.makedirs("../data/processed", exist_ok=True)

In [8]:
# Persist the processed UCSD frame; index=False keeps the row index out of
# the CSV.
ucsd_data.to_csv("../data/processed/ucsd.csv", index=False)

# Process UCLA Session A and C

In [9]:
# Process UCLA session A. The final-grade column passed here is
# "grade_letter", whereas the UCSD dataset above uses "final_grade".
ucla_session_a = Dataset(
    name="session_a",
    final_grade_column="grade_letter",
)
ucla_session_a_data = ucla_session_a.process()

  0%|          | 0/12 [00:00<?, ?it/s]

Total number of students in the dataset: 60
Excluded 5 students without final grade


In [10]:
# Process UCLA session C with the same settings as session A (final-grade
# column "grade_letter").
ucla_session_c = Dataset(
    name="session_c",
    final_grade_column="grade_letter",
)
ucla_session_c_data = ucla_session_c.process()

  0%|          | 0/12 [00:00<?, ?it/s]

Total number of students in the dataset: 75
Excluded 4 students without final grade


In [11]:
# Save each session to its own CSV (the merged UCLA file is written in a
# later cell).
ucla_session_a_data.to_csv("../data/processed/ucla_session_a.csv", index=False)
ucla_session_c_data.to_csv("../data/processed/ucla_session_c.csv", index=False)

In [12]:
import pandas as pd

# Merge the two sessions into a single UCLA frame. ignore_index=True gives
# the combined frame a fresh 0..n-1 index; without it the row labels of both
# session frames are kept as-is and can repeat, which makes label-based
# lookups (.loc) on the merged frame ambiguous.
ucla_data = pd.concat(
    [ucla_session_a_data, ucla_session_c_data], ignore_index=True
)

In [13]:
# Inspect the class balance of the merged UCLA data.
ucla_data["label"].value_counts()

0    91
1    44
Name: label, dtype: int64

In [14]:
# Persist the merged UCLA frame (sessions A and C combined).
ucla_data.to_csv("../data/processed/ucla.csv", index=False)

# Generate Datasets for Multi-Class Classification

In [9]:
import pandas as pd
from utils import Dataset


def get_label_statistics(data, label_column="label"):
    """
    Summarise how often each label value occurs in a dataframe.

    Args:
        data (pd.DataFrame): Frame that contains the label column.
        label_column (str): Column whose distinct values are tallied
            (default: 'label').

    Returns:
        pd.DataFrame: Indexed by label value, with a ``counts`` column of
        occurrences and a ``ratios`` column holding each label's share of
        all rows in ``data``, formatted as a percentage string with two
        decimals.
    """
    total_rows = len(data)
    occurrences = data[label_column].value_counts()

    def as_percent(fraction):
        # Render a fraction such as 0.25 as "25.00%".
        return f"{fraction*100:.2f}%"

    # Shares are relative to the full row count of the frame.
    shares = (occurrences / total_rows).apply(as_percent)

    return pd.DataFrame({"counts": occurrences, "ratios": shares})

In [3]:
# Re-process the UCSD dataset with multi_class=True.
# NOTE: this rebinds ucsd_ds and ucsd_data, replacing the objects created in
# the first section, so any cell using those names depends on which section
# ran last.
ucsd_ds = Dataset(
    name="ucsd",
    final_grade_column="final_grade",
    multi_class=True,
)

ucsd_data = ucsd_ds.process()

  0%|          | 0/12 [00:00<?, ?it/s]

Total number of students in the dataset: 183
Excluded 20 students without final grade


In [10]:
# Per-label counts and percentage shares for the multi-class UCSD data.
get_label_statistics(ucsd_data)

Unnamed: 0_level_0,counts,ratios
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,117,63.93%
2,42,22.95%
1,24,13.11%


In [11]:
# Written to its own file so the ucsd.csv saved in the first section is not
# overwritten.
ucsd_data.to_csv("../data/processed/ucsd_multi_class.csv", index=False)

In [12]:
# Re-process UCLA session A with multi_class=True.
# NOTE: this rebinds ucla_session_a and ucla_session_a_data, replacing the
# objects created in the "Process UCLA Session A and C" section.
ucla_session_a = Dataset(
    name="session_a",
    final_grade_column="grade_letter",
    multi_class=True,
)
ucla_session_a_data = ucla_session_a.process()

  0%|          | 0/12 [00:00<?, ?it/s]

Total number of students in the dataset: 60
Excluded 5 students without final grade


In [13]:
# Per-label counts and percentage shares for the multi-class session A data.
get_label_statistics(ucla_session_a_data)

Unnamed: 0_level_0,counts,ratios
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,41,68.33%
1,10,16.67%
2,9,15.00%


In [14]:
# Written to its own file so the ucla_session_a.csv saved earlier is not
# overwritten.
ucla_session_a_data.to_csv(
    "../data/processed/ucla_session_a_multi_class.csv", index=False
)