In [11]:
import pandas as pd
import re
pd.set_option('display.max_rows', 250)  # Show up to 100 rows

# Read the text file into a DataFrame
# The file has an extra tab between Title and Timestamp, creating an empty column
df = pd.read_csv('subjects.txt', sep='\t', header=None, 
                 names=['ID', 'Status', 'Title', 'Empty', 'Timestamp'])

# Drop the empty column
df = df.drop('Empty', axis=1)

# Parse the Title column to extract Assignment and Student Name
# Pattern: <assignment> by <student_name>
df['Assignment'] = df['Title'].str.extract(r'^(.+?)\s+[Bb]y\s+', expand=False)
df['Student_Name'] = df['Title'].str.extract(r'\s+[Bb]y\s+(.+?)$', expand=False)

# Remove the Title column
df = df.drop('Title', axis=1)

# Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Split assignments that contain "and" into separate rows
# Extract all Day patterns from the Assignment column
df['Assignment_List'] = df['Assignment'].str.findall(r'Day\s*\d+', flags=re.IGNORECASE)

# Explode the list so each assignment gets its own row
df = df.explode('Assignment_List')

# Use the exploded assignment list as the Assignment_Key
df['Assignment_Key'] = df['Assignment_List']

# Normalize to Day0x format (e.g., "Day08")
df['Assignment_Key'] = df['Assignment_Key'].str.replace(r'\s+', '', regex=True).str.capitalize().str.replace(r'Day(\d)$', r'Day0\1', regex=True)

# Drop the temporary Assignment_List column
df = df.drop('Assignment_List', axis=1)

# Define deadlines - first assignment: 2025-11-01 22:00, then every week at the same time
from datetime import datetime, timedelta
import pytz
base_deadline = datetime(2025, 11, 8, 22, 0, 0, tzinfo=pytz.UTC)
deadlines = {f'Day{i:02d}': base_deadline + timedelta(weeks=i-1) for i in range(1, 9)}

# Map deadline for each assignment (ignore Final Project)
df['Deadline'] = df['Assignment_Key'].map(deadlines)

# Check if submission was in time (before or equal to deadline)
df['Submission_In_Time'] = df['Timestamp'] <= df['Deadline']


df.head()

Unnamed: 0,ID,Status,Timestamp,Assignment,Student_Name,Assignment_Key,Deadline,Submission_In_Time
0,213,OPEN,2026-01-03 18:44:38+00:00,Day08,Shoshana Sernik,Day08,2025-12-27 22:00:00+00:00,False
1,212,OPEN,2026-01-02 15:59:16+00:00,Final Project proposal,Achinoam Shoham,,NaT,False
2,211,OPEN,2026-01-01 15:20:10+00:00,Day08,Einav Litvak,Day08,2025-12-27 22:00:00+00:00,False
3,210,OPEN,2025-12-30 18:03:43+00:00,Day08,Achinoam Shoham,Day08,2025-12-27 22:00:00+00:00,False
4,209,OPEN,2025-12-30 16:26:42+00:00,Day08,Noya Levy,Day08,2025-12-27 22:00:00+00:00,False


In [None]:
# Check for data quality issues
print(f"Total rows in df (after explode): {len(df)}")
print(f"Unique students: {df['Student_Name'].nunique()}")
print(f"\nStudent name value counts:")
print(df['Student_Name'].value_counts().head(20))
print(f"\nSample of student names:")
print(df['Student_Name'].unique()[:10])

(215, 9)