# 02. Feature Engineering & Dataset Merging

## Output Files:
1. `clustering_features.csv` - Model 1 (Behavior Clustering)
2. `pace_features.csv` - Model 3 (Pace Analysis)
3. `advice_context.csv` - Model 2 (Advice Generation)

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Paths
INTERIM_DIR = '../data/interim'
PROCESSED_DIR = '../data/processed'
os.makedirs(PROCESSED_DIR, exist_ok=True)

print("‚úÖ Environment Ready!")

‚úÖ Environment Ready!


---
# 1. LOAD CLEANED DATA

In [2]:
print("\n" + "="*60)
print("LOADING ALL CLEANED DATASETS")
print("="*60)

df_users = pd.read_csv(os.path.join(INTERIM_DIR, 'users_clean.csv'))
df_trackings = pd.read_csv(os.path.join(INTERIM_DIR, 'trackings_clean.csv'), 
                          parse_dates=['last_viewed', 'first_opened_at', 'completed_at'])
df_submissions = pd.read_csv(os.path.join(INTERIM_DIR, 'submissions_clean.csv'), 
                            parse_dates=['created_at'])
df_exam_res = pd.read_csv(os.path.join(INTERIM_DIR, 'exam_results_clean.csv'), 
                         parse_dates=['created_at'])
df_completions = pd.read_csv(os.path.join(INTERIM_DIR, 'completions_clean.csv'), 
                            parse_dates=['created_at', 'last_enrolled_at'])
df_journeys = pd.read_csv(os.path.join(INTERIM_DIR, 'journeys_clean.csv'))
df_tutorials = pd.read_csv(os.path.join(INTERIM_DIR, 'tutorials_clean.csv'))
df_exam_reg = pd.read_csv(os.path.join(INTERIM_DIR, 'exam_registrations_clean.csv'), 
                         parse_dates=['created_at', 'exam_finished_at'])

print("\nüìÇ Loaded Files:")
print(f"  Users: {len(df_users):,} rows")
print(f"  Trackings: {len(df_trackings):,} rows")
print(f"  Submissions: {len(df_submissions):,} rows")
print(f"  Exam Results: {len(df_exam_res):,} rows")
print(f"  Completions: {len(df_completions):,} rows")
print(f"  Journeys: {len(df_journeys):,} rows")
print(f"  Tutorials: {len(df_tutorials):,} rows")
print(f"  Exam Registrations: {len(df_exam_reg):,} rows")


LOADING ALL CLEANED DATASETS

üìÇ Loaded Files:
  Users: 31 rows
  Trackings: 101,689 rows
  Submissions: 2,262 rows
  Exam Results: 17,438 rows
  Completions: 1,032 rows
  Journeys: 176 rows
  Tutorials: 9,682 rows
  Exam Registrations: 16,759 rows


---
# 2. DATASET 1: CLUSTERING FEATURES (MODEL 1)

**Required Features**:
1. `avg_study_hour` - From trackings.last_viewed
2. `study_consistency_std` - Std of study gaps
3. `completion_speed` - study_duration / hours_to_study
4. `avg_exam_score` - From exam_results
5. `submission_fail_rate` - From submissions.status
6. `retry_count` - From completions.enrolling_times

In [3]:
print("\n" + "="*60)
print("BUILDING DATASET 1: CLUSTERING FEATURES")
print("="*60)

# === 2.1 Aggregate Trackings ===
print("\nüîß Aggregating Trackings...")

tracking_agg = df_trackings.groupby(['developer_id', 'journey_id']).agg({
    'tutorial_id': 'count',
    'last_viewed': ['min', 'max'],
    'completed_at': 'count',
}).reset_index()

tracking_agg.columns = ['developer_id', 'journey_id', 'total_modules_viewed', 
                        'first_activity', 'last_activity', 'completed_modules']

# Feature 1: avg_study_hour
temp_hours = df_trackings.dropna(subset=['last_viewed']).copy()
temp_hours['hour'] = temp_hours['last_viewed'].dt.hour
hour_agg = temp_hours.groupby(['developer_id', 'journey_id'])['hour'].mean().reset_index()
hour_agg.columns = ['developer_id', 'journey_id', 'avg_study_hour']
tracking_agg = tracking_agg.merge(hour_agg, on=['developer_id', 'journey_id'], how='left')

# Feature 2: study_consistency_std (as per design doc)
temp_dates = df_trackings.dropna(subset=['last_viewed']).copy()
temp_dates['date'] = temp_dates['last_viewed'].dt.date
temp_dates = temp_dates.sort_values(['developer_id', 'journey_id', 'date'])

def calculate_consistency_std(group):
    unique_dates = sorted(group['date'].unique())
    if len(unique_dates) <= 1:
        return 0
    gaps = [(unique_dates[i] - unique_dates[i-1]).days for i in range(1, len(unique_dates))]
    return np.std(gaps) if gaps else 0

consistency = temp_dates.groupby(['developer_id', 'journey_id']).apply(
    calculate_consistency_std
).reset_index()
consistency.columns = ['developer_id', 'journey_id', 'study_consistency_std']
tracking_agg = tracking_agg.merge(consistency, on=['developer_id', 'journey_id'], how='left')

# Also keep ratio for additional insight
temp_dates_ratio = df_trackings.dropna(subset=['last_viewed']).copy()
temp_dates_ratio['date'] = temp_dates_ratio['last_viewed'].dt.date
consistency_ratio = temp_dates_ratio.groupby(['developer_id', 'journey_id'])['date'].apply(
    lambda x: x.nunique() / ((x.max() - x.min()).days + 1) if (x.max() - x.min()).days > 0 else 1
).reset_index()
consistency_ratio.columns = ['developer_id', 'journey_id', 'study_consistency_ratio']
tracking_agg = tracking_agg.merge(consistency_ratio, on=['developer_id', 'journey_id'], how='left')

print(f"   ‚úì Tracking features: {tracking_agg.shape}")
print(f"   ‚úì Created: avg_study_hour, study_consistency_std, study_consistency_ratio")


BUILDING DATASET 1: CLUSTERING FEATURES

üîß Aggregating Trackings...
   ‚úì Tracking features: (2013, 9)
   ‚úì Created: avg_study_hour, study_consistency_std, study_consistency_ratio


In [4]:
# === 2.2 Aggregate Submissions ===
print("\nüîß Aggregating Submissions...")

df_submissions['is_passed'] = df_submissions['status'].apply(
    lambda x: 1 if x in ['passed', 'approved'] else 0 if pd.notna(x) else np.nan
)

submission_agg = df_submissions.groupby(['submitter_id', 'journey_id']).agg({
    'rating': 'mean',
    'is_passed': ['mean', 'sum', 'count'],
    'submission_duration': 'mean'
}).reset_index()

submission_agg.columns = ['developer_id', 'journey_id', 'avg_submission_rating', 
                          'submission_pass_rate', 'submissions_passed', 'total_submissions',
                          'avg_submission_duration']

submission_agg['submission_fail_count'] = submission_agg['total_submissions'] - submission_agg['submissions_passed']
submission_agg['submission_fail_rate'] = submission_agg['submission_fail_count'] / submission_agg['total_submissions']

# Handle division by zero
submission_agg['submission_fail_rate'] = submission_agg['submission_fail_rate'].replace([np.inf, -np.inf], 0)

print(f"   ‚úì Submission features: {submission_agg.shape}")


üîß Aggregating Submissions...
   ‚úì Submission features: (549, 9)


In [5]:
# === 2.3 Aggregate Exam Results ===
print("\nüîß Aggregating Exam Results...")

exam_full = df_exam_res.merge(df_exam_reg[['id', 'examinees_id', 'tutorial_id']], 
                               left_on='exam_registration_id', right_on='id', how='left')
exam_full = exam_full.merge(df_tutorials[['id', 'developer_journey_id']], 
                            left_on='tutorial_id', right_on='id', how='left', suffixes=('', '_tutorial'))

exam_agg = exam_full.groupby(['examinees_id', 'developer_journey_id']).agg({
    'score': 'mean',
    'is_passed': ['mean', 'sum', 'count']
}).reset_index()

exam_agg.columns = ['developer_id', 'journey_id', 'avg_exam_score', 
                    'exam_pass_rate', 'exams_passed', 'total_exams']
exam_agg['exam_fail_count'] = exam_agg['total_exams'] - exam_agg['exams_passed']

print(f"   ‚úì Exam features: {exam_agg.shape}")


üîß Aggregating Exam Results...
   ‚úì Exam features: (1352, 7)


In [6]:
# === 2.4 Process Completions ===
print("\nüîß Processing Completions...")

completion_features = df_completions[['user_id', 'journey_id', 'study_duration', 
                                       'enrolling_times', 'avg_submission_rating']].copy()
completion_features.columns = ['developer_id', 'journey_id', 'study_duration', 
                               'retry_count', 'completion_avg_rating']

print(f"   ‚úì Completion features: {completion_features.shape}")


üîß Processing Completions...
   ‚úì Completion features: (1032, 5)


In [7]:
# === 2.5 Merge All Features ===
print("\nüîß Merging all features...")

clustering_df = tracking_agg.copy()
clustering_df = clustering_df.merge(submission_agg, on=['developer_id', 'journey_id'], how='left')
clustering_df = clustering_df.merge(exam_agg, on=['developer_id', 'journey_id'], how='left')
clustering_df = clustering_df.merge(completion_features, on=['developer_id', 'journey_id'], how='left')
clustering_df = clustering_df.merge(df_journeys[['id', 'name', 'difficulty', 'hours_to_study']], 
                                   left_on='journey_id', right_on='id', how='left', suffixes=('', '_journey'))

print(f"   ‚úì Merged dataset shape: {clustering_df.shape}")
print(f"   ‚úì Unique users: {clustering_df['developer_id'].nunique()}")
print(f"   ‚úì Unique journeys: {clustering_df['journey_id'].nunique()}")


üîß Merging all features...
   ‚úì Merged dataset shape: (2013, 28)
   ‚úì Unique users: 31
   ‚úì Unique journeys: 181


In [8]:
# === 2.6 Feature Engineering - Derived Features ===
print("\nüîß Creating derived features...")

# 1. Completion speed (infinity-safe)
clustering_df['completion_speed'] = np.where(
    (clustering_df['hours_to_study'] > 0) & (clustering_df['study_duration'].notna()),
    clustering_df['study_duration'] / clustering_df['hours_to_study'],
    np.nan
)
clustering_df['completion_speed'] = clustering_df['completion_speed'].clip(upper=10)

# 2. Performance score (composite)
clustering_df['performance_score'] = (
    clustering_df['avg_exam_score'].fillna(0) * 0.4 + 
    clustering_df['avg_submission_rating'].fillna(0) * 20 * 0.6
)

# 3. Struggle score (composite)
clustering_df['struggle_score'] = (
    clustering_df['exam_fail_count'].fillna(0) + 
    clustering_df['submission_fail_count'].fillna(0) * 2
)

# 4. Speed Category Binning
clustering_df['speed_category'] = pd.cut(
    clustering_df['completion_speed'],
    bins=[0, 0.7, 1.3, float('inf')],
    labels=['Fast (< 70%)', 'Normal (70-130%)', 'Slow (> 130%)']
)

# 5. Study Time Slot Binning
clustering_df['study_time_slot'] = pd.cut(
    clustering_df['avg_study_hour'], 
    bins=[0, 6, 12, 18, 24], 
    labels=['Night (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)']
)

# 6. Performance Level Binning
clustering_df['performance_level'] = pd.cut(
    clustering_df['performance_score'],
    bins=[0, 40, 70, 100],
    labels=['Low', 'Medium', 'High']
)

print("‚úÖ Derived features created (infinity-safe)")
print(f"   completion_speed range: {clustering_df['completion_speed'].min():.2f} - {clustering_df['completion_speed'].max():.2f}")


üîß Creating derived features...
‚úÖ Derived features created (infinity-safe)
   completion_speed range: 0.00 - 10.00


In [9]:
# === 2.7 Comprehensive NaN Handling ===
print("\nüîß Comprehensive NaN handling for ALL columns...")

# A. Remove rows with missing journey info
rows_before = len(clustering_df)
clustering_df = clustering_df.dropna(subset=['id', 'name'])
rows_after = len(clustering_df)
print(f"   ‚úì Removed {rows_before - rows_after} rows with missing journey info")

# B. Count/failure columns: fill with 0
print("\n   üîß Filling count columns with 0...")
count_cols = [
    'submission_fail_count', 'submissions_passed', 'total_submissions',
    'exams_passed', 'total_exams', 'exam_fail_count',
    'retry_count', 'completed_modules', 'total_modules_viewed'
]

for col in count_cols:
    if col in clustering_df.columns:
        nan_before = clustering_df[col].isnull().sum()
        if nan_before > 0:
            clustering_df[col] = clustering_df[col].fillna(0)
            print(f"      {col}: filled {nan_before} NaN ‚Üí 0")

# C. Rate columns: fill with 0
print("\n   üîß Filling rate columns with 0...")
rate_cols = ['submission_pass_rate', 'submission_fail_rate', 'exam_pass_rate']

for col in rate_cols:
    if col in clustering_df.columns:
        nan_before = clustering_df[col].isnull().sum()
        if nan_before > 0:
            clustering_df[col] = clustering_df[col].fillna(0)
            print(f"      {col}: filled {nan_before} NaN ‚Üí 0")

# D. Rating/duration columns: fill with median
print("\n   üîß Filling rating columns with median...")
duration_rating_cols = [
    'avg_submission_rating', 'avg_submission_duration',
    'completion_avg_rating', 'study_duration'
]

for col in duration_rating_cols:
    if col in clustering_df.columns:
        nan_before = clustering_df[col].isnull().sum()
        if nan_before > 0:
            if clustering_df[col].notna().any():
                fill_value = clustering_df[col].median()
                clustering_df[col] = clustering_df[col].fillna(fill_value)
                print(f"      {col}: filled {nan_before} NaN ‚Üí median ({fill_value:.2f})")
            else:
                clustering_df[col] = clustering_df[col].fillna(0)
                print(f"      {col}: filled {nan_before} NaN ‚Üí 0 (no valid data)")

# E. Other numeric columns
print("\n   üîß Filling other numeric columns...")
other_numeric_cols = [
    'avg_exam_score', 'avg_study_hour', 'study_consistency_std', 
    'study_consistency_ratio', 'performance_score', 'struggle_score',
    'completion_speed', 'difficulty', 'hours_to_study'
]

for col in other_numeric_cols:
    if col in clustering_df.columns:
        nan_before = clustering_df[col].isnull().sum()
        if nan_before > 0:
            if clustering_df[col].notna().any():
                fill_value = clustering_df[col].median()
                clustering_df[col] = clustering_df[col].fillna(fill_value)
                print(f"      {col}: filled {nan_before} NaN ‚Üí median ({fill_value:.2f})")
            else:
                clustering_df[col] = clustering_df[col].fillna(0)
                print(f"      {col}: filled {nan_before} NaN ‚Üí 0")

# F. Categorical columns  
print("\n   üîß Filling categorical columns...")
categorical_cols = ['study_time_slot', 'performance_level', 'speed_category']

for col in categorical_cols:
    if col in clustering_df.columns:
        nan_before = clustering_df[col].isnull().sum()
        if nan_before > 0:
            mode_values = clustering_df[col].dropna().mode()
            if not mode_values.empty:
                fill_value = mode_values[0]
                clustering_df[col] = clustering_df[col].fillna(fill_value)
                print(f"      {col}: filled {nan_before} NaN ‚Üí mode ('{fill_value}')")
            else:
                clustering_df[col] = clustering_df[col].fillna('Unknown')
                print(f"      {col}: filled {nan_before} NaN ‚Üí 'Unknown'")


üîß Comprehensive NaN handling for ALL columns...
   ‚úì Removed 5 rows with missing journey info

   üîß Filling count columns with 0...
      submission_fail_count: filled 1459 NaN ‚Üí 0
      submissions_passed: filled 1459 NaN ‚Üí 0
      total_submissions: filled 1459 NaN ‚Üí 0
      exams_passed: filled 661 NaN ‚Üí 0
      total_exams: filled 661 NaN ‚Üí 0
      exam_fail_count: filled 661 NaN ‚Üí 0
      retry_count: filled 976 NaN ‚Üí 0

   üîß Filling rate columns with 0...
      submission_pass_rate: filled 1459 NaN ‚Üí 0
      submission_fail_rate: filled 1459 NaN ‚Üí 0
      exam_pass_rate: filled 661 NaN ‚Üí 0

   üîß Filling rating columns with median...
      avg_submission_rating: filled 1459 NaN ‚Üí median (2.62)
      avg_submission_duration: filled 1459 NaN ‚Üí median (180.00)
      completion_avg_rating: filled 976 NaN ‚Üí median (4.00)
      study_duration: filled 976 NaN ‚Üí median (16.00)

   üîß Filling other numeric columns...
      avg_exam_score: filled

In [10]:
# === 2.8 Final Validation ===
print("\n" + "="*60)
print("FINAL VALIDATION - CLUSTERING DATASET")
print("="*60)

# Core features for clustering
core_features = [
    'avg_study_hour',
    'study_consistency_std', 
    'completion_speed',
    'avg_exam_score',
    'submission_fail_rate',
    'retry_count'
]

print("\nüîç Checking 6 core clustering features:")
all_clean = True
for feature in core_features:
    if feature in clustering_df.columns:
        nan_count = clustering_df[feature].isnull().sum()
        inf_count = np.isinf(clustering_df[feature]).sum()
        
        if nan_count == 0 and inf_count == 0:
            print(f"   ‚úÖ {feature}: Clean")
        else:
            print(f"   ‚ùå {feature}: {nan_count} NaN, {inf_count} infinity")
            all_clean = False
    else:
        print(f"   ‚ùå {feature}: Column not found!")
        all_clean = False

if all_clean:
    print("\nüéâ ALL CORE FEATURES ARE READY FOR CLUSTERING!")
    print(f"   Dataset shape: {clustering_df.shape}")
    
    # Show descriptive statistics
    print("\nüìä Descriptive statistics for core features:")
    display(clustering_df[core_features].describe())
else:
    print("\n‚ö†Ô∏è  WARNING: Some core features have issues!")

# Duplicate check
print("\nüîç DUPLICATE CHECK:")
duplicates = clustering_df[clustering_df.duplicated(subset=['developer_id', 'journey_id'], keep=False)]
if len(duplicates) > 0:
    print(f"   ‚ö†Ô∏è  Found {len(duplicates)} duplicate user-journey pairs!")
    display(duplicates[['developer_id', 'journey_id', 'name']].head())
else:
    print("   ‚úÖ No duplicates found")


FINAL VALIDATION - CLUSTERING DATASET

üîç Checking 6 core clustering features:
   ‚úÖ avg_study_hour: Clean
   ‚úÖ study_consistency_std: Clean
   ‚úÖ completion_speed: Clean
   ‚úÖ avg_exam_score: Clean
   ‚úÖ submission_fail_rate: Clean
   ‚úÖ retry_count: Clean

üéâ ALL CORE FEATURES ARE READY FOR CLUSTERING!
   Dataset shape: (2008, 34)

üìä Descriptive statistics for core features:


Unnamed: 0,avg_study_hour,study_consistency_std,completion_speed,avg_exam_score,submission_fail_rate,retry_count
count,2008.0,2008.0,2008.0,2008.0,2008.0,2008.0
mean,13.815199,63.58396,1.076897,81.635622,0.273406,0.637948
std,3.901394,102.130422,1.932933,12.8863,0.445819,0.776392
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.47408,0.0,0.533333,79.294201,0.0,0.0
50%,13.919643,5.857738,0.56,83.846154,0.0,1.0
75%,16.387234,97.251981,0.590217,88.571429,1.0,1.0
max,23.0,761.817709,10.0,100.0,1.0,9.0



üîç DUPLICATE CHECK:
   ‚úÖ No duplicates found


In [11]:
# === 2.9 Select Final Columns & Save ===
clustering_final_cols = [
    'developer_id', 'journey_id', 'name',
    'avg_study_hour', 'study_consistency_std', 'study_consistency_ratio',
    'completed_modules', 'total_modules_viewed',
    'avg_exam_score', 'exam_pass_rate', 'exam_fail_count',
    'avg_submission_rating', 'submission_pass_rate', 'submission_fail_count', 'submission_fail_rate',
    'completion_speed', 'retry_count',
    'performance_score', 'struggle_score',
    'study_time_slot', 'performance_level', 'speed_category', 'difficulty'
]

clustering_final = clustering_df[clustering_final_cols].copy()

print(f"\n‚úÖ Final clustering dataset: {clustering_final.shape}")

# Save
output_path = os.path.join(PROCESSED_DIR, 'clustering_features.csv')
clustering_final.to_csv(output_path, index=False)
print(f"üíæ Saved: {output_path}")


‚úÖ Final clustering dataset: (2008, 23)
üíæ Saved: ../data/processed\clustering_features.csv


---
# 3. DATASET 2: PACE FEATURES (MODEL 3)

In [12]:
print("\n" + "="*60)
print("BUILDING DATASET 2: PACE FEATURES")
print("="*60)

pace_cols = [
    'developer_id', 'journey_id', 'name', 'difficulty', 'hours_to_study',
    'study_duration', 'completion_speed',
    'completed_modules', 'total_modules_viewed',
    'avg_study_hour', 'study_consistency_std', 'study_consistency_ratio'
]

pace_df = clustering_df[pace_cols].dropna(subset=['study_duration']).copy()

# Calculate percentile rank per journey
print("üîß Calculating percentile ranks per journey...")
for journey_id in pace_df['journey_id'].unique():
    mask = pace_df['journey_id'] == journey_id
    pace_df.loc[mask, 'speed_percentile'] = pace_df.loc[mask, 'study_duration'].rank(pct=True) * 100

# Speed category
pace_df['speed_category'] = pd.cut(
    pace_df['completion_speed'],
    bins=[0, 0.7, 1.3, float('inf')],
    labels=['Fast (< 70%)', 'Normal (70-130%)', 'Slow (> 130%)']
)

print(f"‚úÖ Pace dataset: {pace_df.shape}")

# Save
output_path = os.path.join(PROCESSED_DIR, 'pace_features.csv')
pace_df.to_csv(output_path, index=False)
print(f"üíæ Saved: {output_path}")


BUILDING DATASET 2: PACE FEATURES
üîß Calculating percentile ranks per journey...
‚úÖ Pace dataset: (2008, 14)
üíæ Saved: ../data/processed\pace_features.csv


---
# 4. DATASET 3: ADVICE CONTEXT (MODEL 2)

In [13]:
print("\n" + "="*60)
print("BUILDING DATASET 3: ADVICE CONTEXT")
print("="*60)

# === 4.1 Find Stuck Tutorial (As per design doc requirement) ===
print("üîß Finding stuck tutorials (last incomplete tutorial)...")

stuck_tutorials = df_trackings[df_trackings['completed_at'].isnull()].copy()
stuck_tutorials = stuck_tutorials.sort_values('last_viewed', ascending=False)
stuck_tutorials = stuck_tutorials.groupby(['developer_id', 'journey_id']).first().reset_index()
stuck_tutorials = stuck_tutorials[['developer_id', 'journey_id', 'tutorial_id']]
stuck_tutorials.columns = ['developer_id', 'journey_id', 'stuck_tutorial_id']

print(f"   ‚úì Found {len(stuck_tutorials)} users with stuck info")

# === 4.2 Create Advice Context ===
advice_cols = [
    'developer_id', 'journey_id', 'name',
    'avg_study_hour', 'study_time_slot',
    'avg_exam_score', 'exam_fail_count',
    'avg_submission_rating', 'submission_fail_count',
    'completion_speed',
    'performance_level', 'struggle_score'
]

advice_df = clustering_df[advice_cols].copy()

# Add speed_category from clustering_df (now it exists!)
advice_df['speed_category'] = clustering_df['speed_category']

# Add user display name
advice_df = advice_df.merge(df_users[['id', 'display_name']], 
                           left_on='developer_id', right_on='id', how='left')

# Add stuck tutorial info
advice_df = advice_df.merge(stuck_tutorials, on=['developer_id', 'journey_id'], how='left')

# Placeholders for model outputs
advice_df['cluster_label'] = None  # Will be populated by Model 1
advice_df['pace_insight'] = None   # Will be populated by Model 3

print(f"‚úÖ Advice context dataset: {advice_df.shape}")
print(f"   ‚úì Users with display names: {advice_df['display_name'].notna().sum()}")
print(f"   ‚úì Users with stuck info: {advice_df['stuck_tutorial_id'].notna().sum()}")

# Save
output_path = os.path.join(PROCESSED_DIR, 'advice_context.csv')
advice_df.to_csv(output_path, index=False)
print(f"üíæ Saved: {output_path}")


BUILDING DATASET 3: ADVICE CONTEXT
üîß Finding stuck tutorials (last incomplete tutorial)...
   ‚úì Found 1161 users with stuck info
‚úÖ Advice context dataset: (2008, 18)
   ‚úì Users with display names: 2008
   ‚úì Users with stuck info: 1161
üíæ Saved: ../data/processed\advice_context.csv
