# Loading dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.impute import KNNImputer

from ctgan import CTGAN

In [None]:
# Label column used to build the ML field list below.
# NOTE(review): `target_column` is reassigned to 'dropoutStatus' later in this
# notebook, and `fieldKeys` / `ml_fields` are not referenced again in the
# visible cells — confirm this cell is still needed.
target_column = 'dropoutFlag'

# Input feature names, in the order they are expected downstream.
fieldKeys = [
    'age', 'attendancePercentage', 'lateSubmissionCount', 'cgpa',
    'previousYearPerformance', 'mathScore', 'englishScore', 'scienceScore',
    'projectScore', 'totalMarks', 'feesPaid', 'libraryDues',
    'sportsScore', 'behaviorScore', 'scholarshipEligibility', 'specialNeedsFlag',
]

# All input features followed by the label column.
ml_fields = [*fieldKeys, target_column]
ml_fields

# Loading data and preliminary analysis

In [None]:
# Load the raw dataset from the repository's data folder (path is relative to
# the notebook's working directory).
real_data = pd.read_csv('../data/real_data.csv')

# Quick sanity check: column labels and (rows, columns) of the loaded frame.
print(f'Columns: {real_data.columns}')
print(f'Shape: {real_data.shape}')

In [None]:
# Preview the first rows of the raw data.
real_data.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
real_data.info()

In [None]:
# Summary statistics of the raw data.
real_data.describe()

In [None]:
# Count missing values per column of the raw data.
# Author's note on the result: no imputation needed.
real_data.isna().sum()

# Data Cleaning

## Dropping unwanted features

In [None]:
# Keep only the columns the rest of the notebook works with; the raw frame
# itself is left untouched.
required_data = real_data.drop(
    columns=[
        'Marital status',
        'Application mode',
        'Application order',
        'Unemployment rate',
        'Inflation rate',
        'GDP',
        'Nacionality',
        'Course',
    ]
)

## Formatting features

In [None]:
# Map the raw dataset headers onto the camelCase names used from here on.
rename_map = {
    'Daytime/evening attendance': 'studyMode',
    'Previous qualification': 'previousEducation',
    'Displaced': 'displacedStatus',
    'Educational special needs': 'specialNeeds',
    'Debtor': 'debtor',
    'Tuition fees up to date': 'feesStatus',
    'Gender': 'gender',
    'Scholarship holder': 'scholarShipStatus',
    'Age at enrollment': 'ageAtEnrollment',
    'International': 'international',
    "Mother's qualification": 'motherEducation',
    "Father's qualification": 'fatherEducation',
    "Father's occupation": 'fatherOccupation',
    "Mother's occupation": 'motherOccupation',
    'Target': 'dropoutStatus',
}

# Columns that do not appear in the map keep their original names.
required_data = required_data.rename(columns=rename_map)

required_data.columns

## Segregation into separate categories

In [None]:
# Label column of the cleaned dataset.
target_column = 'dropoutStatus'

# Features known at admission time.
admission_features = [
    'studyMode',
    'previousEducation',
    'motherEducation',
    'fatherEducation',
    'motherOccupation',
    'fatherOccupation',
    'displacedStatus',
    'specialNeeds',
    'gender',
    'international',
    'ageAtEnrollment',
]

# Features describing the financial situation.
financial_features = ['debtor', 'feesStatus', 'scholarShipStatus']

# Every remaining column (neither admission, financial nor the target) is
# treated as an academic performance feature.
performance_features = list(
    required_data.drop(
        columns=[*admission_features, *financial_features, target_column]
    ).columns
)

print(f'Admission features: {admission_features}\ntotal: {len(admission_features)}')
print(f'financial features: {financial_features}\ntotal: {len(financial_features)}')
print(f'performance features: {performance_features}\ntotal: {len(performance_features)}')

## Feature engineering

In [None]:
# Parent feature engineering
# All admission features that describe the mother or the father.
parent_features = [
    name
    for name in admission_features
    if 'mother' in name or 'father' in name
]

# Row-wise maximum over the parents' education columns.
required_data['parentEducation'] = required_data.loc[
    :, [name for name in parent_features if 'Education' in name]
].max(axis=1)

# 1 when both parents have the same occupation value, 0 otherwise.
required_data['parentEmployentStatus'] = np.where(
    required_data['motherOccupation'].eq(required_data['fatherOccupation']),
    1,
    0,
)

In [None]:
# financial feature engineering
# feesPaid is 1 only when feesStatus is non-zero AND the student is not
# flagged as a debtor; in every other case it is 0.
required_data['feesPaid'] = np.where(
    (required_data['feesStatus'] != 0) & (required_data['debtor'] != 1),
    1,
    0,
)

In [None]:
# target
# Binarise the label: 'Graduate' and 'Enrolled' become 0, anything else 1.
# NOTE: not idempotent — re-running this cell on the already-binarised column
# would turn every row into 1.
required_data['dropoutStatus'] = np.where(
    required_data['dropoutStatus'].isin(['Graduate', 'Enrolled']),
    0,
    1,
)

required_data['dropoutStatus'].value_counts()

In [None]:
# Aggregate enrolled and approved
# A dedicated, seeded generator keeps the random fill-ins below reproducible
# across runs (the oversampling step in this notebook also pins random_state=42).
rng = np.random.default_rng(42)

# Units enrolled / approved over both semesters; missing counts are treated as 0.
required_data["totalCreditsEnrolled"] = (
    required_data["Curricular units 1st sem (enrolled)"].fillna(0) +
    required_data["Curricular units 2nd sem (enrolled)"].fillna(0)
).astype(float)

required_data["totalCreditsApproved"] = (
    required_data["Curricular units 1st sem (approved)"].fillna(0) +
    required_data["Curricular units 2nd sem (approved)"].fillna(0)
).astype(float)

# Flag rows with no enrolled units at all.
required_data['notEnrolled'] = (required_data['totalCreditsEnrolled'] == 0).astype(int)

# Rows that enrolled, passed nothing, and are still labelled 0 (non-dropout)
# are treated as inconsistent.
mask_invalid = (
    (required_data['totalCreditsEnrolled'] > 0) &
    (required_data['totalCreditsApproved'] == 0) &
    (required_data['dropoutStatus'] == 0)
)

# Filtering out invalid data. The explicit copy makes the filtered frame
# independent, so the .loc assignments below do not trigger
# SettingWithCopyWarning.
required_data = required_data[~mask_invalid].copy()

# Handling fail cases: enrolled in something but approved nothing.
mask_failed = (
    (required_data['totalCreditsEnrolled'] > 0) &
    (required_data['totalCreditsApproved'] == 0)
)

# Replace the zero approved total of failed rows with a small random value
# in [0.1, 1.0).
noise = rng.uniform(0.1, 1.0, mask_failed.sum()).round(2)

required_data.loc[mask_failed, 'totalCreditsApproved'] = pd.Series(
    noise, index=required_data.loc[mask_failed].index
)

# Weighted average grade (CGPA-like measure, 0–20 scale initially): semester
# grades weighted by the units approved in that semester. A zero denominator
# becomes NaN so the division yields NaN instead of inf.
required_data["cgpa"] = (
    (required_data["Curricular units 1st sem (grade)"].fillna(0) * required_data["Curricular units 1st sem (approved)"].fillna(0) +
     required_data["Curricular units 2nd sem (grade)"].fillna(0) * required_data["Curricular units 2nd sem (approved)"].fillna(0))
    /
    required_data["totalCreditsApproved"].replace(0, np.nan)
)

# Convert 0–20 → 0–10 scale
required_data["cgpa"] = (required_data["cgpa"] / 2).clip(lower=0, upper=10).round(2)

# Failed rows get a random low cgpa in [0, 0.9) instead of the computed value.
failing_cgpas = rng.uniform(0, 0.9, mask_failed.sum()).round(2)
required_data.loc[mask_failed, "cgpa"] = failing_cgpas

In [None]:
# Fill the missing cgpa values (rows whose approved total was 0 produce NaN in
# the weighted average) from the 3 nearest rows, weighted by inverse distance.
imputer = KNNImputer(weights='distance', n_neighbors=3)
imputed = imputer.fit_transform(
    required_data.loc[:, ['cgpa', 'totalCreditsApproved']]
)
required_data[['cgpa', 'totalCreditsApproved']] = np.round(imputed, 2)

In [None]:
# Frequency of each cgpa value after imputation.
required_data['cgpa'].value_counts()

In [None]:
# Inspect the engineered academic columns next to the label.
required_data[['cgpa','totalCreditsApproved', 'totalCreditsEnrolled', 'dropoutStatus', 'notEnrolled']]

## Cleaning up the format and removing features used for feature engineering

In [None]:
# Collect the per-semester academic performance columns; they are replaced by
# the aggregated features built above.
features = list(
    filter(lambda column: 'Curricular' in column, required_data.columns)
)
features

# Final clean data

In [None]:
# Final clean frame: drop the per-semester columns, the individual parent
# columns and the two raw financial flags that were folded into engineered
# features.
data = required_data.drop(
    columns=[*features, *parent_features, 'debtor', 'feesStatus']
)

In [None]:
# Remaining columns and (rows, columns) of the cleaned frame.
data.columns, data.shape

In [None]:
# Preview the first 10 rows of the cleaned frame.
data.head(10)

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
data.info()

In [None]:
# Persist the cleaned frame (before any class balancing) without the index column.
data.to_csv('../data/processed-real-data.csv', index=False)

# EDA

In [None]:
# Columns available for the exploratory analysis.
data.columns

In [None]:
# Basic eda

def show_dist(features, df, n_cols=4):
    """Plot a bar chart of value counts for each feature in a subplot grid.

    The grid is sized from the number of features, so no feature is silently
    dropped and no empty subplot is left visible. With 12 features and the
    default ``n_cols`` the layout is 3x4 at figsize (15, 10).

    Args:
        features: Column names of ``df`` to plot, one subplot each.
        df: DataFrame containing those columns.
        n_cols: Number of subplot columns in the grid.
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceiling division: enough rows to hold every feature.
    n_rows = -(-len(features) // n_cols)

    # squeeze=False guarantees a 2-D axes array even for a single row/column.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(15, 10 * n_rows / 3), squeeze=False
    )
    axes = axes.flatten()

    for ax, feature in zip(axes, features):
        counts = df[feature].value_counts()
        # Categories are cast to str so they are drawn as discrete labels.
        ax.bar(counts.index.astype(str), counts.values)
        ax.set_xlabel(feature)
        ax.set_ylabel('count')
        ax.set_title(f"Distribution of {feature}")
        ax.tick_params(axis='x', rotation=0)

    # Hide grid cells that have no feature assigned.
    for ax in axes[len(features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Value-count bar charts for the discrete columns of the cleaned data.
show_dist([    "studyMode",
    "previousEducation",
    "displacedStatus",
    "specialNeeds",
    "gender",
    "scholarShipStatus",
    "international",
    "dropoutStatus",
    "parentEducation",
    "parentEmployentStatus",
    "feesPaid",
    'notEnrolled'], data)

In [None]:
# Label distribution as a bar chart; the classes may need oversampling.
data['dropoutStatus'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Continuous feature analysis: summary statistics of the numeric,
# non-categorical columns.
continous_features = ['cgpa', 'totalCreditsEnrolled', 'totalCreditsApproved', 'ageAtEnrollment']
data[continous_features].describe()

In [None]:
# Boxen plot of the continuous features on a shared axis.
plt.figure(figsize=(12,9))
sns.boxenplot(data[continous_features])
plt.show()

In [None]:
# Frequency of each cgpa value in the cleaned data.
data['cgpa'].value_counts()

# Data balancing and creating the final, engineered and cleaned `data` object

In [None]:
# final oversampling before creating synthetic data
# Rows labelled 1 are treated as the minority class, rows labelled 0 as the
# majority class.
minority = data.loc[data['dropoutStatus'].eq(1)]
majority = data.loc[data['dropoutStatus'].eq(0)]

# Draw minority rows with replacement until both classes have the same size;
# the fixed random_state makes the draw repeatable.
upsampled_data = resample(
    minority,
    n_samples=len(majority),
    replace=True,
    random_state=42,
)

# From here on `data` refers to the balanced frame (original row labels are
# kept, so the index contains duplicates).
balanced_data = pd.concat([majority, upsampled_data], axis=0)
data = balanced_data

data.shape

In [None]:
# Dist after upsampling: value-count bar charts of the discrete columns,
# followed by the label distribution of the balanced data.
show_dist([    "studyMode",
    "previousEducation",
    "displacedStatus",
    "specialNeeds",
    "gender",
    "scholarShipStatus",
    "international",
    "dropoutStatus",
    "parentEducation",
    "parentEmployentStatus",
    "feesPaid"], data)
data['dropoutStatus'].value_counts().plot(kind='bar')
plt.show()

# Synthetic data creation

In [None]:
# Work on a copy so that transformations of train_df do not modify `data`.
train_df = data.copy()

In [None]:
# Columns of the training frame.
train_df.columns

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Count missing values per column of the training frame.
train_df.isna().sum()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Columns handed to the CTGAN fit call as discrete columns; also used for the
# per-column value-count checks on the synthetic data.
categorical_columns = [
  "studyMode",
  "previousEducation",
  "displacedStatus",
  "specialNeeds",
  "gender",
  "scholarShipStatus",
  "international",
  "dropoutStatus",
  "parentEducation",
  "parentEmployentStatus",
  "feesPaid",
  'notEnrolled'
]

# Numeric columns that are log-transformed and compared via histograms.
continuous_columns = [
  'ageAtEnrollment',
  'totalCreditsEnrolled',
  'totalCreditsApproved',
  'cgpa'
]

In [None]:
# preprocessing final dataset
# Replace the continuous columns of train_df with log(1 + x); np.expm1 is the
# inverse.
train_df[continuous_columns] = np.log1p(train_df[continuous_columns])

In [None]:
# Preview the training frame after the log transform.
train_df.head()

In [None]:
generator = CTGAN(
  epochs=1000,
  batch_size=500,
  generator_dim=(128,128),
  discriminator_dim=(128,128)
)

generator.fit(data, categorical_columns)

In [None]:
synthetic_data = generator.sample(n=30000)

In [None]:
train_df[continuous_columns] = np.expm1(train_df[continuous_columns])

In [None]:
train_df[continuous_columns] = train_df[continuous_columns].round(2)

In [None]:
# Post-process the sampled rows: round and bound the continuous columns, then
# recompute notEnrolled from the bounded enrolment total. The keyword
# arguments are evaluated in order, so the last one sees the updated
# totalCreditsEnrolled.
synthetic_data = synthetic_data.assign(
    totalCreditsEnrolled=lambda frame: (
        frame['totalCreditsEnrolled'].round(2).clip(lower=0, upper=200)
    ),
    totalCreditsApproved=lambda frame: (
        frame['totalCreditsApproved'].round(2).clip(lower=0, upper=300)
    ),
    cgpa=lambda frame: frame['cgpa'].clip(lower=0, upper=10).round(2),
    ageAtEnrollment=lambda frame: frame['ageAtEnrollment'].round(),
    notEnrolled=lambda frame: (frame['totalCreditsEnrolled'] == 0).astype(int),
)

In [None]:
# Preview the first rows of the post-processed synthetic data.
synthetic_data.head()

# Validating synthetic data

## Generic and preliminary manual stat checking

In [None]:
# Missing-value counts for the continuous columns of the synthetic data.
print(synthetic_data[['cgpa','totalCreditsEnrolled','totalCreditsApproved','ageAtEnrollment']].isna().sum())

# Number of synthetic rows whose approved total exceeds the enrolled total.
print((synthetic_data['totalCreditsApproved'] > synthetic_data['totalCreditsEnrolled']).sum())

# Summary statistics of the synthetic cgpa and age columns.
print(synthetic_data['cgpa'].describe())
print(synthetic_data['ageAtEnrollment'].describe())

# Most frequent values (top of value_counts) for every categorical column
# present in the synthetic data.
for c in categorical_columns:
    if c in synthetic_data.columns:
        print(c, synthetic_data[c].value_counts().head())


## Checking the distribution closeness of the continuous synthetic values with the real ones

In [None]:
def show_validation_hist(columns, real, synthetic):
    """Overlay the real and synthetic histograms, one figure per column.

    Args:
        columns: Column names present in both frames.
        real: DataFrame holding the reference values.
        synthetic: DataFrame holding the generated values.
    """
    labelled_frames = (("Real", real), ("Synthetic", synthetic))
    for name in columns:
        # Both histograms are drawn semi-transparent on the same axes.
        for label, frame in labelled_frames:
            frame[name].hist(alpha=0.5, label=label)
        plt.title(name)
        plt.legend()
        plt.show()

# Compare the balanced real data against the synthetic data, column by column.
show_validation_hist(continuous_columns, data,synthetic_data)

In [None]:
# Frequency of each cgpa value in the synthetic data.
synthetic_data['cgpa'].value_counts()

In [None]:
# Frequency of each cgpa value in the balanced real data, for comparison.
data['cgpa'].value_counts()

In [None]:
# Display the synthetic frame.
synthetic_data

In [None]:
# Value-count bar charts of the categorical columns in the synthetic data.
show_dist(categorical_columns, synthetic_data)

In [None]:
# cgpa value counts of the synthetic data and of train_df, side by side.
synthetic_data['cgpa'].value_counts(), train_df['cgpa'].value_counts()

# Creating the full final model data and exporting it

In [None]:
# Stack the balanced real rows and the synthetic rows into one frame (row
# labels of both inputs are kept as they are).
full_data = pd.concat([data,synthetic_data])
full_data.head()

In [None]:
# (rows, columns) of the combined dataset.
full_data.shape

In [None]:
# Write the combined dataset to disk without the index column.
full_data.to_csv('../data/final-data.csv', index=False)