In [5]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
import pathlib
import os
from xgboost import XGBClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
SAMPLE_SIZE = 0.2


def preprocess(X: pd.DataFrame, y: pd.Series, sample_size: float = SAMPLE_SIZE):
    """Encode features, split into train/test, and draw a stratified training subsample.

    Steps:
      1. Columns of dtype ``object``/``category`` are one-hot encoded.
      2. Numeric columns are passed through unchanged (scaling is disabled).
      3. The result is split 90/10 into train/test, stratified on ``y``.
      4. A stratified subsample of the training split is drawn.

    When at least one categorical or numeric column exists, columns of any
    other dtype are not carried over into the processed frame. When neither
    kind exists, a copy of ``X`` is used as-is.

    Parameters
    ----------
    X : pd.DataFrame
        Feature table.
    y : pd.Series
        Targets aligned row-for-row with ``X``; also used as the
        stratification key for both splits.
    sample_size : float, default SAMPLE_SIZE
        Forwarded as ``test_size`` to the second ``train_test_split`` call,
        i.e. the share of the training split that ends up in the subsample.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_sample, y_sample, X_test, y_test)``.
    """
    # Identify categorical and numeric columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Processed column groups, concatenated side by side further down
    processed_columns = []

    # If there are categorical columns, apply one-hot encoding
    if categorical_cols:
        print("Encoding categorical columns...")
        onehot_encoder = OneHotEncoder(categories='auto')
        encoded = onehot_encoder.fit_transform(X[categorical_cols])
        # OneHotEncoder yields a SciPy sparse matrix by default, which the
        # DataFrame constructor cannot lay out as one column per category.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        X_categorical = pd.DataFrame(
            encoded,
            columns=onehot_encoder.get_feature_names_out(categorical_cols),
            # Reuse X's index so the concat below aligns rows with the numeric
            # columns even when X does not carry a default RangeIndex.
            index=X.index,
        )
        processed_columns.append(X_categorical)

    # Numeric columns are forwarded untouched: standard scaling is currently
    # disabled, the progress message is kept as-is.
    if numeric_cols:
        print("Scaling numerical columns...")
        X_numeric = X[numeric_cols]
        processed_columns.append(X_numeric)

    # Combine the processed columns
    if processed_columns:
        X_processed = pd.concat(processed_columns, axis=1)
    else:
        # No categorical or numeric columns: keep the original dataframe
        X_processed = X.copy()

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.1, stratify=y, random_state=42
    )

    # Sample a subset of the training data; only the "test" side of this
    # second split is kept as the subsample.
    _, X_sample, _, y_sample = train_test_split(
        X_train, y_train, test_size=sample_size, stratify=y_train, random_state=42
    )

    # Print the shapes of the datasets
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)
    print("X_sample shape:", X_sample.shape)
    print("y_sample shape:", y_sample.shape)

    return X_train, y_train, X_sample, y_sample, X_test, y_test

In [3]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset 697 ("Predict Students' Dropout and Academic Success").
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# Features and targets, both exposed by ucimlrepo as pandas dataframes.
X = predict_students_dropout_and_academic_success.data.features
y = predict_students_dropout_and_academic_success.data.targets

# Dataset-level metadata first, then the per-variable description.
print(predict_students_dropout_and_academic_success.metadata)
print(predict_students_dropout_and_academic_success.variables)


{'uci_id': 697, 'name': "Predict Students' Dropout and Academic Success", 'repository_url': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success', 'data_url': 'https://archive.ics.uci.edu/static/public/697/data.csv', 'abstract': "A dataset created from a higher education institution (acquired from several disjoint databases) related to students enrolled in different undergraduate degrees, such as agronomy, design, education, nursing, journalism, management, social service, and technologies.\nThe dataset includes information known at the time of student enrollment (academic path, demographics, and social-economic factors) and the students' academic performance at the end of the first and second semesters. \nThe data is used to build classification models to predict students' dropout and academic sucess. The problem is formulated as a three category classification task, in which there is a strong imbalance towards one of the classes.", 'area': 'Social Sc

In [4]:
# Map the three outcome labels to integer class ids.
# The explicit cast pins an integer dtype instead of relying on pandas
# silently downcasting the result of `replace`; it also fails loudly if a
# label outside the mapping is left behind.
y = y.replace({"Dropout": 1, "Graduate": 0, "Enrolled": 2}).astype("int64")
y

  y = y.replace({"Dropout": 1, "Graduate": 0, "Enrolled": 2})


Unnamed: 0,Target
0,1
1,0
2,1
3,0
4,0
...,...
4419,0
4420,1
4421,1
4422,0


In [66]:
# Build the train/test split plus a subsample holding 10% of the training rows.
X_train, y_train, X_sample, y_sample, X_test, y_test = preprocess(X, y, sample_size=0.1)
print(*(frame.shape for frame in (X_train, X_sample, X_test)))

Scaling numerical columns...
X_train shape: (3981, 36)
X_test shape: (443, 36)
y_train shape: (3981, 1)
y_test shape: (443, 1)
X_sample shape: (399, 36)
y_sample shape: (399, 1)
(3981, 36) (399, 36) (443, 36)


In [34]:
# Reference run: fit on the full training split, score on the held-out test split.
model = XGBClassifier().fit(X_train, y_train)
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("F1 Score:", f1_score(y_true=y_test, y_pred=predictions, average='macro'))

Accuracy: 0.7697516930022573
F1 Score: 0.7027021417109006


In [35]:
# Same model, but fitted on the training subsample only; scored on the same test split.
model = XGBClassifier().fit(X_sample, y_sample)
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("F1 Score:", f1_score(y_true=y_test, y_pred=predictions, average='macro'))

Accuracy: 0.7200902934537246
F1 Score: 0.6338423919805681


In [46]:
# Store features and target side by side in a single CSV file.
dataset = pd.concat([X, y], axis=1)
# to_csv does not create missing folders, so make sure the output directory
# exists before writing.
pathlib.Path("../data").mkdir(parents=True, exist_ok=True)
dataset.to_csv("../data/students_dropout.csv")

In [60]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset 880 (SUPPORT2).
support2 = fetch_ucirepo(id=880)

# Features and targets, both exposed by ucimlrepo as pandas dataframes.
X = support2.data.features
y = support2.data.targets

# Dataset-level metadata first, then the per-variable description.
print(support2.metadata)
print(support2.variables)
  


{'uci_id': 880, 'name': 'SUPPORT2', 'repository_url': 'https://archive.ics.uci.edu/dataset/880/support2', 'data_url': 'https://archive.ics.uci.edu/static/public/880/data.csv', 'abstract': "This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994.\nEach row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. \nIt is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier deci

In [62]:
# Count how often each distinct combination of target values occurs.
# NOTE(review): a commented-out mapping of "Adult"/"Senior" labels used to sit
# here; those labels do not show up in the counts recorded for this dataset,
# so it looks like a leftover from another experiment -- confirm before
# reinstating any label encoding.
y.value_counts()

death  hospdead  sfdm2              
1      1         <2 mo. follow-up       2261
0      0         no(M2 and SIP pres)    1673
1      0         no(M2 and SIP pres)    1385
                 <2 mo. follow-up        862
                 adl>=4 (>=5 if sur)     506
0      0         adl>=4 (>=5 if sur)     358
1      0         SIP>=30                 292
0      0         SIP>=30                 268
1      1         adl>=4 (>=5 if sur)      52
       0         Coma or Intub            25
0      0         Coma or Intub             9
1      1         Coma or Intub             7
                 SIP>=30                   4
                 no(M2 and SIP pres)       3
Name: count, dtype: int64

In [57]:
# Build the train/test split plus a subsample holding 10% of the training rows.
# NOTE(review): the value counts recorded for `y` list three target columns
# (death, hospdead, sfdm2), while the shapes recorded for this cell show a
# single-column target -- the stored output presumably comes from an earlier
# run on different data; confirm by re-running from a fresh kernel.
X_train, y_train, X_sample, y_sample, X_test, y_test = preprocess(X, y, sample_size=0.1)
print(*(frame.shape for frame in (X_train, X_sample, X_test)))

Scaling numerical columns...
X_train shape: (2050, 7)
X_test shape: (228, 7)
y_train shape: (2050, 1)
y_test shape: (228, 1)
X_sample shape: (205, 7)
y_sample shape: (205, 1)
(2050, 7) (205, 7) (228, 7)


In [58]:
# Reference run: fit on the full training split, score on the held-out test split.
model = XGBClassifier().fit(X_train, y_train)
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("F1 Score:", f1_score(y_true=y_test, y_pred=predictions, average='macro'))

Accuracy: 0.8289473684210527
F1 Score: 0.5836883749239197


In [59]:
# Same model, but fitted on the training subsample only; scored on the same test split.
model = XGBClassifier().fit(X_sample, y_sample)
predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print("F1 Score:", f1_score(y_true=y_test, y_pred=predictions, average='macro'))

Accuracy: 0.8026315789473685
F1 Score: 0.5741855156671508
