In [None]:
# ! kaggle competitions download -c playground-series-s4e6

In [None]:
# ! unzip playground-series-s4e6.zip

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoost, CatBoostClassifier, Pool
from optuna.visualization import plot_slice
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Labelled training rows (features plus the `Target` column used further down).
X = pd.read_csv('train.csv')
# Rows to be scored for the submission. Kept under its own name: `y` is used
# for the target vector later on, and loading the test file into `y` meant the
# test rows were overwritten (and lost) as soon as the target was extracted.
test_df = pd.read_csv('test.csv')

In [None]:
# Preview the first rows to sanity-check the parsed columns.
X.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
X.info()

In [None]:
# Summary statistics for every column, not only the numeric ones (include='all').
X.describe(include='all')

In [None]:
# Overview of the target and a handful of key features on a single 3x2 grid.
# `plt` and `sns` come from the notebook's import cell; they are not re-imported here.
#
# Each entry is (seaborn plotting function, its keyword arguments, panel title),
# listed in row-major panel order.
overview_panels = [
    (sns.countplot, {'x': 'Target'}, 'Distribution of Target Variable'),
    (sns.histplot, {'x': 'Age at enrollment', 'kde': True, 'bins': 30}, 'Distribution of Age at Enrollment'),
    (sns.histplot, {'x': 'Admission grade', 'kde': True, 'bins': 30}, 'Distribution of Admission Grade'),
    (sns.countplot, {'x': 'Gender'}, 'Distribution of Gender'),
    (sns.countplot, {'x': 'Scholarship holder'}, 'Distribution of Scholarship Holders'),
    (sns.boxplot, {'x': 'Target', 'y': 'Admission grade'}, 'Admission Grade vs. Target'),
]

# Explicit figure/axes objects instead of the pyplot "current axes" state machine,
# so every plot is drawn on exactly the panel it is paired with.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 15))
for ax, (plotter, plot_kwargs, title) in zip(axes.flat, overview_panels):
    plotter(data=X, ax=ax, **plot_kwargs)
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Number of columns with fewer than 40 distinct values.
int((X.nunique() < 40).sum())

In [None]:
# Visualize how each low-cardinality feature (fewer than 40 distinct values)
# is distributed across the target classes.
features_to_visualize = X.columns[X.nunique() < 40]

# Size the grid from the number of selected features. A fixed 10x3 grid
# silently skips every feature past the 30th (zip stops at the shorter
# iterable) and shows empty axes when fewer than 30 features qualify.
n_cols = 3
n_features = len(features_to_visualize)
n_rows = max(1, (n_features + n_cols - 1) // n_cols)

# 3.6 inches per row keeps the original 18x36 proportions for a 10-row grid.
# squeeze=False guarantees a 2-D axes array even for a single row.
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 3.6 * n_rows), squeeze=False)
panels = axs.flatten()

for ax, feature in zip(panels, features_to_visualize):
    sns.countplot(data=X, x=feature, hue='Target', palette='viridis', ax=ax)
    ax.set_title(f'{feature} Distribution by Target')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.legend(title='Target')

# Hide the unused panels of the last row instead of drawing empty frames.
for ax in panels[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Look at the frame once more before the target column is encoded.
X.head()

In [None]:
def preprocess(data):
    """Encode the string ``Target`` labels of ``data`` as integer class codes.

    Mapping: ``'Dropout' -> 0``, ``'Enrolled' -> 1``, ``'Graduate' -> 2``.
    Labels outside the mapping become ``NaN`` (``Series.map`` behaviour).

    The frame is modified in place and the same object is returned. Calling
    the function on a frame whose ``Target`` column already holds only the
    codes 0/1/2 leaves it untouched, so re-running the notebook cell does not
    turn every label into ``NaN``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Target`` column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``Target`` encoded.

    Raises
    ------
    KeyError
        If ``data`` has no ``Target`` column.
    """
    # Named `target_codes` rather than `dict` so the builtin is not shadowed.
    target_codes = {
        'Dropout': 0,
        'Enrolled': 1,
        'Graduate': 2,
    }
    labels = data['Target']
    already_encoded = labels.isin(list(target_codes.values())).all()
    if not already_encoded:
        data['Target'] = labels.map(target_codes)
    return data
# Encode the target; `preprocess` writes into the frame it is given and
# returns that same frame, so `X` is rebound to the mutated object.
X = preprocess(X)
X

In [None]:
def new_features(data):
    """Append engineered academic and financial columns to ``data``.

    The frame is extended in place and returned. Columns added, in order:
    ``Total_Curricular_units_enrolled``, ``Total_Curricular_units_approved``,
    ``Total_Curricular_units_grade``, ``Average_Curricular_units_grade``,
    ``Performance_ratio``, ``tuition_debtor``, ``Academic_growth``,
    ``Stability_index`` and ``Financial_strain``.
    """
    # Semester totals.
    enrolled_1st = data['Curricular units 1st sem (enrolled)']
    enrolled_2nd = data['Curricular units 2nd sem (enrolled)']
    enrolled_total = enrolled_1st + enrolled_2nd
    data['Total_Curricular_units_enrolled'] = enrolled_total

    approved_1st = data['Curricular units 1st sem (approved)']
    approved_2nd = data['Curricular units 2nd sem (approved)']
    approved_total = approved_1st + approved_2nd
    data['Total_Curricular_units_approved'] = approved_total

    grade_1st = data['Curricular units 1st sem (grade)']
    grade_2nd = data['Curricular units 2nd sem (grade)']
    grade_total = grade_1st + grade_2nd
    data['Total_Curricular_units_grade'] = grade_total

    # The tiny offset keeps both ratios finite when no units were enrolled.
    enrolled_denominator = enrolled_total + 1e-9
    data['Average_Curricular_units_grade'] = grade_total / enrolled_denominator
    data['Performance_ratio'] = approved_total / enrolled_denominator

    # 1 when fees are overdue AND the student is flagged as a debtor.
    fees_overdue = data['Tuition fees up to date'] == 0
    in_debt = data['Debtor'] == 1
    tuition_debtor = (fees_overdue & in_debt).astype(int)
    data['tuition_debtor'] = tuition_debtor

    # Semester-over-semester change in grade, and in enrolled/approved load.
    data['Academic_growth'] = grade_2nd - grade_1st
    data['Stability_index'] = (enrolled_1st - enrolled_2nd).abs() + (approved_1st - approved_2nd).abs()

    # 1 when a tuition debtor also holds no scholarship.
    no_scholarship = data['Scholarship holder'] == 0
    data['Financial_strain'] = (no_scholarship & (tuition_debtor == 1)).astype(int)

    return data

# Add the engineered columns to the training frame.
X = new_features(X)

In [None]:
# CatBoost, Pool, train_test_split and accuracy_score are imported in the
# notebook's import cell.

# Target vector and feature matrix. The feature matrix is built as a NEW frame:
# dropping `id`/`Target` from `X` in place made this cell fail on a second run
# and broke every later cell that still needs those columns.
y = X['Target']
features = X.drop(columns=['id', 'Target'])

# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# Multiclass CatBoost on GPU; the listed columns are handled as categorical features.
params = {'iterations': 1000,  "learning_rate": 0.1, 'depth': 10, 'loss_function': 'MultiClass',
                'cat_features': ['Marital status', 'Application mode', 'Nacionality', 'Gender', 
                                'Scholarship holder', 'Debtor', 'Tuition fees up to date', 
                                'Previous qualification', "Mother's qualification", "Father's qualification",
                                "Mother's occupation", "Father's occupation", 'Daytime/evening attendance', 'tuition_debtor'
                                ],
                'verbose': 0, 'task_type': 'GPU', 'devices': '0-7'}
model = CatBoost(params=params)

model.fit(X_train, y_train)
# Ask for hard class labels rather than the default raw scores.
pred = model.predict(X_test, prediction_type='Class')
pred

In [None]:
# Accuracy of the current predictions `pred` on the held-out split.
accuracy_score(y_test, pred)

In [None]:
# Per-feature importance scores of the fitted model, evaluated on the training pool.
feature_names = X_train.columns
feature_importances = model.get_feature_importance(Pool(X_train, y_train), type="FeatureImportance")

# Tabulate the scores and rank them, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bars; the y axis is flipped so the top-ranked feature is drawn first.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
ax.invert_yaxis()
plt.show()

In [None]:
import lightgbm as lgb

# Multiclass LightGBM configuration: three classes, log-loss metric, GPU device.
params = dict(
    objective='multiclass',
    num_class='3',
    metric='multi_logloss',
    eta=0.01,
    verbosity=-1,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    num_gpu='7',
)
lgb_tr = lgb.Dataset(X_train, y_train)

# The training set is the only monitored set, so the metric logged every
# 50 rounds is training loss, not validation loss.
model = lgb.train(
    params,
    lgb_tr,
    num_boost_round=1000,
    valid_sets=[lgb_tr],
    valid_names=['train'],
    callbacks=[lgb.log_evaluation(period=50)],
)

In [None]:
import numpy as np

# The multiclass model returns one score column per class; the arg-max column
# index is used as the predicted class code.
class_scores = model.predict(X_test)
pred = np.argmax(class_scores, axis=1)
accuracy_score(y_test, pred)

In [None]:
# Baseline CatBoostClassifier (no categorical columns declared) plus its
# feature-importance chart. CatBoostClassifier and Pool come from the
# notebook's import cell.
#
# The existing X_train / X_test / y_train / y_test split is reused on purpose.
# Re-deriving it from `X` with the same random_state would only reproduce the
# identical partition, and reading or dropping `Target` / `id` from `X` raises
# KeyError whenever those columns are no longer present in the frame.

# Initialize the CatBoost model
catboost_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=10,
    verbose=0,
    task_type='GPU',
    devices='0-7',
)

# Train the model on the training set
catboost_model.fit(X_train, y_train)

# Per-feature importance scores, evaluated on the training pool.
feature_importances = catboost_model.get_feature_importance(Pool(X_train, y_train), type="FeatureImportance")
feature_names = X_train.columns

# Tabulate the scores and rank them, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bars; the y axis is flipped so the top-ranked feature is drawn first.
plt.figure(figsize=(12, 8))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()


In [None]:
from sklearn.metrics import accuracy_score
def objective(trial):
    """Optuna objective: train a CPU CatBoost classifier and score it.

    Hyper-parameters are sampled from ``trial``; the classifier is fitted on
    the notebook-level ``X_train`` / ``y_train`` and the value returned is its
    accuracy on ``X_test`` / ``y_test``.
    """
    # Sampled in a fixed order so every trial records the same parameter set.
    sampled = {
        'iterations': trial.suggest_int("iterations", 500, 2000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        'depth': trial.suggest_int("depth", 2, 6),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0.0, 10.0),
        'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        'od_wait': trial.suggest_int("od_wait", 10, 50),
    }
    classifier = CatBoostClassifier(
        cat_features=['tuition_debtor', 'Debtor', 'Tuition fees up to date', ],
        verbose=False,
        task_type='CPU',
        loss_function='MultiClass',
        **sampled,
    )
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))

In [None]:
# Only warnings and errors from Optuna; per-trial INFO lines are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# NOTE: despite the variable name, the object is a multivariate TPESampler;
# no Hyperband pruner is passed to the study.
sampler_options = dict(n_startup_trials=30, n_ei_candidates=13, multivariate=True)
bohb_sampler = optuna.samplers.TPESampler(**sampler_options)

# Maximize the accuracy returned by `objective` over 300 trials, using all cores.
study = optuna.create_study(direction='maximize', sampler=bohb_sampler)
study.optimize(objective, n_trials=300, n_jobs=-1, show_progress_bar=True)

In [None]:
from optuna.visualization import plot_slice

# Slice plot of the study: sampled parameter values against the objective value.
plot_slice(study)

In [None]:
# from catboost import CatBoostClassifier
# import numpy as np
# cat_features = ['Marital status', 'Application mode', 'Daytime/evening attendance', 
#                 'Nacionality', "Mother's qualification", "Father's qualification", 
#                 "Mother's occupation", "Father's occupation", 'Course']

# # Initialize the CatBoost model
# classifier = CatBoostClassifier(
#     n_estimators=1000,
#     depth=3,  # Adjusted depth for better learning capacity
#     loss_function='MultiClass',
#     verbose=False,
#     learning_rate=0.1,
#     cat_features=cat_features,
#     eval_metric='Accuracy',
#     task_type='GPU',
#     devices='0-7'
# )

# # Define the parameter grid
# param_grid = {
#     'l2_leaf_reg': np.linspace(0, 5, 10)
# }

# # Perform grid search
# classifier.grid_search(param_grid, X_train, y_train, cv=5, plot=True, refit=True, verbose=False)

# # Evaluate the model on the test set
# y_pred = classifier.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Test Set Accuracy: {accuracy:.2f}")

In [None]:
# X = pd.read_csv('train.csv')
# X_test = pd.read_csv('test.csv')

# X = preprocess(X)
# X = new_features(X)

# X_test = new_features(X_test)

# y = X['Target']
# X_train = X.drop(columns='Target')

# classifier = CatBoostClassifier(
#     n_estimators=1000,
#     depth=3,  # Adjusted depth for better learning capacity
#     loss_function='MultiClass',
#     l2_leaf_reg=0.5555555,
#     verbose=False,
#     learning_rate=0.1,
#     cat_features=cat_features,
#     eval_metric='Accuracy',
#     task_type='GPU',
#     devices='0-7'
# )

# classifier.fit(X_train, y)
# pred = classifier.predict(X_test)

In [None]:
# Integer class code -> label string (inverse of the encoding used by `preprocess`).
target_dict = {
    0: 'Dropout',
    1: 'Enrolled',
    2: 'Graduate',
}

# Decode with a plain dictionary lookup per prediction:
# * no helper named `reversed`, which shadowed the builtin of the same name;
# * no `np.vectorize`, whose output dtype is inferred from the first call only;
# * an unknown code raises KeyError instead of silently becoming `None`.
# `np.ravel` flattens (n, 1) shaped predictions, so `y_pred` is always 1-D.
y_pred = np.array([target_dict[int(code)] for code in np.ravel(pred)])

In [None]:
# Build the submission from the competition test file, not from `X_test`.
# `X_test` is the validation split of train.csv with `id` already dropped, so
# `X_test[['id', 'Target']]` raises KeyError, and it never holds test rows.
# Assumes test.csv carries an `id` column plus the raw feature columns of train.csv.
test_df = pd.read_csv('test.csv')

# Same feature engineering as the training frame, restricted to (and ordered
# like) the columns the model was fitted on.
test_features = new_features(test_df.copy())[X_train.columns]

# `model` is the most recently trained notebook-level model. A 2-D result is
# treated as per-class scores (arg-max = class code); anything else is taken
# as class codes directly.
raw_output = np.asarray(model.predict(test_features))
if raw_output.ndim == 2 and raw_output.shape[1] > 1:
    test_codes = raw_output.argmax(axis=1)
else:
    test_codes = raw_output.ravel().astype(int)

submission = pd.DataFrame({
    'id': test_df['id'],
    'Target': pd.Series(test_codes, index=test_df.index).map(target_dict),
})
submission.to_csv('submission.csv', index=False)

In [None]:
# ! kaggle competitions submit -c playground-series-s4e6 -f submission.csv -m "1"