**Import necessary libraries**

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score


**Load the training data and assign column names**

In [2]:
# Define feature names
feature_names = ['MOSTYPE' , 'MAANTHUI' , 'MGEMOMV' , 'MGEMLEEF' , 'MOSHOOFD' , 'MGODRK' , 'MGODPR' , 'MGODOV' , 'MGODGE' , 'MRELGE' , 'MRELSA' , 'MRELOV' , 'MFALLEEN' , 'MFGEKIND' , 'MFWEKIND' , 'MOPLHOOG' , 'MOPLMIDD' , 'MOPLLAAG' , 'MBERHOOG' , 'MBERZELF' , 'MBERBOER' , 'MBERMIDD' , 'MBERARBG' , 'MBERARBO' , 'MSKA' , 'MSKB1' , 'MSKB2' , 'MSKC' , 'MSKD' , 'MHHUUR' , 'MHKOOP' , 'MAUT1' , 'MAUT2' , 'MAUT0' , 'MZFONDS' , 'MZPART' , 'MINKM30' , 'MINK3045' , 'MINK4575' , 'MINK7512' , 'MINK123M' , 'MINKGEM' , 'MKOOPKLA' , 'PWAPART' , 'PWABEDR' , 'PWALAND' , 'PPERSAUT' , 'PBESAUT' , 'PMOTSCO' , 'PVRAAUT' , 'PAANHANG' , 'PTRACTOR' , 'PWERKT' , 'PBROM' , 'PLEVEN' , 'PPERSONG' , 'PGEZONG' , 'PWAOREG' , 'PBRAND' , 'PZEILPL' , 'PPLEZIER' , 'PFIETS' , 'PINBOED' , 'PBYSTAND' , 'AWAPART' , 'AWABEDR' , 'AWALAND' , 'APERSAUT' , 'ABESAUT' , 'AMOTSCO' , 'AVRAAUT' , 'AAANHANG' , 'ATRACTOR' , 'AWERKT' , 'ABROM' , 'ALEVEN' , 'APERSONG' , 'AGEZONG' , 'AWAOREG' , 'ABRAND' , 'AZEILPL' , 'APLEZIER' , 'AFIETS' , 'AINBOED' , 'ABYSTAND' , 'CARAVAN']

# Load training data with assigned column names
train_data = pd.read_csv("https://github.com/szbela87/ml_22_elteik/raw/main/mp2/ticdata2000.txt", sep='\t', header=None, names=feature_names)


**Preprocess the data**

In [3]:
# Split features and target variable
X = train_data.drop(columns=['CARAVAN'])
y = train_data['CARAVAN']


**Train-test split**

In [4]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Train the Gradient Boosting model**

In [5]:
# Initialize Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Train the model
gb_classifier.fit(X_train, y_train)


**Evaluate the model**

In [6]:
# Make predictions on the test set
y_pred = gb_classifier.predict(X_test)

# Calculate balanced accuracy
bal_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", bal_acc)


Balanced Accuracy: 0.5091168967245228


**Hyperparameter tuning using GridSearchCV**

In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Initialize Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='balanced_accuracy')

# Perform grid search to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model obtained from grid search
best_gb_classifier = grid_search.best_estimator_


Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 200}


**Make predictions and calculate balanced accuracy**

In [11]:
# Make predictions on the test set using the best model
y_pred = best_gb_classifier.predict(X_test)

# Calculate balanced accuracy
bal_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", bal_acc)


Balanced Accuracy: 0.5273506901735684


**Prepare submission file**

In [14]:
# Load the test data
test_data = pd.read_csv("https://github.com/szbela87/ml_22_elteik/raw/main/mp2/ticeval2000.txt", sep='\t', header=None, names=feature_names[:-1])

# Make predictions on the test set using the best model
test_predictions = best_gb_classifier.predict(test_data)

# Save predictions to a submission file
submission_filename = "mldl_competition2_Mohammed_Aymen_Amimoussa_submission2.txt"
with open(submission_filename, 'w') as f:
    for pred in test_predictions:
        f.write(f"{pred}\n")

print("Submission file created:", submission_filename)


Submission file created: mldl_competition2_Mohammed_Aymen_Amimoussa_submission2.txt
