In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load data
train_df = pd.read_csv('train.csv')

# Drop 'id' and separate features/target
X = train_df.drop(columns=['id', 'Target'])
y = train_df['Target']

# Optional: Encode categorical features if needed (none here except 'Target')
# If Target is categorical (object), encode it
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split into train/validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train Decision Tree
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred, target_names=le.classes_))

# Load test set and sample submission
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

# Drop 'id' and use same features as train
X_test = test_df.drop(columns=['id'])

# Predict
test_preds_encoded = clf.predict(X_test)
test_preds = le.inverse_transform(test_preds_encoded)

# Prepare submission
submission_df['Target'] = test_preds
submission_df.to_csv('decision_tree_submission.csv', index=False)
print("Submission saved as 'decision_tree_submission.csv'")


Validation Accuracy: 0.742093570308416

Classification Report:
               precision    recall  f1-score   support

     Dropout       0.78      0.79      0.79      5028
    Enrolled       0.48      0.48      0.48      3017
    Graduate       0.83      0.81      0.82      7259

    accuracy                           0.74     15304
   macro avg       0.70      0.70      0.70     15304
weighted avg       0.74      0.74      0.74     15304

Submission saved as 'decision_tree_submission.csv'
