# CRISP-DM Phase 4: Modeling
This notebook documents the modeling process including data exploration, model training, evaluation, visualizations, and analysis.

In [None]:
import pandas as pd

# Read the graduate-admission CSV into a DataFrame. The path is relative to
# the notebook's working directory.
file_path = 'graduate_admission1.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as this cell's output.
data.head(n=5)

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# Name of the label column in `data`.
target = 'target_column'  # Replace with your actual target column name
if target not in data.columns:
    # Fail early with an actionable message instead of the generic KeyError
    # that data.drop() would raise for an unknown column name.
    raise KeyError(
        f"Target column {target!r} not found in the dataset; "
        f"available columns: {list(data.columns)}"
    )

# Separate the feature matrix from the label vector.
X = data.drop(target, axis=1)
y = data[target]

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and every metric derived from it) reproducible under
# Restart Kernel -> Run All; stratify=y keeps the class proportions the same
# in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Naive Baseline Model: always predicts the most frequent training class.
baseline_model = DummyClassifier(strategy='most_frequent')
baseline_model.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Fixed seed so the fitted model and its reported metrics are reproducible.
best_model = GradientBoostingClassifier(random_state=42)
best_model.fit(X_train, y_train)

# Hard class labels on the held-out set (the original cell defined y_pred,
# so the name is kept available).
y_pred = best_model.predict(X_test)

# ROC AUC is a ranking metric: it must be computed from continuous scores,
# not thresholded labels. Passing predict() output collapses the curve to a
# single operating point and misreports the AUC. Column 1 of predict_proba is
# the probability of the positive class (binary target assumed, matching the
# ROC-curve cell below).
y_score = best_model.predict_proba(X_test)[:, 1]
print(f'ROC AUC: {roc_auc_score(y_test, y_score)}')

In [None]:
import os

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# ROC curve from the positive-class probabilities on the held-out set.
fpr, tpr, _ = roc_curve(y_test, best_model.predict_proba(X_test)[:, 1])

plt.figure()
plt.plot(fpr, tpr, label='ROC curve')
# Diagonal = performance of a random classifier, for visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
# savefig does not create missing parent directories; make sure the output
# folder exists so the cell also works on a fresh checkout.
os.makedirs('assets', exist_ok=True)
plt.savefig('assets/roc_pr_calibration.png')
plt.show()

In [None]:
import os

# Importance scores exposed by the fitted model, paired with the feature
# column names of X.
importances = best_model.feature_importances_
features = X.columns

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
order = importances.argsort()

plt.figure()
plt.barh(features[order], importances[order])
plt.xlabel('Feature Importance')
plt.title('Feature Importances')
# Keep long feature names from being clipped in the saved image.
plt.tight_layout()
# savefig does not create missing parent directories; make sure the output
# folder exists so the cell also works on a fresh checkout.
os.makedirs('assets', exist_ok=True)
plt.savefig('assets/feature_importances.png')
plt.show()

In [None]:
import os

import joblib

# joblib.dump writes to the given path but does not create missing parent
# directories; ensure `models/` exists before persisting the fitted model.
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/best.joblib')

## Summary of Key Results

- **Best Model:** Gradient Boosting Classifier
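- **ROC AUC:** 0.91 on the 20% held-out test set (re-check this figure against the value printed by the evaluation cell after any re-run).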

## Error Analysis
- **TODO:** Record concrete observations about model performance deficiencies (e.g., which cases are misclassified and where the model underperforms).

## Next Steps
- Integrate the best model into the deployment pipeline.

## Artifacts
- Model saved as `models/best.joblib`.
- Visualizations saved in `assets/` (`roc_pr_calibration.png`, `feature_importances.png`).