#### Classification Model : Predicts **Delay** in Research Paper Completion

---

In [None]:
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import joblib
import json

Load the dummy-data CSV and split it into the input features (`x`) and the target label (`y`)

In [None]:
# Load the raw dataset (Colab-style absolute path).
data = pandas.read_csv("/content/phd_delay_data.csv")

# Inputs are every column except the target; the target is the `delayed` column.
x = data.drop(columns = ['delayed'])
y = data['delayed']

# Feature names to save in the metadata later. They are taken from `x`, not
# `data`, so the target column `delayed` is not listed as a model input and the
# saved order matches the columns the model is fitted on.
feat = x.columns.tolist()

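Cross-validation is preferred over a single train/test split: every row is used for both training and validation, which gives a more reliable performance estimate and makes overfitting to one lucky split easier to detect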

Hyperparameter tuning via grid search tries every combination in the parameter grid and keeps the best-scoring one

In [None]:
# Base estimator; the fixed seed keeps the forest reproducible between runs.
model = RandomForestClassifier(random_state = 69)

# Stratified folds preserve the class proportions of the target in every split.
# Rows are shuffled with a fixed seed; the number of folds is the library default.
crossval = StratifiedKFold(random_state = 69, shuffle = True)

# Search space: 3 * 2 * 2 * 2 = 24 candidate settings.
hyperpara = dict(
    max_depth = [5, 10, 15],
    min_samples_split = [2, 5],
    min_samples_leaf = [1, 2],
    max_features = ['sqrt', 'log2'],
)

# Each metric is registered under its own scorer name.
scoring = {metric: metric for metric in ('accuracy', 'recall', 'precision', 'f1')}

# All four metrics are recorded for every candidate, but the winning candidate
# (the one that gets refitted) is the one with the best recall.
grid = GridSearchCV(
    model,
    hyperpara,
    scoring = scoring,
    refit = 'recall', # type: ignore
    cv = crossval,
    n_jobs = -1,   # use every available core
    verbose = 2,
)

In [None]:
# Run the cross-validated search over the whole grid.
grid.fit(x, y)

# Report the winning hyperparameters, then their score on the refit metric.
print(grid.best_params_, grid.best_score_, sep = "\n")

Time to save the trained model as .pkl using Joblib

The accompanying metadata is saved as `.json` using the standard-library `json` module

In [None]:
# Persist only the refitted best estimator, not the whole search object.
joblib.dump(grid.best_estimator_, "research_delay_model.pkl")

# Sidecar description of the model: class labels, feature names, model type.
# Note: JSON object keys are always strings, so the integer label keys 0 and 1
# are written to the file as "0" and "1".
metadata = dict(
    labels = {0: 'On Time', 1: 'Delayed'},
    features = feat,
    model = 'Random Forest Classifier',
)

with open("research_delay_metadata.json", 'w') as meta_fh:
    meta_fh.write(json.dumps(metadata, indent = 4))