# Model Training

## Import required libraries

In [99]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb

In [114]:
# Define the input data file and model artifact paths
data_file = "data/Assignment-2_Data.csv"  
artifact_base_path = "artifacts"
artifact_file = f"{artifact_base_path}/model.bin"
target = 'y'

# List of models with their configurations
model_infos = [
  {
    "name": "LR",
    "model": LogisticRegression(),
    "params": {
      "solver": ['liblinear'],
      "C": [1.0],
      "max_iter": [10, 50, 100]
    }
  },
  {
    "name": "DecisionTreeClassifier",
    "model": DecisionTreeClassifier(),
  },
  {
    "name": "RandomForestClassifier",
    "model": RandomForestClassifier(),
  },
  {
    "name": "XGBClassifier",
    "model": xgb.XGBClassifier(
      objective="binary:logistic",  # For binary classification
      random_state=42,              # Random seed for reproducibility
      n_jobs=-1,                    # Used for parallel processing, It will use all available CPU's
      use_label_encoder=False
    ),
    "params": {
      "n_estimators": [200],         # Number of boosting rounds
      "learning_rate": [0.1],    # Learning rate
      "max_depth": [5],                 # Maximum depth of each tree
    }
  },
  {
    "name": "LDA",
    "model": LinearDiscriminantAnalysis()
  },
  {
    "name": "NB",
    "model": GaussianNB()
  }
]




## Read Dataset

* Read Dataframe by using `pandas`
* Remove null values
* Remove not required fields
* Remove `age` outliers

In [95]:
def read_data(data_file):
  """
  Read the input data from a CSV file.

  Args:
    data_file (str): Path to the input data file.

  Returns:
    pd.DataFrame: A pandas DataFrame containing the input data.
  """
  df = pd.read_csv(data_file)
  df.dropna(inplace=True)
  not_required_field = [ "Id", "default" ]
  df.drop(columns=not_required_field, inplace=True)
  indexes_to_drop = df[(df['age'] == 999.0) | (df['age'] == -1)].index
  df.drop(indexes_to_drop, inplace=True)
  return df

In [96]:
df = read_data("data/Assignment-2_Data.csv")

In [97]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1,44.0,technician,single,secondary,29.0,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33.0,entrepreneur,married,secondary,2.0,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47.0,blue-collar,married,unknown,1506.0,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33.0,unknown,single,unknown,1.0,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35.0,management,married,tertiary,231.0,yes,no,unknown,5,may,139,1,-1,0,unknown,no


## Split Dataframe into `train`, `val` and `test` sets

In [98]:
def split_dataframe(df):
  """
  Split the input dataframe into train, validation and test sets
  
  Args:
    df (pd.DataFrame): Input DataFrame.
  
  Returns:
    pd.DataFrame: DataFrames for training, validation, and test sets.
  """
  df[target] = df[target].replace({'yes': 1, 'no': 0})
  df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
  df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
  df_test = df_test.reset_index(drop=True)

  y_train = df_train[target].values
  y_val = df_val[target].values
  y_test = df_test[target].values

  del df_train[target]
  del df_val[target]
  del df_test[target]
  return df_train, y_train, df_val, y_val, df_test, y_test

In [100]:
df_train, y_train, df_val, y_val, df_test, y_test = split_dataframe(df)

## Preprocess dataset

* Convert dataframe into dict
* Apply one hot coding by using  `DictVectorizer`
* Apply normalization by using `StandardScaler`

In [105]:
def pre_process_data(df_train, df_val, df_test):
  """
  Preprocess the data including one-hot encoding and standardization.

  Args:
    df_train (pd.DataFrame): Training data.
    df_val (pd.DataFrame): Validation data.
    df_test (pd.DataFrame): Test data.

  Returns:
    Tuple: Tuple containing the DictVectorizer, StandardScaler, and preprocessed data.
  """
  dict_train = df_train.to_dict(orient='records')
  dict_val = df_val.to_dict(orient='records')
  dict_test = df_test.to_dict(orient='records')

  dv = DictVectorizer(sparse=False)
  X_train = dv.fit_transform(dict_train)
  X_val = dv.transform(dict_val)
  X_test = dv.transform(dict_test)

  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_val = scaler.transform(X_val)
  X_test = scaler.transform(X_test)

  return dv, scaler, X_train, X_val, X_test

In [106]:
dv, scaler, X_train, X_val, X_test = pre_process_data(df_train, df_val, df_test)

## Resample train dataset

In [101]:
df['y'].value_counts()

y
0    39910
1     5286
Name: count, dtype: int64

So here our dataset is unbalnced dataset, To make it balanced dataset we should use `SMOTE` for training dataset only

In [103]:
def resample_train_data(X_train, y_train):
  """
  Apply SMOTE to balance the training data

  Args:
    X_train
    y_train

  Returns:
    Tuple: containing the X_resampled, y_resampled
  """
  smote = SMOTE(sampling_strategy="auto", random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
  return X_resampled, y_resampled

In [108]:
# X_resampled, y_resampled = resample_train_data(X_train, y_train)

In [109]:
# Smote is not working in notebook version, but its working into train.py file
X_resampled = X_train
y_resampled = y_train

In [110]:
def get_best_params_and_estimator(model_info, X_train, y_train, X_val, y_val):
  """
  Get best hyperparameters and estimator by using GridSearchCV

  Args:
    model_info: Model information containing model name, model object, set of parameters
    X_train: preprocessed training data set
    y_train: training label
    X_val: preprocessed validation data set
    y_val: validation label

  Returns:
    Tuple: Tuple containing best hyper parameters and best estimators
  """
  model = model_info['model']
  model_name = model_info['name']
  params = model_info.get('params', {})

  grid_search = GridSearchCV(model, params, cv=5)
  grid_search.fit(X_train, y_train)
  best_params = grid_search.best_params_
  best_estimator = grid_search.best_estimator_
  return best_params, best_estimator

In [111]:
def get_model_evaluation(model, X, y):
  """
  Get model evaluation report, It helps us to understand how our model is behaving

  Args:
    model: Model on which we have to evaluate model accuracy
    X: Dataset on which we will evaluate model accuracy
    y: Actual label (Truth label values)
  """
  y_pred = model.predict(X)
  acc = roc_auc_score(y, y_pred)
  return round(acc, 4)

In [112]:
def get_model_reports(model_infos, X_train, y_train, X_val, y_val):
  """
  Get the model report based on provided multiple model information

  Args:
    model_infos: Multiple model information
    X_train, y_train: Training dataset
    X_val, y_val: Validatioin dataset

  Returns:
    List: List of model reports containing, model name, model object, accuracy, hyperparameters
  """
  model_reports = []
  for model_info in model_infos:
    best_params, best_estimator = get_best_params_and_estimator(model_info, X_train, y_train, X_val, y_val)
    train_accuracy = get_model_evaluation(best_estimator, X_train, y_train)
    val_accuracy = get_model_evaluation(best_estimator, X_val, y_val)
    
    print(f"Model: {model_info['name']}, train_accuracy: {train_accuracy}, validation accuracy: {val_accuracy}")
    
    model_report = {
      "name": model_info["name"],
      "model": best_estimator,
      "best_params": best_params,
      "train_accuracy": train_accuracy,
      "val_accuracy": val_accuracy
    }
    model_reports.append(model_report)

  return model_reports

In [115]:
model_report = get_model_reports(model_infos, X_resampled, y_resampled, X_val, y_val)

Model: LR, train_accuracy: 0.6484, validation accuracy: 0.6722
Model: DecisionTreeClassifier, train_accuracy: 1.0, validation accuracy: 0.6934
Model: RandomForestClassifier, train_accuracy: 1.0, validation accuracy: 0.6954




Model: XGBClassifier, train_accuracy: 0.7874, validation accuracy: 0.7162
Model: LDA, train_accuracy: 0.6958, validation accuracy: 0.7173
Model: NB, train_accuracy: 0.6956, validation accuracy: 0.6995


In [116]:
def get_best_model_info(model_reports):
  """
  Get best model based on accuracy

  Args:
    model_report: List of model reports

  Returns:
    best_model: Best model based on model accuracy
  """
  sorted_report = sorted(model_reports, key=lambda x: x['val_accuracy'], reverse=True)
  return sorted_report[0]

In [117]:
best_model_info = get_best_model_info(model_report)

### Save artifacts

In [121]:
def save_artifacts(dictVectorizer, standardScaler, model, model_file):
  """
  Save artifacts, that can be used in web server to generate prediction

  Args:
    dictVectorizer: One hot encoder
    standardScaler: Standard Scaler
    model: Best model
    model_file: File name, in which model should be stored
  """
  pipeline = make_pipeline(dictVectorizer, standardScaler, model)
  with open(model_file,'wb') as f_out: 
    pickle.dump(pipeline, f_out)


In [122]:
best_model = best_model_info["model"]
save_artifacts(dv, scaler, best_model, artifact_file)

In [123]:
best_model