!pip install xgboost
!pip install scikit-learn

In [2]:
from datetime import datetime 

In [3]:
import   pandas                   as        pd
import   numpy                    as        np
import   matplotlib.pyplot        as        plt
import   sklearn.metrics          as        metrics

In [4]:
from    sklearn.svm               import    SVC
from    sklearn.ensemble          import    RandomForestClassifier, AdaBoostClassifier 
from    sklearn.model_selection   import    StratifiedKFold
from    sklearn.model_selection   import    StratifiedGroupKFold
from    sklearn.model_selection   import    RandomizedSearchCV
from    sklearn.linear_model      import    LogisticRegression
from    sklearn.tree              import    DecisionTreeClassifier
from    sklearn.neighbors         import    KNeighborsClassifier
from    sklearn.naive_bayes       import    GaussianNB   
from    xgboost                   import    XGBClassifier  
from    sklearn.metrics           import    make_scorer, precision_score, recall_score, f1_score
from    sklearn.metrics           import    confusion_matrix, classification_report
from    sklearn.model_selection   import    train_test_split,cross_val_score,cross_val_predict 
import warnings
warnings.filterwarnings("ignore")


In [5]:
def _positive_class_scores(classifier, features, hard_labels):
    """Return continuous positive-class scores suitable for ROC analysis.

    Prefers ``predict_proba`` (second column), then ``decision_function``.
    Falls back to ``hard_labels`` for estimators that expose neither.
    """
    if hasattr(classifier, "predict_proba"):
        # Column 1 corresponds to the second entry of classifier.classes_;
        # assumes a binary target whose positive label sorts last - TODO confirm.
        return classifier.predict_proba(features)[:, 1]
    if hasattr(classifier, "decision_function"):
        return classifier.decision_function(features)
    return hard_labels


def _report_split(title, y_true, predicted, scores):
    """Print/plot the diagnostics for one data split and return its metrics.

    Returns a tuple ``(auroc, recall, precision, weighted_f1)``.
    """
    from sklearn.metrics import recall_score, precision_score, roc_auc_score, f1_score

    print(title)
    print(confusion_matrix(y_true, predicted))
    draw_cm(y_true, predicted)
    print(title)
    print(classification_report(y_true, predicted))
    # The ROC curve needs continuous scores; hard labels give a single
    # operating point and collapse the curve to two line segments.
    draw_roc(y_true, scores)
    return (roc_auc_score(y_true, scores),
            recall_score(y_true, predicted),
            precision_score(y_true, predicted),
            f1_score(y_true, predicted, average = 'weighted'))


def evaluate_model_performance(name, model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training data and report train/test performance.

    For each split the confusion matrix, its heat map, the classification
    report and the ROC curve are printed/plotted.

    Parameters:
        name: label stored in the ``Model`` column of the result.
        model: estimator exposing ``fit``, ``predict`` and ``get_params``.
        x_train, y_train: data the model is fitted on.
        x_test, y_test: held-out data used for the test metrics.

    Returns:
        A one-row DataFrame with recall, weighted F1, AUROC and precision for
        the training and the test split.

    AUROC and the ROC plot use ``predict_proba`` / ``decision_function``
    scores when the estimator provides them; recall, precision and F1 are
    still computed from the hard predictions.
    """
    classifier         =  model.fit(x_train, y_train)
    # print the model parameters
    print("Model parameters: {}".format(model.get_params()))

    # Training split
    predicted_train    =  classifier.predict(x_train)
    train_scores       =  _positive_class_scores(classifier, x_train, predicted_train)
    train_auroc, train_recall, train_precision, train_f1score = _report_split(
        "\nTraining Data", y_train, predicted_train, train_scores)

    # Test split
    predicted_test     =  classifier.predict(x_test)
    test_scores        =  _positive_class_scores(classifier, x_test, predicted_test)
    test_auroc, test_recall, test_precision, test_f1score = _report_split(
        "\nTest  Data", y_test, predicted_test, test_scores)

    df_metrics    =  pd.DataFrame({'Model' : name,
                                   'Recall Training data' : train_recall, 'Recall Test data' : test_recall,
                                   'F1 Weighted Training data' : train_f1score, 'F1 Weighted Test data' : test_f1score,
                                   'AUROC Training data' : train_auroc, 'AUROC Test data' : test_auroc,
                                   'Precision Training data' : train_precision, 'Precision Test data' : test_precision},
                                  index = [0])
    return df_metrics

In [6]:
def draw_cm( actual, predicted ):
    """Plot the confusion matrix of ``actual`` vs ``predicted`` as a heat map.

    Rows are the true labels and columns the predicted labels. The classes
    are passed to ``confusion_matrix`` in descending order so that the first
    row/column lines up with the "Yes" tick and the second with "No".
    NOTE(review): this assumes the larger label (e.g. 1) means "Yes" - confirm
    against the target encoding.
    """
    import numpy             as   np
    import matplotlib.pyplot as   plt
    import sklearn.metrics   as   metrics
    import seaborn           as   sns

    tick_labels = ["Yes", "No"]
    # confusion_matrix orders classes ascending by default, which would put
    # the smaller label under the "Yes" tick; request descending order instead.
    observed = np.unique(np.concatenate([np.ravel(actual), np.ravel(predicted)]))
    cm = metrics.confusion_matrix( actual, predicted, labels = observed[::-1])
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels = tick_labels , yticklabels = tick_labels )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [7]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of ``probs`` against ``actual``.

    The legend shows the area under the curve. Returns the tuple
    ``(fpr, tpr, thresholds)`` produced by ``metrics.roc_curve``.
    """
    roc_points = metrics.roc_curve(actual, probs, drop_intermediate = False)
    false_pos_rate, true_pos_rate, cutoffs = roc_points
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(6, 4))
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.show()

    return false_pos_rate, true_pos_rate, cutoffs

In [8]:
! pip install dictdiffer



In [9]:
from dictdiffer import diff

In [10]:
# Empty frame created up front; nothing in the visible cells fills it
# (presumably populated by later hyper-parameter tuning cells - TODO confirm).
performance_hyper_df       =    pd.DataFrame()

In [11]:
# Wall-clock reference used by the elapsed-time report after the raw-data model run.
start_time   =  datetime.now()

In [12]:
'''
We need to know what changed after getting the best parameters by performing hyper parameter tuning,
'''
def  what_changed(model_name, first_dict, second_dict):
    """Return the differences between two parameter dictionaries.

    Both dictionaries are rebuilt with their keys in sorted order, compared
    with ``dictdiffer.diff`` and the differences returned as a list. The
    model name is printed as a side effect.
    """
    before = {key: first_dict[key] for key in sorted(first_dict.keys())}
    after = {key: second_dict[key] for key in sorted(second_dict.keys())}

    changes = diff(before, after)
    print("Model name {}".format(model_name))
    return list(changes)

In [13]:
def  metrics_graph(df, model_names):
    """Draw one box plot per scoring metric, grouped by model name.

    ``df`` must provide the columns ``Score``, ``Name`` and ``Results``.
    ``model_names`` is accepted but not used by the body.
    """
    print(df.columns)
    for metric_name in ('roc_auc', 'precision', 'recall', 'F1 Weighted'):
        is_metric   = df['Score'] == metric_name
        per_metric  = df.loc[is_metric, ['Name', 'Results']]
        per_metric.plot.box(column = "Results", by = "Name", figsize=(8, 6), grid=False, rot=90, fontsize = 15)
        plt.title("Model performance using " + metric_name)
        plt.show()

In [14]:
def get_default(model, X, y):
    """Fit ``model`` on ``X`` and the flattened ``y``; print and return its parameters."""
    flat_target = np.ravel(y, order="c")
    model.fit(X, flat_target)
    params = model.get_params()
    print(params)
    return params

#### Define dataframes

In [16]:
# Zero-initialised score table for the untuned models: one row per model,
# one column per metric. Built in one step instead of seven hand-copied
# frames, and without rebinding the builtin name `dict`, which would break
# any later call such as dict(...).
df17_ = pd.DataFrame({
    'Model'       : ['LR', 'KNN', 'CART', 'NB', 'RF', 'XGBoost', 'AdaBoost'],
    'roc_auc'     : 0,
    'precision'   : 0,
    'recall'      : 0,
    'F1 Weighted' : 0,
})

# Work on a copy indexed by model name so scores can be addressed as
# df_measures_untuned.at[<model>, <metric>].
df_measures_untuned  =  df17_.copy()
df_measures_untuned.set_index(['Model'], inplace = True)
print(df_measures_untuned)

          roc_auc  precision  recall  F1 Weighted
Model                                            
LR              0          0       0            0
KNN             0          0       0            0
CART            0          0       0            0
NB              0          0       0            0
RF              0          0       0            0
XGBoost         0          0       0            0
AdaBoost        0          0       0            0


In [17]:
# Zero-initialised score table for the tuned models: one row per model, one
# column per metric. Built from the name/metric lists rather than seven
# hand-copied frames, and without rebinding the builtin name `dict`.
tuned_model_labels  = ['LR', 'KNN', 'CART', 'NB', 'RF', 'XGBoost', 'AdaBoost']
tuned_metric_labels = ['roc_auc', 'precision', 'recall', 'F1 Weighted']

df7_ = pd.DataFrame(0, index = range(len(tuned_model_labels)), columns = tuned_metric_labels)
df7_.insert(0, 'Model', tuned_model_labels)

# Work on a copy indexed by model name so scores can be addressed as
# df_measures_tuned.at[<model>, <metric>].
df_measures_tuned  =  df7_.copy()
df_measures_tuned.set_index(['Model'], inplace = True)
print(df_measures_tuned)

          roc_auc  precision  recall  F1 Weighted
Model                                            
LR              0          0       0            0
KNN             0          0       0            0
CART            0          0       0            0
NB              0          0       0            0
RF              0          0       0            0
XGBoost         0          0       0            0
AdaBoost        0          0       0            0


In [18]:
# Display names and estimators, kept in the same order so that
# model_names[i] labels models[i].
model_names = ["LR", "KNN", "CART", "NB", "RF", "XGBoost", "AdaBoost"]
models = [
    LogisticRegression(max_iter = 3000),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    XGBClassifier(),
    AdaBoostClassifier(),
]

In [19]:
import os
from pathlib import Path

# Folder holding the processed feature/target CSVs. The path was hard-coded to
# one user's Desktop, which raises FileNotFoundError on any other machine.
# Set the PROCESSED_DATA_DIR environment variable to point at your own copy;
# the original location is kept as the default.
data_dir      =   Path(os.environ.get('PROCESSED_DATA_DIR', r'C:\Users\chitralekha\Desktop\Great Learning\DSE-FT-CHN-MAY24-G5-Final_Report'))
X_            =   pd.read_csv(data_dir / 'processed_data_x2024-12-03 09-53-44.csv')
y             =   pd.read_csv(data_dir / 'processed_data_y2024-12-03 09-53-44.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\chitralekha\\Desktop\\Great Learning\\DSE-FT-CHN-MAY24-G5-Final_Report\\processed_data_x2024-12-03 09-53-44.csv'

In [None]:
# Stratified, shuffled 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y,
    test_size = 0.2,
    shuffle = True,
    stratify = y,
    random_state = 100,
)

### A)  Before SMOTE

In [None]:
# Fit every model on the untreated training data and stack the one-row
# metric frames returned for each of them.
metrics_df = pd.DataFrame()
for i, (model_x, classifier) in enumerate(zip(model_names, models)):
    print("i = %d Model = %s" % (i, model_x))
    mp_df = evaluate_model_performance(model_x, classifier, X_train, y_train, X_test, y_test)
    metrics_df = pd.concat([metrics_df, mp_df])

In [None]:
# Show the models ordered by test-set recall, highest first
# (sort_values returns a new frame; metrics_df itself is not modified).
metrics_df.sort_values(by=['Recall Test data'], ascending = False)

In [None]:
# Elapsed wall-clock time since start_time was last set.
time_elapsed = datetime.now() - start_time
print('\nExecution Time for evaluating the performance of 7 models on Raw data not treated for data imbalance')
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

### Synthetic Minority Oversampling TEchnique (SMOTE)

A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary.

One way to solve this problem is to **oversample** the examples in the minority class. This can be achieved by simply duplicating examples from the minority class in the training dataset prior to fitting a model. This can balance the class distribution but does not provide any additional information to the model.

An improvement on duplicating examples from the minority class is to synthesize new examples from the minority class. This is a type of data augmentation for tabular data and can be very effective.

Perhaps the most widely used approach to synthesizing new examples is called the Synthetic Minority Oversampling TEchnique, or SMOTE for short. This technique was described by Nitesh Chawla, et al. in their 2002 paper named for the technique titled “SMOTE: Synthetic Minority Over-sampling Technique.”

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

Specifically, a random example from the minority class is first chosen. Then k of the nearest neighbors for that example are found (typically k=5). A randomly selected neighbor is chosen and a synthetic example is created at a randomly selected point between the two examples in feature space.

**Undersampling** is a technique to balance uneven datasets by keeping all of the data in the minority class and decreasing the size of the majority class. It is one of several techniques data scientists can use to extract more accurate information from originally imbalanced datasets.

Undersampling can result in the loss of relevant information by removing valuable and significant patterns.
Undersampling is appropriate when there is plenty of data for an accurate analysis. The data scientist uses all of the rare events but reduces the number of abundant events to create two equally sized classes.

We have **4062 (2.25%) observations for the minority class** and 176457 (97.75%) observations for the majority class. 
So, it was decided to go for **Oversampling** method.

### B) After SMOTE

In [None]:
# Restart the timer so the next elapsed-time report covers only the SMOTE run.
start_time   =  datetime.now()

To apply SMOTE, we split the data into training and test datasets in an 80%:20% ratio and applied SMOTE only to the training data.

Because SMOTE must be applied only to the training data, we did not use K-fold cross-validation here.

In [None]:
from imblearn.over_sampling import SMOTE

# transform the dataset
# Seeded like the train/test split (random_state = 100) so the synthetic
# minority samples, and every metric computed from them, are reproducible
# from run to run.
oversample = SMOTE(random_state = 100)

In [None]:
# Resample the training split only; X_test / y_test are not passed to SMOTE.
X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)

In [None]:
# Fit every model on the SMOTE-resampled training data, score it on the
# original test split, and stack the one-row metric frames.
SMOTE_metrics_df = pd.DataFrame()
for i, (model_x, classifier) in enumerate(zip(model_names, models)):
    print("i = %d Model = %s" % (i, model_x))
    smote_df = evaluate_model_performance(model_x, classifier, X_train_SMOTE, y_train_SMOTE, X_test, y_test)
    SMOTE_metrics_df = pd.concat([SMOTE_metrics_df, smote_df])

In [None]:
# Show the SMOTE-trained models ordered by test-set recall, highest first
# (sort_values returns a new frame; SMOTE_metrics_df itself is not modified).
SMOTE_metrics_df.sort_values(by=['Recall Test data'], ascending = False)

In [None]:
# Elapsed wall-clock time since start_time was last set.
time_elapsed = datetime.now() - start_time
# The model count is taken from model_names; the message used to say "8 models"
# although only seven are evaluated.
print('\nExecution Time for evaluating the performance of {} models on the data treated for data imbalance'.format(len(model_names)))
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

### Observations

Without applying SMOTE to treat the data imbalance, all the models perform well, with recall values of 99% or above on both the training and test datasets.

After applying SMOTE to treat the data imbalance, all the models perform well, with recall values of 99% or above on both the training and test datasets. Recall values of all the models are 100%.

### KFold cross validation

In [None]:
def evaluate_model_performance_KF(name, model, X, y):
    """Score ``model`` with stratified 10-fold cross-validation.

    Four measures are evaluated: ``roc_auc``, ``precision``, ``recall`` and
    weighted F1. For each one the mean and standard deviation over the folds
    are printed, and the rounded mean is written to the module-level
    ``df_measures_tuned`` table at ``[name, measure]``.

    Parameters:
        name: model label, used as row key and in the ``Name`` column.
        model: estimator to evaluate; it is also fitted on the full data.
        X: feature matrix.
        y: target; flattened with ``np.ravel`` before use.

    Returns:
        DataFrame indexed by ``(Name, Score)`` holding the mean and standard
        deviation of the per-fold ``Results``.

    NOTE(review): the visible caller passes the default (untuned) estimators,
    yet the scores go into ``df_measures_tuned`` - confirm that this is the
    intended table.
    """
    from  sklearn.model_selection import  StratifiedKFold
    from  sklearn.model_selection import  cross_val_score

    y_flat       =  np.ravel(y, order="c")
    # cross_val_score is handed `model` itself, so this fit does not feed the
    # CV scores; it is kept so `model` is fitted on the full data on return.
    model.fit(X, y_flat)

    scores_req   =  ['roc_auc', 'precision', 'recall', 'F1 Weighted']
    # Same seeded splitter for every measure, so all of them see identical folds.
    skf          =  StratifiedKFold(n_splits = 10, shuffle = True, random_state = 12345)
    fold_frames  =  []
    for scoring in scores_req:
        print("\nPerformance Measure : %s" %scoring)

        # 'F1 Weighted' is a display label, not a scorer name, so it needs an
        # explicit scorer; the other labels are passed through as strings.
        if scoring == 'F1 Weighted':
            score_fn  =  make_scorer (f1_score, average = 'weighted', zero_division = 0)
        else:
            score_fn  =  scoring

        cv_results    =   cross_val_score(model, X, y_flat, cv = skf, scoring = score_fn)
        mean_score    =   round(cv_results.mean(),2)
        msg = "%s" % (str(mean_score) + " ± " + str(round(cv_results.std(),2)))
        print(msg)

        # The table is created with integer zeros; cast the column to float
        # first so storing a fractional mean is not an incompatible-dtype
        # assignment (recent pandas warns about it and is slated to reject it).
        df_measures_tuned[scoring] = df_measures_tuned[scoring].astype(float)
        df_measures_tuned.at[name, scoring] = mean_score

        # One row per fold, numbered from 1; the length follows cv_results
        # rather than a hard-coded 10.
        fold_ids      =   range(1, len(cv_results) + 1)
        fold_frames.append(pd.DataFrame({'Score' : scoring, 'Name' : name, 'Results' : cv_results}, index = fold_ids))

    df_model      = pd.concat(fold_frames)
    # String aggregators replace the deprecated np.mean / np.std callables.
    xdf           = df_model.groupby(["Name", "Score"]).agg(['mean', 'std'])
    return xdf

In [None]:
# Accumulator for the per-model cross-validation summaries built in the next cell.
Kfold_metrics_df  =  pd.DataFrame()

In [None]:
# Cross-validate every model on the full data set (X_, y) and stack the
# per-model summaries.
for i, (model_x, classifier) in enumerate(zip(model_names, models)):
    print("i = %d Model = %s" % (i, model_x))
    KFold_df = evaluate_model_performance_KF(model_x, classifier, X_, y)
    Kfold_metrics_df = pd.concat([Kfold_metrics_df, KFold_df])

In [None]:
# Display the stacked cross-validation summaries.
Kfold_metrics_df

### We need to choose the best model. 

### Do we need to tune the hyperparameters of all the models and compare them again?