In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator


pd.set_option("display.max_columns",None)



In [2]:
train = pd.read_csv("../1_dataset/train.csv")
test = pd.read_csv("../1_dataset/test.csv")
submission = pd.read_csv("../1_dataset/sample_submission.csv")

In [7]:
train.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,b,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,5448.79,0.09,302.71,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,936.71,0.05,52.04,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,1754.01,0.07,97.45,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,473.66,0.03,26.31,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,365.67,0.02,20.31,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False


In [12]:


# Dropping the 'id' column
data = train.drop(columns=['id'])

# Checking for missing values and imputing with mean for numerical columns
data.fillna(data.mean(), inplace=True)

# Splitting the target variable 'defects' from the features
X = data.drop(columns=['defects'])
y = data['defects']

# Splitting the dataset into training and validation sets
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_validation_scaled = pd.DataFrame(X_validation_scaled, columns=X_validation.columns)

In [13]:

# Function to train a model and make predictions
def model_predict(model: BaseEstimator, X_train: np.ndarray, y_train: np.ndarray, 
                  X_validation: np.ndarray, y_validation: np.ndarray):
    """
    Train the model and make predictions on both training and validation datasets.

    Parameters:
    model (BaseEstimator): An instance of a scikit-learn model.
    X_train (np.ndarray): Training data features.
    y_train (np.ndarray): Training data target variable.
    X_validation (np.ndarray): Validation data features.
    y_validation (np.ndarray): Validation data target variable.

    Returns:
    tuple: Predictions for training and validation datasets.
    """
    # Fitting the model
    model.fit(X_train, y_train)

    # Making predictions
    predictions_train = model.predict(X_train)
    predictions_validation = model.predict(X_validation)

    return predictions_train, predictions_validation

# Function to calculate the ROC AUC score
def calculate_roc_auc(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Calculate the ROC AUC score.

    Parameters:
    y_true (np.ndarray): The true target values.
    y_pred (np.ndarray): The predicted target values.

    Returns:
    float: The ROC AUC score.
    """
    return roc_auc_score(y_true, y_pred)



In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# List of models to be evaluated
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(probability=True),  # SVC needs probability=True for ROC AUC
    KNeighborsClassifier()
]


# Reinitializing the summary table
summary_table = pd.DataFrame(columns=['Model Name', 'ROC AUC on Training Set', 'ROC AUC on Validation Set'])

# Evaluating each model
for model in models:
    # Train the model and make predictions
    predictions_train, predictions_validation = model_predict(model, X_train_scaled, y_train, X_validation_scaled, y_validation)
    
    # Calculate ROC AUC scores
    roc_auc_train = calculate_roc_auc(y_train, predictions_train)
    roc_auc_validation = calculate_roc_auc(y_validation, predictions_validation)
    
    # Append the results to the summary table using pandas.concat
    summary_table = pd.concat([
        summary_table, 
        pd.DataFrame({
            'Model Name': [model.__class__.__name__],
            'ROC AUC on Training Set': [roc_auc_train],
            'ROC AUC on Validation Set': [roc_auc_validation]
        })
    ], ignore_index=True)

summary_table



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,Model Name,ROC AUC on Training Set,ROC AUC on Validation Set
0,LogisticRegression,0.619305,0.617055
1,DecisionTreeClassifier,1.0,0.608308
2,RandomForestClassifier,1.0,0.653693
3,GradientBoostingClassifier,0.670303,0.664816
4,SVC,0.652664,0.644003
5,KNeighborsClassifier,0.725084,0.643263


In [16]:
summary_table


Unnamed: 0,Model Name,ROC AUC on Training Set,ROC AUC on Validation Set
0,LogisticRegression,0.619305,0.617055
1,DecisionTreeClassifier,1.0,0.608308
2,RandomForestClassifier,1.0,0.653693
3,GradientBoostingClassifier,0.670303,0.664816
4,SVC,0.652664,0.644003
5,KNeighborsClassifier,0.725084,0.643263
