In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def preprocess_data():
    """
    This function clean the dataframe
    return the splitted train and test sets, 
    return the list of the numeric and categrial columns
    """
    
    df = pd.read_csv('data/output_1.csv')
    df.drop(['coil','furnace Number','Temperature before finishing mill',
             'Temperature after finishing mill','Thickness profile','Constriction_width'],axis=1, inplace = True)
    
    return df
    

In [5]:
def balance_sample_up():
    df =preprocess_data()
    # Separate majority and minority classes
    df_majority = df[df.is_constriction == 0]
    df_minority = df[df.is_constriction == 1]
    
 
    #Upsample minority class
    df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples = 18845,    # to match majority class
                                 random_state =123) # reproducible results
 
    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
    # Display new class counts
    print("Up sampled:", df_upsampled.is_constriction.value_counts())
    

    y = df_upsampled.is_constriction
    X = df_upsampled.drop('is_constriction', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=1/5,random_state=0) 
    return X_train, X_test, y_train, y_test, df_upsampled





In [6]:
def balance_sample_down():
    df =preprocess_data()
    # Separate majority and minority classes
    df_majority = df[df.is_constriction == 0]
    df_minority = df[df.is_constriction == 1]
    
 
    #downsample majority class
    df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples = 1725,    # to match majority class
                                 random_state =123) # reproducible results
 
    # Combine majority class with upsampled minority class
    df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
    # Display new class counts
    print("Down sampled: ", df_downsampled.is_constriction.value_counts())

    y = df_downsampled.is_constriction
    X = df_downsampled.drop('is_constriction', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=1/5,random_state=0) 

    return X_train, X_test, y_train, y_test, df_downsampled





In [7]:
def build_model(model,df):

    num_attribs = df.select_dtypes(include=np.number).columns.tolist() 
    remove_attribs =['coil','is_constriction']
    num_attribs = [i for i in num_attribs if i not in remove_attribs]
    cat_attribs = ['analyse']
    
    num_tr_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),])
    
    cat_tr_pipeline = Pipeline([
        ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),])
    preprocessors = ColumnTransformer([
        ("num_tr_pipeline", num_tr_pipeline, num_attribs),
        ("cat_tr_pipeline", cat_tr_pipeline, cat_attribs),])
    
    pipe =Pipeline([
    ('prepocessors',preprocessors),
    ('classifier_model',model),])
    
    return pipe

In [8]:
def evaluate_models(balance_method):

    result = [] 

    if balance_method == 'up_sampling':
        print('Model with up sampled minority class')
        X_train, X_test, y_train, y_test,df = balance_sample_up()
    else:
        print("Model with down sampled majority class")
        X_train, X_test, y_train, y_test,df = balance_sample_down()

    
    models = {"Logestic_Regression": LogisticRegression(solver = 'liblinear'),
              "Random_Forest": RandomForestClassifier(n_estimators=200),
              "Random_Forest_grid": RandomForestClassifier( max_depth=8, max_features= 1.0, n_estimators= 10),
              "knn": KNeighborsClassifier(n_neighbors=5,p=3,metric ='euclidean'),
              'SVM': SVC(kernel='linear'),
              "Quadratic Discriminant": QuadraticDiscriminantAnalysis(reg_param = 0.9)
               }
    
    
    for model_name, model in models.items():
        print(f"{model_name} is built")
        pipe = build_model(model, df)
        pipe.fit(X_train,y_train)
        y_pred = pipe.predict(X_test)
        score = pipe.score(X_test, y_test)
        cm = metrics.confusion_matrix(y_test, y_pred) # confusion matrix
        f1 = f1_score(y_test, y_pred)
        f2 = fbeta_score(y_test, y_pred, beta=2, average='binary')
        f_beta = fbeta_score(y_test, y_pred, average='macro', beta=0.5)

        result.append({"Model":model_name, "Score":score, 'Confusion_matrix':cm, 
                       'F1_score':f1, 'F2_score':f2, 'Fbeta_score':f_beta})
        
    result_df = pd.DataFrame(result)
    
    return result_df
        
        
    

In [9]:

test1_matrix = evaluate_models('up_sampling')
test1_matrix.head()
    

Model with up sampled minority class
Up sampled: 0    18845
1    18845
Name: is_constriction, dtype: int64
Logestic_Regression is built
Random_Forest is built
Random_Forest_grid is built
knn is built
SVM is built
Quadratic Discriminant is built


Unnamed: 0,Model,Score,Confusion_matrix,F1_score,F2_score,Fbeta_score
0,Logestic_Regression,0.786415,"[[2899, 796], [814, 3029]]",0.790037,0.788925,0.786339
1,Random_Forest,0.984081,"[[3575, 120], [0, 3843]]",0.984627,0.993794,0.98448
2,Random_Forest_grid,0.869329,"[[3330, 365], [620, 3223]]",0.867447,0.849947,0.869874
3,knn,0.945344,"[[3293, 402], [10, 3833]]",0.948997,0.977457,0.948002
4,SVM,0.791324,"[[2878, 817], [756, 3087]]",0.796954,0.800737,0.791261


In [10]:
test2_matrix = evaluate_models('down_sampling')
test2_matrix.head()

Model with down sampled majority class
Down sampled:  0    1725
1    1725
Name: is_constriction, dtype: int64
Logestic_Regression is built
Random_Forest is built
Random_Forest_grid is built
knn is built
SVM is built
Quadratic Discriminant is built


Unnamed: 0,Model,Score,Confusion_matrix,F1_score,F2_score,Fbeta_score
0,Logestic_Regression,0.772464,"[[272, 84], [73, 261]]",0.768778,0.776324,0.772394
1,Random_Forest,0.847826,"[[302, 54], [51, 283]]",0.843517,0.845786,0.847666
2,Random_Forest_grid,0.82029,"[[297, 59], [65, 269]]",0.812689,0.808293,0.820152
3,knn,0.811594,"[[295, 61], [69, 265]]",0.80303,0.797232,0.811465
4,SVM,0.766667,"[[271, 85], [76, 258]]",0.762186,0.768314,0.766552


In [11]:
test2_matrix.to_csv('data/csv_data/model_evaluation.csv', index = False, header = True )