In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
#from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def preprocess_data(csv_path='data/output_1.csv', test_size=1/5, random_state=0):
    """
    Load the dataset, drop the unused columns and split it into train/test sets.

    Parameters
    ----------
    csv_path : str or path-like, default 'data/output_1.csv'
        CSV file to read.
    test_size : float, default 1/5
        Fraction of the rows held out for the test set.
    random_state : int, default 0
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames (the target column ``is_constriction`` removed).
    y_train, y_test : numpy.ndarray
        Target values taken from ``is_constriction``.
    num_attribs : list of str
        Names of the numeric feature columns.
    cat_attribs : list of str
        Names of the categorical feature columns.
    """
    target = 'is_constriction'
    unused_columns = ['coil', 'furnace Number', 'Temperature before finishing mill',
                      'Temperature after finishing mill', 'Thickness profile',
                      'Constriction_width']

    # Non-mutating drop: re-running the cell always starts from the file contents.
    df = pd.read_csv(csv_path).drop(columns=unused_columns)

    cat_attribs = ['analyse']

    # The target must never be a feature. Categorical columns are excluded too,
    # so a numerically-coded category is not both scaled and one-hot encoded.
    # ('coil' needs no exclusion here: it was already dropped above.)
    excluded = {target, *cat_attribs}
    num_attribs = [col for col in df.select_dtypes(include=np.number).columns
                   if col not in excluded]

    y = df[target].copy().to_numpy()
    X = df.drop(columns=target)

    # Stratify on y so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    return X_train, X_test, y_train, y_test, num_attribs, cat_attribs
    

In [4]:
def build_model(model, num_attribs, cat_attribs):
    """
    Wrap ``model`` in a pipeline that preprocesses the features first.

    Numeric columns are standardised with ``StandardScaler``; categorical
    columns are one-hot encoded into a dense array, with categories unseen
    during fit encoded as all zeros (``handle_unknown='ignore'``).

    Parameters
    ----------
    model : estimator
        The final estimator placed at the end of the pipeline.
    num_attribs : list of str
        Column names routed to the numeric pipeline.
    cat_attribs : list of str
        Column names routed to the categorical pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'prepocessors'`` and ``'classifier_model'``.
    """
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword entirely. Try the current name first and fall back so the
    # notebook keeps working on older installations.
    try:
        one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    num_tr_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

    cat_tr_pipeline = Pipeline([
        ('one_hot_encoder', one_hot_encoder),
    ])

    preprocessors = ColumnTransformer([
        ("num_tr_pipeline", num_tr_pipeline, num_attribs),
        ("cat_tr_pipeline", cat_tr_pipeline, cat_attribs),
    ])

    # NOTE: the step name 'prepocessors' (sic) is kept as-is because step names
    # are part of the pipeline's public interface (named_steps, param grids).
    pipe = Pipeline([
        ('prepocessors', preprocessors),
        ('classifier_model', model),
    ])

    return pipe

In [5]:
def evaluate_models():
    """
    Fit several classifiers on the training split and score them on the test split.

    Each candidate model is wrapped with ``build_model`` (shared preprocessing),
    fitted on the data returned by ``preprocess_data`` and evaluated on the
    held-out test set.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Score`` (test accuracy),
        ``Confusion_matrix`` and ``F1_score``.
    """
    X_train, X_test, y_train, y_test, num_attribs, cat_attribs = preprocess_data()

    models = {
        "Logestic_Regression": LogisticRegression(solver='liblinear'),
        # Seeded so repeated runs give the same forest (and the same metrics).
        "Random_Forest": RandomForestClassifier(n_estimators=200, random_state=0),
        # ``p`` only applies to the Minkowski metric; with metric='euclidean' the
        # former ``p=3`` was ignored, so it is dropped to avoid a misleading setting.
        "knn": KNeighborsClassifier(n_neighbors=5, metric='euclidean'),
        'SVM': SVC(kernel='linear'),
    }
    result = []

    for model_name, model in models.items():
        print(f"{model_name} is created")
        pipe = build_model(model, num_attribs, cat_attribs)
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # Same value as pipe.score(X_test, y_test) for a classifier, but reuses
        # y_pred instead of running a second prediction pass over the test set.
        score = accuracy_score(y_test, y_pred)
        cm = metrics.confusion_matrix(y_test, y_pred)  # confusion matrix
        f1 = f1_score(y_test, y_pred)

        result.append({"Model": model_name, "Score": score,
                       'Confusion_matrix': cm, 'F1_score': f1})

    result_df = pd.DataFrame(result)

    return result_df
        
        
    

In [6]:
def start_modeling():
    """Run the model comparison and print the first rows of the result table."""
    summary = evaluate_models()
    print(summary.head())


start_modeling()
    

Logestic_Regression is created
Random_Forest is created
knn is created
SVM is created
                 Model   Score         Confusion_matrix  F1_score
0  Logestic_Regression  0.9285  [[1845, 10], [133, 12]]  0.143713
1        Random_Forest  0.9295  [[1816, 39], [102, 43]]  0.378855
2                  knn  0.9340   [[1822, 33], [99, 46]]  0.410714
3                  SVM  0.9305    [[1852, 3], [136, 9]]  0.114650
