In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data = pd.read_csv('../data/WineQT_raw.csv')
Xtrain,Xrest,ytrain,yrest = train_test_split(data.drop('quality',axis=1),data['quality'],test_size=0.4,random_state=42)
Xval,Xtest,yval,ytest = train_test_split(Xrest,yrest,test_size=0.5,random_state=42)

In [2]:
display(Xtrain.info())
display(Xtrain.head())

<class 'pandas.core.frame.DataFrame'>
Index: 685 entries, 1137 to 1126
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         685 non-null    float64
 1   volatile acidity      685 non-null    float64
 2   citric acid           685 non-null    float64
 3   residual sugar        685 non-null    float64
 4   chlorides             685 non-null    float64
 5   free sulfur dioxide   685 non-null    float64
 6   total sulfur dioxide  685 non-null    float64
 7   density               685 non-null    float64
 8   pH                    685 non-null    float64
 9   sulphates             685 non-null    float64
 10  alcohol               685 non-null    float64
 11  Id                    685 non-null    int64  
dtypes: float64(11), int64(1)
memory usage: 69.6 KB


None

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Id
1137,5.4,0.74,0.09,1.7,0.089,16.0,26.0,0.99402,3.67,0.56,11.6,1591
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
477,8.2,0.73,0.21,1.7,0.074,5.0,13.0,0.9968,3.2,0.52,9.5,673
155,7.5,0.49,0.19,1.9,0.076,10.0,44.0,0.9957,3.39,0.54,9.7,218
895,7.2,0.57,0.05,2.3,0.081,16.0,36.0,0.99564,3.38,0.6,10.3,1266


In [3]:
def data_preprocessing(data : pd.DataFrame):
    for column in data.columns:
        if column == 'quality':
            data[column] = data[column].fillna(data[column].mean())
        else:   
            data[column] = data[column].fillna(data[column].mean())
    return data

Xtrain = data_preprocessing(Xtrain)
Xval = data_preprocessing(Xval)
Xtest = data_preprocessing(Xtest)
data = data_preprocessing(data)


In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

standart_scaler = StandardScaler()
normal_scaler = MinMaxScaler()

Xtrain_standart = pd.DataFrame(standart_scaler.fit_transform(Xtrain), columns=Xtrain.columns)
Xval_standart = pd.DataFrame(standart_scaler.transform(Xval), columns=Xval.columns)
Xtest_standart = pd.DataFrame(standart_scaler.transform(Xtest), columns=Xtest.columns)

Xtrain_normal = pd.DataFrame(normal_scaler.fit_transform(Xtrain), columns=Xtrain.columns)
Xval_normal = pd.DataFrame(normal_scaler.transform(Xval), columns=Xval.columns)
Xtest_normal = pd.DataFrame(normal_scaler.transform(Xtest), columns=Xtest.columns)

standart_data = pd.DataFrame(standart_scaler.fit_transform(data), columns=data.columns)
normal_data = pd.DataFrame(normal_scaler.fit_transform(data), columns=data.columns)

standart_data.to_csv('../data/WineQT_standart.csv', index=False)
normal_data.to_csv('../data/WineQT_normal.csv', index=False)

In [5]:
import sys
sys.path.append('../')
from sklearn.model_selection import ParameterGrid
from src.logistic_regression import LogisticRegression
from sklearn.linear_model import LogisticRegression as sklearn_LogisticRegression
from src.knn import KNN
from sklearn.neighbors import KNeighborsClassifier as sklearn_KNeighborsClassifier
from src.tree_classifier import TreeClassifier
from sklearn.tree import DecisionTreeClassifier as sklearn_DecisionTreeClassifier
def model_tuning(model_name,Xdata,Xvaldata,sklearn_model):
    if model_name == 'LogisticRegression':
        if sklearn_model == False:
            parameters = {
                'learning_rate': [0.001, 0.005, 0.01],
                'n_iters': [100,200,300],
                'fit_intercept': [True, False]
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = LogisticRegression(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in non-sklearn LogisticRegression: {best_score}")
            print(f"Best params: {best_params}")
        else:
            parameters = {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
                'max_iter': [100,200,300]
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = sklearn_LogisticRegression(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in sklearn LogisticRegression: {best_score}")
            print(f"Best params: {best_params}")
    elif model_name == 'KNN':
        if sklearn_model == False:
            parameters = {
                'n_neighbors': [3, 5, 7, 9],
                'p': [1, 2],
                'weights': ['uniform', 'distance'],
                'task_class': ['c']
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = KNN(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in non-sklearn KNN: {best_score}")
            print(f"Best params: {best_params}")
        else:
            parameters = {
                'n_neighbors': [3, 5, 7, 9],
                'p': [1, 2],
                'weights': ['uniform', 'distance'],
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = sklearn_KNeighborsClassifier(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in sklearn KNN: {best_score}")
            print(f"Best params: {best_params}")
    else:
        if sklearn_model == False:
            parameters = {
                'max_depth': range(2,10),
                'min_samples_split': [2, 3, 4,5],
                'criterion': ['gini', 'entropy']
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = TreeClassifier(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in non-sklearn TreeClassifier: {best_score}")
            print(f"Best params: {best_params}")
        else:
            parameters = {
                'max_depth': range(2,10),
                'min_samples_split': [2, 3, 4,5],
                'criterion': ['gini', 'entropy']
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = sklearn_DecisionTreeClassifier(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in sklearn TreeClassifier: {best_score}")
            print(f"Best params: {best_params}")


In [6]:
model_tuning('KNN',Xtrain_standart,Xval_standart,True)
model_tuning('KNN',Xtrain_normal,Xval_normal,True)
model_tuning('KNN',Xtrain_standart,Xval_standart,False)
model_tuning('KNN',Xtrain_normal,Xval_normal,False)

Best score  for all variants in sklearn KNN: 0.6637554585152838
Best params: {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
Best score  for all variants in sklearn KNN: 0.6506550218340611
Best params: {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
Best score  for all variants in non-sklearn KNN: 0.6200873362445415
Best params: {'n_neighbors': 9, 'p': 2, 'task_class': 'c', 'weights': 'uniform'}
Best score  for all variants in non-sklearn KNN: 0.5938864628820961
Best params: {'n_neighbors': 9, 'p': 2, 'task_class': 'c', 'weights': 'uniform'}


In [6]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
model_tuning('LogisticRegression',Xtrain_standart,Xval_standart,True)
model_tuning('LogisticRegression',Xtrain_normal,Xval_normal,True)
model_tuning('LogisticRegression',Xtrain_standart,Xval_standart,False)
model_tuning('LogisticRegression',Xtrain_normal,Xval_normal,False)

Best score  for all variants in sklearn LogisticRegression: 0.6375545851528385
Best params: {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'}
Best score  for all variants in sklearn LogisticRegression: 0.6375545851528385
Best params: {'C': 100, 'max_iter': 100, 'solver': 'lbfgs'}
Best score  for all variants in non-sklearn LogisticRegression: 0.5982532751091703
Best params: {'fit_intercept': True, 'learning_rate': 0.005, 'n_iters': 200}
Best score  for all variants in non-sklearn LogisticRegression: 0.47161572052401746
Best params: {'fit_intercept': False, 'learning_rate': 0.01, 'n_iters': 300}


In [6]:
model_tuning('ClassificationTree',Xtrain_standart,Xval_standart,True)
model_tuning('ClassificationTree',Xtrain_normal,Xval_normal,True)
model_tuning('ClassificationTree',Xtrain_standart,Xval_standart,False)
model_tuning('ClassificationTree',Xtrain_normal,Xval_normal,False)

Best score  for all variants in sklearn TreeClassifier: 0.6200873362445415
Best params: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 4}
Best score  for all variants in sklearn TreeClassifier: 0.6157205240174672
Best params: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 3}
Best score  for all variants in non-sklearn TreeClassifier: 0.6200873362445415
Best params: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 4}
Best score  for all variants in non-sklearn TreeClassifier: 0.6200873362445415
Best params: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 4}


In [7]:
best_params_sklearn_lr = {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'}
best_params_custom_lr = {'learning_rate': 0.01, 'n_iters': 200, 'fit_intercept': True}

best_params_sklearn_knn = {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
best_params_custom_knn = {'n_neighbors': 9, 'p': 2, 'weights': 'uniform', 'task_class': 'c'}

best_params_sklearn_tree = {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 4}
best_params_custom_tree = {'criterion': 'entropy', 'max_depth': 2, 'min_samples_split': 2}

sklearn_lr = sklearn_LogisticRegression(**best_params_sklearn_lr)
sklearn_lr.fit(Xtrain_standart, ytrain)
sklearn_lr_test_score = sklearn_lr.score(Xtest_standart, ytest)

custom_lr = LogisticRegression(**best_params_custom_lr)
custom_lr.fit(Xtrain_standart, ytrain)
custom_lr_test_score = custom_lr.score(Xtest_standart, ytest)

print(f'Final test accuracy (sklearn LogisticRegression): {sklearn_lr_test_score:.4f} | Params: {best_params_sklearn_lr}')
print(f'Final test accuracy (custom LogisticRegression): {custom_lr_test_score:.4f} | Params: {best_params_custom_lr}')

sklearn_knn = sklearn_KNeighborsClassifier(**best_params_sklearn_knn)
sklearn_knn.fit(Xtrain_standart, ytrain)
sklearn_knn_test_score = sklearn_knn.score(Xtest_standart, ytest)

custom_knn = KNN(**best_params_custom_knn)
custom_knn.fit(Xtrain_standart, ytrain)
custom_knn_test_score = custom_knn.score(Xtest_standart, ytest)

print(f'Final test accuracy (sklearn KNN): {sklearn_knn_test_score:.4f} | Params: {best_params_sklearn_knn}')
print(f'Final test accuracy (custom KNN): {custom_knn_test_score:.4f} | Params: {best_params_custom_knn}')

sklearn_tree = sklearn_DecisionTreeClassifier(**best_params_sklearn_tree)
sklearn_tree.fit(Xtrain_standart, ytrain)
sklearn_tree_test_score = sklearn_tree.score(Xtest_standart, ytest)

custom_tree = TreeClassifier(**best_params_custom_tree)
custom_tree.fit(Xtrain_standart, ytrain)
custom_tree_test_score = custom_tree.score(Xtest_standart, ytest)

print(f'Final test accuracy (sklearn TreeClassifier): {sklearn_tree_test_score:.4f} | Params: {best_params_sklearn_tree}')
print(f'Final test accuracy (custom TreeClassifier): {custom_tree_test_score:.4f} | Params: {best_params_custom_tree}')

Final test accuracy (sklearn LogisticRegression): 0.5677 | Params: {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'}
Final test accuracy (custom LogisticRegression): 0.5328 | Params: {'learning_rate': 0.01, 'n_iters': 200, 'fit_intercept': True}
Final test accuracy (sklearn KNN): 0.6157 | Params: {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
Final test accuracy (custom KNN): 0.5415 | Params: {'n_neighbors': 9, 'p': 2, 'weights': 'uniform', 'task_class': 'c'}
Final test accuracy (sklearn TreeClassifier): 0.5459 | Params: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 4}
Final test accuracy (custom TreeClassifier): 0.5284 | Params: {'criterion': 'entropy', 'max_depth': 2, 'min_samples_split': 2}


In [None]:
import os
import joblib

os.makedirs('../models/classification', exist_ok=True)

joblib.dump(sklearn_lr, '../models/classification/sklearn_logistic_regression.joblib')
print('Saved sklearn LogisticRegression to models/classification/sklearn_logistic_regression.joblib')

joblib.dump(custom_lr, '../models/classification/custom_logistic_regression.joblib')
print('Saved custom LogisticRegression to models/classification/custom_logistic_regression.joblib')

joblib.dump(sklearn_knn, '../models/classification/sklearn_knn.joblib')
print('Saved sklearn KNN to models/classification/sklearn_knn.joblib')

joblib.dump(custom_knn, '../models/classification/custom_knn.joblib')
print('Saved custom KNN to models/classification/custom_knn.joblib')

joblib.dump(sklearn_tree, '../models/classification/sklearn_tree_classifier.joblib')
print('Saved sklearn TreeClassifier to models/classification/sklearn_tree_classifier.joblib')


joblib.dump(custom_tree, '../models/classification/custom_tree_classifier.joblib')
print('Saved custom TreeClassifier to models/classification/custom_tree_classifier.joblib')