# Model Training

### *Required modules* 

In [1]:
import warnings
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import r2_score, accuracy_score

In [2]:
warnings.filterwarnings("ignore")

### *Loading dataset from source* 

In [3]:
df = pd.read_csv('E:/Projects/red_wine/Data/Raw/winequality-red.csv')

### *Droping duplicate rows* 

In [4]:
df.drop_duplicates(inplace=True, ignore_index = True)

### *Performing oversampling for handling imbalanced dataset* 

In [5]:
X = df.drop(['quality'], axis=1)
y = df['quality']

In [6]:
over_sampler = RandomOverSampler(random_state = 42)
X_resampled, y_resampled = over_sampler.fit_resample(X, y)

### *Feature selection* 

In [7]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [8]:
corr_features = correlation(X_resampled, 0.7)
len(set(corr_features))

1

In [9]:
X_resampled = X_resampled.drop(['citric acid'], axis=1)

In [10]:
y_resampled = y_resampled.apply(lambda x: 0 if x>7 else 1)

### *Dividing dataset into X_train, X_test, y_train & y_test* 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

### *Pipeline creation*

#### *Pipeline for numeric_features transformation* 

In [12]:
numeric_features = list(X_resampled.columns)
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

#### *Creating a column_transformar preprocessor for transformation of numeric & categorical features* 

In [13]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)])

#### *Integrating preprocessor transformer and different different models into one pipeline for model creation and trying to train this pipeline on X_train and y_train data* 

In [14]:
pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor)  
])
X_train = pipeline.fit_transform(X_train)


### *Selection of best model with grid searchcv*

#### *classifier list for different different classifier models* 

In [26]:
models = {
    "Logistic Classifier":LogisticRegression(), 
    "Random Forest Classifier":RandomForestClassifier(), 
    "Support Vector Classifier":SVC(), 
    "K Neighbors Classifier":KNeighborsClassifier(), 
    "XGB Classifier":XGBClassifier()
}

In [27]:
params = {
    "Logistic Classifier":{
        'tol':[1e-2, 1e-3, 1e-4, 1e-5],
        'C':[0.5, 0.75, 1, 1.5, 2],
        'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    },
    "Random Forest Classifier":{
        'n_estimators':[90, 100, 110, 120],
        'criterion':['gini', 'entropy', 'log_loss'],
        'max_features':['sqrt', 'log2', None]
    },
    "Support Vector Classifier":{
        'C':[1,1.5,2,2.5,3],
        'gamma':['scale', 'auto'],
    },
    "K Neighbors Classifier":{
        'n_neighbors':[4,5,6,7],
        'weights':['uniform', 'distance'],
        'p':[1,2],
    },
    "XGB Classifier":{
        'n_estimators':[2,3,4,5],
        'learning_rate':[0.5,1,1.5,2],
    }
}

In [28]:
accuracy_report = {}
params_report = {}

In [29]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    param = params[list(models.keys())[i]]
    
    gs = GridSearchCV(model, param, cv=3)
    gs.fit(X_train, y_train)
    
    model.set_params(**gs.best_params_)
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_model_score = accuracy_score(y_train, y_train_pred)
    test_model_score = accuracy_score(y_test, y_test_pred)
    
    accuracy_report[list(models.keys())[i]] = test_model_score
    params_report[list(models.keys())[i]] = gs.best_params_

In [30]:
print(accuracy_report,"\n\n\n",params_report)

{'Logistic Classifier': 0.8152958152958153, 'Random Forest Classifier': 0.8412698412698413, 'Support Vector Classifier': 0.8412698412698413, 'K Neighbors Classifier': 0.8412698412698413, 'XGB Classifier': 0.797979797979798} 


 {'Logistic Classifier': {'C': 0.75, 'solver': 'sag', 'tol': 0.01}, 'Random Forest Classifier': {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100}, 'Support Vector Classifier': {'C': 3, 'gamma': 'scale'}, 'K Neighbors Classifier': {'n_neighbors': 4, 'p': 2, 'weights': 'distance'}, 'XGB Classifier': {'learning_rate': 2, 'n_estimators': 5}}


### *Model training with selected model and parameters*

In [31]:
best_accuracy = max(sorted(accuracy_report.values()))
best_model_name = list(accuracy_report.keys())[
    list(accuracy_report.values()).index(best_accuracy)
]   
model = models[best_model_name]
params = params_report[best_model_name]

model.set_params(**params)
model.fit(X_train, y_train)