# Model Training

### *Required modules* 

In [20]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('E:/Projects/red_wine/Data/Raw/winequality-red.csv')

In [3]:
df.drop_duplicates(inplace=True, ignore_index = True)

In [4]:
X = df.drop(['quality'], axis=1)
y = df['quality']

In [5]:
over_sampler = RandomOverSampler(random_state = 42)
X_resampled, y_resampled = over_sampler.fit_resample(X, y)

In [6]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [7]:
corr_features = correlation(X_resampled, 0.7)
len(set(corr_features))

1

In [8]:
X_resampled = X_resampled.drop(['citric acid'], axis=1)

In [11]:
y_resampled = y_resampled.apply(lambda x: 0 if x>7 else 1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

In [16]:
# pipeline = Pipeline(
#     steps=[
#         ('preprocessor_name', preprocessor)
#         ,('model_name', model())]
# )

In [19]:
numeric_features = list(X_resampled.columns)
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

In [21]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)])

In [23]:
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',RandomForestClassifier())
           ])

In [24]:
rf_model = pipeline.fit(X_train, y_train)
print (rf_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['fixed acidity',
                                                   'volatile acidity',
                                                   'residual sugar',
                                                   'chlorides',
                                                   'free sulfur dioxide',
                                                   'total sulfur dioxide',
                                                   'density', 'pH', 'sulphates',
                                                   'alcohol'])])),
         

In [25]:
from sklearn.metrics import r2_score
predictions = rf_model.predict(X_test)
print (r2_score(y_test, predictions))

0.9891938250428817
