In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
%matplotlib inline

In [None]:
# Load the red-wine quality data; the file uses ';' as the field separator.
df_red = pd.read_csv('winequality-red.csv', sep=';')

In [None]:
# Load the white-wine quality data; the file uses ';' as the field separator.
df_white = pd.read_csv('winequality-white.csv', sep=';')

In [None]:
# Show the red-wine DataFrame as this cell's output.
df_red

In [None]:
# Show the white-wine DataFrame as this cell's output.
df_white

In [None]:
# Build the profiling report for the red-wine frame; the html option asks for
# the full-width style.
profile_red = ProfileReport(
    df_red,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)

In [None]:
# Build the profiling report for the white-wine frame; the html option asks for
# the full-width style.
profile_white = ProfileReport(
    df_white,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)

In [None]:
# Show the red-wine profiling report as this cell's output.
profile_red

In [None]:
# Write the red-wine profiling report to "your_report.html".
profile_red.to_file(output_file="your_report.html")

In [None]:
# Show the white-wine profiling report as this cell's output.
profile_white

In [None]:
# Write the white-wine profiling report to "your_report_white.html".
profile_white.to_file(output_file="your_report_white.html")

In [None]:
# Remove fully duplicated rows from the red-wine table, keeping the first
# occurrence of each; the frame is modified in place.
df_red.drop_duplicates(keep='first', inplace=True)

In [None]:
# (rows, columns) of the red-wine table after removing duplicates.
df_red.shape

In [None]:
# Remove fully duplicated rows from the white-wine table, keeping the first
# occurrence of each; the frame is modified in place.
df_white.drop_duplicates(keep='first', inplace=True)

In [None]:
# (rows, columns) of the white-wine table after removing duplicates.
df_white.shape

In [None]:
# Print the median of the quality column, red table first, then white.
for wine_table in (df_red, df_white):
    print(wine_table['quality'].median())

In [None]:
# Replace the quality score with a binary label in both tables:
# quality below the threshold -> 0, everything else -> 1.
quality_threshold = 6
for wine_table in (df_red, df_white):
    wine_table['target'] = np.where(wine_table['quality'].lt(quality_threshold), 0, 1)

In [None]:
# Preview the first rows of the red-wine table, now including the target column.
df_red.head()

In [None]:
# The binary target replaces the raw score, so remove the quality column
# from the red-wine table (in place).
df_red.drop(columns='quality', inplace=True)

In [None]:
# Preview the first rows of the white-wine table, now including the target column.
df_white.head()

In [None]:
# The binary target replaces the raw score, so remove the quality column
# from the white-wine table (in place).
df_white.drop(columns='quality', inplace=True)

In [None]:
# Class balance check for red wine: fraction of rows carrying each target label.
df_red['target'].value_counts().div(len(df_red))

In [None]:
# Class balance check for white wine: fraction of rows carrying each target label.
df_white['target'].value_counts().div(len(df_white))

In [None]:
# Oversample the white-wine table by appending one extra copy of every
# target == 0 row (presumably the under-represented class; see the balance
# checks around this cell).
#
# The appended copies keep the index labels of their source rows. Starting from
# the rows whose index label is seen for the first time makes this cell
# idempotent: re-running it yields the same frame instead of stacking another
# batch of copies on top of the previous ones.
white_without_copies = df_white[~df_white.index.duplicated(keep='first')]
df_white = pd.concat(
    [white_without_copies, white_without_copies[white_without_copies['target'] == 0]],
    axis=0,
)

In [None]:
# Re-check the white-wine class balance: fraction of rows carrying each target label.
df_white['target'].value_counts().div(len(df_white))

In [None]:
# Split each table into features (every column except the last) and the
# target (the last column).

# red
X_red, y_red = df_red.iloc[:, :-1], df_red.iloc[:, -1]

# white
X_white, y_white = df_white.iloc[:, :-1], df_white.iloc[:, -1]

In [None]:
# Sanity check: shapes of the white-wine feature matrix and target vector.
print(X_white.shape,y_white.shape)

In [None]:
# Absolute number of white-wine rows per target label.
df_white['target'].value_counts()

In [None]:
# (rows, columns) of the white-wine table at this point in the notebook.
df_white.shape

In [None]:
# Calculate the variance inflation factor (VIF) of each feature for feature selection

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# For each red-wine feature, calculate its VIF and collect the results in a DataFrame.

# red
# variance_inflation_factor regresses one column of the matrix it is given on
# all the other columns and does not add an intercept on its own. Without a
# constant column the regressions are forced through the origin, which gives
# uncentered VIFs. Prepend a constant first and skip it when reporting.
X_red_with_const = sm.add_constant(X_red, has_constant='add')
vif_red = pd.DataFrame()
vif_red["VIF Factor"] = [
    variance_inflation_factor(X_red_with_const.values, col_idx)
    for col_idx in range(1, X_red_with_const.shape[1])  # column 0 is the constant
]
vif_red["features"] = X_red.columns

In [None]:
# white
# For each white-wine feature, calculate its VIF and collect the results in a DataFrame.
# variance_inflation_factor regresses one column of the matrix it is given on
# all the other columns and does not add an intercept on its own. Without a
# constant column the regressions are forced through the origin, which gives
# uncentered VIFs. Prepend a constant first and skip it when reporting.
X_white_with_const = sm.add_constant(X_white, has_constant='add')
vif_white = pd.DataFrame()
vif_white["VIF Factor"] = [
    variance_inflation_factor(X_white_with_const.values, col_idx)
    for col_idx in range(1, X_white_with_const.shape[1])  # column 0 is the constant
]
vif_white["features"] = X_white.columns

In [None]:
# Persist the red-wine VIF table to 'vif_red.csv'.
vif_red.to_csv('vif_red.csv')

In [None]:
# Persist the white-wine VIF table to 'vif_white.csv'.
vif_white.to_csv('vif_white.csv')

In [None]:
# Inspect the red-wine VIF table to choose which features to keep.
vif_red

In [None]:
# Inspect the white-wine VIF table.
vif_white

In [None]:
# Drop the features that are not used for modelling (presumably chosen from the
# VIF tables shown earlier in the notebook).
#
# X_red / X_white are column slices of df_red / df_white, so an in-place drop
# on them can emit SettingWithCopyWarning on pandas versions without
# copy-on-write. Rebinding to the result of a non-inplace drop avoids that.
# errors='ignore' makes re-running this cell a no-op instead of a KeyError
# once the columns are already gone.
features_to_drop = ['density', 'pH', 'alcohol', 'fixed acidity']

X_red = X_red.drop(columns=features_to_drop, errors='ignore')

X_white = X_white.drop(columns=features_to_drop, errors='ignore')

In [None]:
# (rows, columns) of the red-wine feature matrix after dropping features.
X_red.shape

In [None]:
# Length of the red-wine target vector; its row count should match X_red.
y_red.shape

In [None]:
# split the data into train and test
from sklearn.model_selection import train_test_split

# Hold out 20% of the red-wine rows for testing, stratified on the target,
# with a fixed random_state so the split is reproducible.
X_red_train, X_red_test, y_red_train, y_red_test = train_test_split(
    X_red,
    y_red,
    test_size=0.20,
    random_state=42,
    stratify=y_red,
)

In [None]:
# Train/test split for white wine without leaking oversampled rows.
#
# The white-wine table was oversampled earlier by appending copies of its
# target == 0 rows, and those copies share index labels with their source rows.
# Splitting that frame directly can put one copy of a row in the training set
# and another in the test set, which inflates the test score. Instead:
#   1. keep only the first occurrence of every index label (the original rows),
#   2. split those rows 80/20, stratified on the target,
#   3. re-apply the same oversampling to the training part only.
# NOTE(review): cross-validation folds drawn from the oversampled training set
# can still share copied rows; confirm whether that matters for model selection.
is_oversampled_copy = X_white.index.duplicated(keep='first')
X_white_unique = X_white[~is_oversampled_copy]
y_white_unique = y_white[~is_oversampled_copy]

X_white_train, X_white_test, y_white_train, y_white_test = train_test_split(
    X_white_unique,
    y_white_unique,
    test_size=0.20,
    random_state=42,
    stratify=y_white_unique,
)

# Append one extra copy of every target == 0 training row, as was done on the full table.
train_is_class0 = y_white_train == 0
X_white_train = pd.concat([X_white_train, X_white_train[train_is_class0]], axis=0)
y_white_train = pd.concat([y_white_train, y_white_train[train_is_class0]], axis=0)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Two-step pipeline (scaler -> classifier). Both steps are placeholders:
# every grid below overrides 'scaling' and 'model'.
estimators = [
    ('scaling', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipe = Pipeline(estimators)

# Regularisation strengths shared by both model families: 10**-5 ... 10**4.
c_values = np.power(10.0, np.arange(-5, 5))

# Sub-grid 1: logistic regression with each scaling option (None = no scaling step).
logistic_grid = {
    'scaling': [StandardScaler(), MinMaxScaler(), None],
    'model': [LogisticRegression()],
    'model__C': c_values,
}

# Sub-grid 2: support vector classifier over kernels and gamma values.
# NOTE(review): gamma is documented as a coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels, so the 'linear' candidates look redundant across gamma;
# confirm whether that is intended.
svc_grid = {
    'scaling': [StandardScaler(), MinMaxScaler(), None],
    'model': [SVC()],
    'model__C': c_values,
    'model__kernel': ['rbf', 'linear', 'sigmoid'],
    'model__gamma': [1, 10, 100],
}

param_grid = [logistic_grid, svc_grid]

# 5-fold cross-validated search on the red-wine training split, using all cores.
grid_red = GridSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1)
fittedgrid_red = grid_red.fit(X_red_train, y_red_train)

In [None]:
# Two-step pipeline (scaler -> classifier). Both steps are placeholders:
# every grid below overrides 'scaling' and 'model'.
estimators = [
    ('scaling', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipe = Pipeline(estimators)

# Regularisation strengths shared by both model families: 10**-5 ... 10**4.
c_values = np.power(10.0, np.arange(-5, 5))

# Sub-grid 1: logistic regression with each scaling option (None = no scaling step).
logistic_grid = {
    'scaling': [StandardScaler(), MinMaxScaler(), None],
    'model': [LogisticRegression()],
    'model__C': c_values,
}

# Sub-grid 2: support vector classifier, RBF kernel only for the white-wine data.
svc_grid = {
    'scaling': [StandardScaler(), MinMaxScaler(), None],
    'model': [SVC()],
    'model__C': c_values,
    'model__kernel': ['rbf'],
}

param_grid = [logistic_grid, svc_grid]

# 5-fold cross-validated search on the white-wine training split, using all cores.
grid_white = GridSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1)
fittedgrid_white = grid_white.fit(X_white_train, y_white_train)

In [None]:
# Pipeline selected by the red-wine grid search.
fittedgrid_red.best_estimator_

In [None]:
# Score of the selected red-wine pipeline on the held-out test split.
fittedgrid_red.score(X_red_test, y_red_test)

In [None]:
# Pipeline selected by the white-wine grid search.
fittedgrid_white.best_estimator_

In [None]:
# Score of the selected white-wine pipeline on the held-out test split.
fittedgrid_white.score(X_white_test, y_white_test)