# Red Wine quality prediction

In [None]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the red-wine quality dataset from its public GitHub location.
DATA_URL = 'https://raw.githubusercontent.com/dsrscientist/DSData/master/winequality-red.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# Labels of every column in the frame (the predictors plus the `quality` target).
feature_val = df.columns

In [None]:
df.head(5)  # preview the first five rows

In [None]:
# (number of rows, number of column labels)
df.shape[0], feature_val.size

The data has 1599 rows and 12 columns:
11 feature columns plus the target column (`quality`).

# Data Exploration Analysis

first checking null values in the dataset

In [None]:
# Count missing entries per column; all zeros means nothing needs imputing.
df.isna().sum()

Doing a further check on the data types for each of the columns

In [None]:
df.dtypes  # storage type of every column

# Scatter Plots

In [None]:
# Scatter plot of every predictor against the target (`quality`).
# One loop over the feature columns replaces a separate hand-written cell per
# feature; each feature gets its own panel so the plots do not overlap.
target_col = 'quality'
feature_cols = [col for col in df.columns if col != target_col]

n_cols = 4
n_rows = -(-len(feature_cols) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.ravel()
for ax, col in zip(axes, feature_cols):
    sns.scatterplot(x=col, y=target_col, data=df, ax=ax)
    ax.set_title(col)
# Hide panels left over when the grid has more slots than features.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
# Correlation of each column with the target, weakest/most negative first.
corr_with_quality = df.corr()['quality']
corr_with_quality.sort_values()

In [None]:
# Full pairwise (Pearson) correlation matrix of all columns.
df.corr(method='pearson')

In [None]:
# Annotated heatmap of the pairwise correlation matrix.
corr_matrix = df.corr()
plt.figure(figsize=(15, 7))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidths=0.5, linecolor='black')

# observations
 
There are some negative correlations in the columns of the dataset with the target column

fixed acidity has a 12 percent correlation with the target variable, which is a very weak relationship; volatile acidity has a 39 percent negative correlation with the target column, which is a comparatively strong relationship; citric acid also has a weak relationship with the target column, with a 23 percent correlation; residual sugar has a 1 percent correlation with the target column, which is very weak; chlorides 13 percent, free sulfur dioxide 5 percent, total sulfur dioxide 19 percent, density 17 percent, pH 6 percent, sulphates 25 percent, alcohol 48 percent.

alcohol has the highest correlation with the target column
residual sugar has the lowest correlation with the target column

Further feature selection will be done to ascertain the best features.

In [None]:
df.describe()  # statistical summary (count, mean, std, min, quartiles, max) of each column

In [None]:
# Heatmap of the statistical summary: every describe() row except the first
# (`count`), transposed so each variable is a row, rounded to two decimals.
stats_table = df.describe().iloc[1:].T.round(2)
plt.figure(figsize=(15, 12))
sns.heatmap(stats_table, annot=True, fmt="f", linewidth=2)
plt.title('Variables summary')
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
plt.show()

# Observation of description of datasets

Total number of rows and columns in the dataset
1. 1599 rows
2. 12 columns

fixed acidity:

 1. mean=8.32
 2. std=1.74
 3. max_value =15.9
 4. min_value= 4.6

volatile acidity:

 1. mean= 0.53
 2. std= 0.18
 3. max_value = 1.58
 4. min_value = 0.12

citric acid:
 1. mean= 0.27
 2. std=  0.19
 3. max_value=1.0
 4. min_value = 0.0

residual sugar:
1. mean=2.54
2. std=1.41
3. max_value=15.50
4. min_value=0.90

chlorides:
   1. mean=0.09
   2. std=0.05
   3. max_value=0.61
   4. min_value=0.01

free sulfur dioxide:
   1. mean=15.87
   2. std=10.46
   3. max_value=72.0
   4. min_value=1.0

total sulfur dioxide:
   1. mean=46.47
   2. std=32.90
   3. max_value=289.0
   4. min_value=6.0

density:
   1. mean=1.0
   2. std=0.0
   3. max_value=1.0
   4. min_value=0.99

pH:
   1. mean=3.31
   2. std=0.15
   3. max_value=4.01
   4. min_value=2.74

sulphate:
   1. mean=0.66
   2. std=0.17
   3. max_value=2.0
   4. min_value=0.33

alcohol:
   1. mean=10.42
   2. std=1.07
   3. max_value= 14.90
   4. min_value= 8.40

quality:
   1. mean=5.64
   2. std=0.81
   3. max_value= 8.0
   4. min_value=3.0

In [None]:
df.info()  # column dtypes and non-null counts for the dataframe

# checking Outliers

In [None]:
# Box plot of every column on one log-scaled y axis to eyeball outliers.
fig, ax = plt.subplots(figsize=(10, 7))
g = sns.boxplot(data=df, ax=ax, linewidth=2.5)
g.set_yscale("log")

# observations
From the above, it can be observed that only alcohol and quality (the target) have few outliers; the rest have more. The data is also narrowly spread in residual sugar and chlorides, with many outliers. Further skewness checks will be done on them.

# Checking Skewness

In [None]:
df.skew(axis=0)  # skewness of each column

A skewness threshold of +/-0.5 is used. Columns whose skewness exceeds it:
  1. fixed acidity
  2. volatile acidity
  3. residual sugar
  4. chlorides
  5. free sulfur dioxide
  6. total sulfur dioxide
  7. sulphates
 

# Normal distribution curves


In [None]:
# Distribution (density histogram + KDE curve) of every feature column.
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True,
# stat="density")` is its replacement. Iterating over the columns plots all
# features (including `citric acid`) without one hand-written cell per column.
feature_cols = [col for col in df.columns if col != 'quality']

n_cols = 4
n_rows = -(-len(feature_cols) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.ravel()
for ax, col in zip(axes, feature_cols):
    sns.histplot(df[col], kde=True, stat="density", ax=ax)
    ax.set_title(col)
# Hide panels left over when the grid has more slots than features.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

From the distribution plots of each feature above, we can conclude that the data is skewed.

# Data cleaning

In [None]:
# Table of the features earmarked for removal: low correlation with the target
# and too many outliers. The correlation column is read from the data instead
# of being typed in by hand as strings, so its values and signs always match
# what `df.corr()` reports.
columns_to_drop = ["residual sugar", "free sulfur dioxide", "pH", "chlorides", "total sulfur dioxide"]
remove = pd.DataFrame({
    "Correlation with targer": df.corr()["quality"].loc[columns_to_drop].values,
    "column name": columns_to_drop,
    "Normalised": "No",   # same flag for every listed feature
    "outliers": "many",   # same flag for every listed feature
})
remove

setting threshold of correlation as +/-0.2

The above columns will be dropped because they are essentially uncorrelated with the target variable and also have too many outliers; a further check will be done to ascertain this deletion.

In [None]:
remove  # re-display the table of features earmarked for removal

In [None]:
# Drop the five weakly correlated, outlier-heavy features.
weak_features = ["residual sugar", "free sulfur dioxide", "pH", "chlorides", "total sulfur dioxide"]
df = df.drop(columns=weak_features)

In [None]:
# Binarise the target with an arbitrary cutoff: a wine quality of 7 or higher
# becomes 1 (good), anything lower becomes 0 (not good).
rating = [1 if score >= 7 else 0 for score in df['quality']]
df['rating'] = rating  # new column; this is the target variable used for modelling
df

In [None]:
df.shape  # (rows, columns) of the current dataframe

# Removing outliers

In [None]:
# Separate predictors from the target. `quality` is excluded from the
# predictors because `rating` is derived directly from it.
non_feature_cols = ['quality', 'rating']
x = df.drop(columns=non_feature_cols)  # independent variables
y = df['rating']                       # dependent variable

In [None]:
y  # binary target: 1 when quality >= 7, otherwise 0

In [None]:
from scipy.stats import zscore

# Absolute z-score of every feature value, used below to flag outliers.
z = np.absolute(zscore(x))
z.shape

In [None]:
# Positions (row indices, column indices) of values more than `threshold`
# standard deviations away from their column mean.
threshold = 3
outlier_positions = np.where(z > threshold)
print(outlier_positions)

In [None]:
# Keep only the rows in which every feature has an absolute z-score below 3.
inlier_rows = (z < 3).all(axis=1)
x = x[inlier_rows]
x.shape

In [None]:
# Apply the same inlier mask to the target so it stays aligned with the features.
inlier_rows = (z < 3).all(axis=1)
y = y[inlier_rows]
y.shape

# percentage data loss

In [None]:
# Percentage of rows discarded by the outlier filter. The counts are taken
# from the data (`df` still holds every original row, `x` only the rows that
# survived the filter) rather than hand-typed, so the figure stays correct if
# the threshold or the feature set changes.
rows_before = len(df)
rows_after = len(x)
loss_percent = (rows_before - rows_after) / rows_before * 100
print(loss_percent)

# Transforming data to remove skewness

In [None]:
from sklearn.preprocessing import power_transform

# Reduce skewness of the features with a Yeo-Johnson power transform.
transform_method = 'yeo-johnson'
x = power_transform(x, method=transform_method)

In [None]:
x  # feature values after the power transform

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature to zero mean and unit variance.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)
x

In [None]:
# Oversample the data with SMOTE to balance the target classes.
# A fixed seed makes the synthetic samples, and every score computed from
# them, reproducible across runs.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points derived from training rows can end up in the test set — consider
# applying SMOTE to the training split only.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
x,y = sm.fit_resample(x,y)

In [None]:
y.value_counts()  # number of samples per target class after oversampling

In [None]:
y.shape[0]  # total number of samples after oversampling

In [None]:
# Modelling tools: data splitter, baseline classifier and evaluation metrics.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split

lr = LogisticRegression()  # baseline model with default settings

# spliting the dataset

In [None]:
# Try 1000 different train/test splits and report every seed for which the
# training and test accuracy (as percentages, rounded to one decimal) coincide.
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)
    lr.fit(x_train, y_train)
    pred_train = lr.predict(x_train)
    pred_test = lr.predict(x_test)
    train_acc = accuracy_score(y_train, pred_train) * 100
    test_acc = accuracy_score(y_test, pred_test) * 100
    if round(train_acc, 1) != round(test_acc, 1):
        continue
    print('At random state',i, 'the model perfoms very well')
    print('At random state:-', i)
    print('Training accuracy is:', train_acc)
    print('Testing accuracy is:', test_acc)
    print('\n')

In [None]:
# Hold-out split used for all further modelling: 80 % train / 20 % test,
# with the random state chosen from the search above.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=203, test_size=0.20)

In [None]:
from sklearn.metrics import classification_report

# Refit and predict on the split created with random_state=203. The
# `pred_test` left behind by the random-state search loop belongs to that
# loop's last split, so scoring it against this `y_test` would compare
# predictions and labels of different rows.
lr.fit(x_train, y_train)
pred_test = lr.predict(x_test)
print(classification_report(y_test, pred_test))  # per-class precision, recall and F1

# Cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# Make sure `lr` is fitted on the current training split before scoring the
# hold-out set: the random-state search loop leaves it fitted on its own last
# split, whose training rows overlap this `x_test`.
lr.fit(x_train, y_train)
pred_lr = lr.predict(x_test)
lss = accuracy_score(y_test,pred_lr)

# Compare the hold-out accuracy with the mean k-fold accuracy for k = 2..9.
for i in range(2,10):
    lsscore = cross_val_score(lr,x,y, cv =i)
    lsc = lsscore.mean()
    print('At cv:',i)
    print('cross validation score is:-', lsc*100)
    print('accuracy_score is:-',lss*100)

    print('\n')

The scores are almost the same in the above cross-validation we will take the one with least difference between the accuracy on test data and cross validation

In [None]:
# Mean accuracy of 8-fold cross-validation next to the hold-out accuracy.
cv8_scores = cross_val_score(lr, x, y, cv=8)
lsscore_selected = cv8_scores.mean()
print('The cv score is:',lsscore_selected,'\nThe accuracy score is:', lss)

Now we check with different models

In [None]:
# Imported here because RandomForestClassifier is not imported anywhere above;
# without it this cell raises NameError on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with default settings, evaluated on the hold-out split.
RFC = RandomForestClassifier()
RFC.fit(x_train,y_train)
predRFC = RFC.predict(x_test)
print(accuracy_score(y_test,predRFC))
print(confusion_matrix(y_test,predRFC))
print(classification_report(y_test,predRFC))

In [None]:
# Cross-validation scores for the random forest (default cv of 5) and their
# gap to the hold-out accuracy.
score = cross_val_score(RFC, x, y)
print(score)
rfc_cv_mean = score.mean()
print(rfc_cv_mean)
rfc_test_acc = accuracy_score(y_test, predRFC)
print('Difference between accuracy and cross validation score is', rfc_test_acc - rfc_cv_mean)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, evaluated on the hold-out split.
svc = SVC()
svc.fit(x_train, y_train)
predsvc = svc.predict(x_test)
# Print accuracy, confusion matrix and classification report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, predsvc))

In [None]:
# Cross-validation scores for the support vector classifier and their gap to
# the hold-out accuracy.
score = cross_val_score(svc, x, y)
print(score)
svc_cv_mean = score.mean()
print(svc_cv_mean)
svc_test_acc = accuracy_score(y_test, predsvc)
print('Difference between accuracy and cross validation score is', svc_test_acc - svc_cv_mean)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier with default settings, evaluated on the hold-out split.
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
predtc = dtc.predict(x_test)
# Print accuracy, confusion matrix and classification report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, predtc))

In [None]:
# Cross-validation scores for the decision tree and their gap to the hold-out
# accuracy.
score = cross_val_score(dtc, x, y)
print(score)
dtc_cv_mean = score.mean()
print(dtc_cv_mean)
dtc_test_acc = accuracy_score(y_test, predtc)
print('Difference between accuracy and cross validation score is', dtc_test_acc - dtc_cv_mean)

# Hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid for the decision tree.
# - `min_weight_fraction_leaf` must lie in [0.0, 0.5]; larger values are
#   rejected by scikit-learn, so they are not searched. 0.0 (the estimator's
#   default, i.e. no weight constraint) is included so the unconstrained tree
#   is also a candidate.
# - "auto" is not listed for `max_features`: for a decision tree classifier it
#   meant the same as "sqrt" and newer scikit-learn releases no longer accept it.
parameters = {
    "splitter": ["best", "random"],
    "max_depth": [1, 3, 5, 7, 9, 11, 12],
    "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "max_features": ["log2", "sqrt", None],
    "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90],
}

In [None]:
# Exhaustive grid search over `parameters` for the decision tree, 3-fold CV.
# Scored with accuracy, a classification metric. For 0/1 labels the negative
# mean squared error equals accuracy - 1, so the ranking of candidates is the
# same, but the reported scores are now directly interpretable.
# verbose=1 prints only a short summary rather than one line per fit.
tuning_model = GridSearchCV(dtc, param_grid=parameters, scoring='accuracy', cv=3, verbose=1)

In [None]:
tuning_model.fit(x_train, y_train)  # run the grid search on the training split

In [None]:
tuning_model.best_params_  # parameter combination selected by the grid search

In [None]:
# Final decision tree built with a fixed set of tuned hyper-parameters,
# then scored on the hold-out split (accuracy printed as a percentage).
tuned_tree_params = dict(
    max_depth=9,
    max_features='sqrt',
    max_leaf_nodes=80,
    min_samples_leaf=4,
    min_weight_fraction_leaf=0.1,
    splitter='best',
)
final_model = DecisionTreeClassifier(**tuned_tree_params)
final_model.fit(x_train, y_train)
pred = final_model.predict(x_test)
acc = accuracy_score(y_test, pred)
print(acc*100)

# AUC ROC Curve

In [None]:
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import RocCurveDisplay

# Overlay the ROC curves of the fitted classifiers on one set of axes.
# `RocCurveDisplay.from_estimator` replaces `plot_roc_curve`, which has been
# removed from scikit-learn. The `GB` model is not plotted because no variable
# of that name is defined in the visible notebook (it raised NameError).
fig, ax = plt.subplots()
for fitted_model in (dtc, RFC, svc):
    RocCurveDisplay.from_estimator(fitted_model, x_test, y_test, ax=ax)

plt.legend(prop={'size':11},loc = 'lower right')
plt.show()

# saving the model

In [None]:
import pickle

# Persist the tuned decision tree to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
filename = 'wine_dataset.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)

# conclusion

In [None]:
# Side-by-side table of the true test labels and the logistic-regression
# predictions, indexed 0..n-1.
b = np.array(y_test)
predicted = np.array(lr.predict(x_test))
actual_vs_predicted = {'original': b, 'predicted': predicted}
df_com = pd.DataFrame(actual_vs_predicted, index=range(len(b)))
df_com

# Recomendation

1. The Random Forest classification model has the highest accuracy.
2. Support vector classification has the smallest difference between the cross-validation score and the accuracy score.
3. Hyperparameter tuning should also be done for the support vector model to see whether it achieves a higher accuracy than the Decision Tree.