In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')

In [3]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
data.isnull().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [11]:
n_wines = data.shape[0]

# Number of wines with quality rating above 6
quality_above_6 = data.loc[(data['quality'] > 6)]
n_above_6 = quality_above_6.shape[0]

# Number of wines with quality rating below 5
quality_below_5 = data.loc[(data['quality'] < 5)]
n_below_5 = quality_below_5.shape[0]

# Number of wines with quality rating between 5 to 6
quality_between_5 = data.loc[(data['quality'] >= 5) & (data['quality'] <= 6)]
n_between_5 = quality_between_5.shape[0]

# Percentage of wines with quality rating above 6
greater_percent = n_above_6*100/n_wines

# Print the results
print("Total number of wine data: {}".format(n_wines))
print("Wines with rating 7 and above: {}".format(n_above_6))
print("Wines with rating less than 5: {}".format(n_below_5))
print("Wines with rating 5 and 6: {}".format(n_between_5))
print("Percentage of wines with quality 7 and above: {:.2f}%".format(greater_percent))

# Some more additional data analysis
display(np.round(data.describe()))

Total number of wine data: 1599
Wines with rating 7 and above: 217
Wines with rating less than 5: 63
Wines with rating 5 and 6: 1319
Percentage of wines with quality 7 and above: 13.57%


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.0,1.0,0.0,3.0,0.0,16.0,46.0,1.0,3.0,1.0,10.0,6.0
std,2.0,0.0,0.0,1.0,0.0,10.0,33.0,0.0,0.0,0.0,1.0,1.0
min,5.0,0.0,0.0,1.0,0.0,1.0,6.0,1.0,3.0,0.0,8.0,3.0
25%,7.0,0.0,0.0,2.0,0.0,7.0,22.0,1.0,3.0,1.0,10.0,5.0
50%,8.0,1.0,0.0,2.0,0.0,14.0,38.0,1.0,3.0,1.0,10.0,6.0
75%,9.0,1.0,0.0,3.0,0.0,21.0,62.0,1.0,3.0,1.0,11.0,6.0
max,16.0,2.0,1.0,16.0,1.0,72.0,289.0,1.0,4.0,2.0,15.0,8.0


In [4]:
y = data.quality
X = data.drop('quality', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [6]:
print(X_train.head())

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
1285           11.3              0.37         0.50             1.8      0.090   
796             8.7              0.46         0.31             2.5      0.126   
744            11.1              0.39         0.54             2.7      0.095   
1113            8.9              0.24         0.39             1.6      0.074   
798             9.4              0.50         0.34             3.6      0.082   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
1285                 20.0                  47.0  0.99734  3.15       0.57   
796                  24.0                  64.0  0.99746  3.10       0.74   
744                  21.0                 101.0  1.00010  3.13       0.51   
1113                  3.0                  10.0  0.99698  3.12       0.59   
798                   5.0                  14.0  0.99870  3.29       0.52   

      alcohol  
1285     10.5  
796       9.6  
74

In [8]:
X_train_scaled = preprocessing.scale(X_train)
print(X_train_scaled)

[[ 1.73679445 -0.89313001  1.23292561 ... -1.06272331 -0.52313091
   0.07906346]
 [ 0.23153336 -0.38593615  0.23541178 ... -1.38582475  0.52966762
  -0.76349127]
 [ 1.62100514 -0.78042026  1.44292852 ... -1.19196389 -0.89470686
  -0.85710846]
 ...
 [ 1.79468911  0.40303208  1.18042488 ... -1.06272331 -0.15155495
   0.07906346]
 [ 0.57890131 -1.45667874  0.76041906 ... -0.22265958  0.28195032
  -0.66987408]
 [-1.8526743   0.31849977 -1.39211077 ...  1.58670845  1.76825412
   2.32587609]]


In [12]:
from sklearn.linear_model import LinearRegression 
from sklearn import metrics 

# fitting linear regression to training data
regressor = LinearRegression()
regressor.fit(X_train_scaled,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
# this gives the coefficients of the 10 features selected above.  
print(regressor.coef_)

[ 0.05046056 -0.20854288 -0.05284679  0.03439729 -0.07374496  0.03485085
 -0.09220111 -0.03632731 -0.05771505  0.15263609  0.3153355 ]


In [34]:
print(regressor.intercept_)

5.6497263487099305


In [16]:
X_test_scaled=preprocessing.scale(X_test)
test_pred = regressor.predict(X_test_scaled) 
print(test_pred)

[6.52075938 6.34157545 5.3068193  5.14638284 5.62565839 5.01146983
 4.79450366 6.06784485 5.17574398 6.69577719 5.53066532 5.53619384
 5.78670767 6.46324611 6.22664839 5.80835394 5.6715315  5.60823377
 5.68285973 5.31526969 5.09173697 4.98776552 5.1968576  5.2151633
 5.32589739 5.68463919 5.2335759  5.09106412 5.43213947 5.58678374
 6.15591477 5.7933292  6.32005535 5.13185824 5.37463353 5.07391699
 5.51338337 5.24524229 6.27847921 5.84619797 5.23443658 6.02123459
 5.08679428 5.92359761 5.67849319 5.17985853 5.07597838 6.19996671
 5.40417857 6.30707873 5.30336896 5.10000212 6.53539227 5.49118803
 5.2526696  5.36738382 6.17376506 5.07111611 5.25083874 5.5119217
 5.6740522  6.40840386 6.50655691 5.91405689 6.20748351 6.76974767
 5.6481569  5.72585669 5.6600919  5.89426104 6.82594581 5.62007525
 5.87077904 5.20731862 5.47115262 5.83965486 5.96251573 5.6715315
 5.64577232 6.26324844 5.55271201 5.27591161 5.21665428 4.9284574
 5.23983375 6.06490754 6.6752409  5.1941941  6.59437074 6.0014763


In [18]:
test_rmse = metrics.mean_squared_error(test_pred, y_test) ** 0.5
print(test_rmse)

0.6641228143765028


In [21]:
# rounding off the predicted values for test set
predicted_data = np.round_(test_pred)
print(predicted_data)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, test_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, test_pred))
print('Root Mean Squared Error:',np.sqrt(metrics.mean_squared_error(y_test, test_pred)))

[7. 6. 5. 5. 6. 5. 5. 6. 5. 7. 6. 6. 6. 6. 6. 6. 6. 6. 6. 5. 5. 5. 5. 5.
 5. 6. 5. 5. 5. 6. 6. 6. 6. 5. 5. 5. 6. 5. 6. 6. 5. 6. 5. 6. 6. 5. 5. 6.
 5. 6. 5. 5. 7. 5. 5. 5. 6. 5. 5. 6. 6. 6. 7. 6. 6. 7. 6. 6. 6. 6. 7. 6.
 6. 5. 5. 6. 6. 6. 6. 6. 6. 5. 5. 5. 5. 6. 7. 5. 7. 6. 6. 6. 6. 6. 6. 5.
 6. 5. 5. 5. 5. 6. 6. 6. 5. 6. 6. 6. 5. 5. 6. 6. 6. 5. 5. 5. 6. 6. 6. 6.
 5. 6. 5. 5. 6. 6. 5. 6. 6. 5. 5. 5. 6. 6. 5. 5. 6. 5. 5. 5. 6. 6. 6. 6.
 5. 6. 5. 6. 5. 6. 6. 7. 6. 6. 6. 6. 5. 5. 6. 6. 5. 6. 6. 6. 6. 5. 5. 6.
 6. 5. 6. 5. 6. 6. 5. 5. 6. 5. 5. 6. 5. 5. 5. 6. 6. 6. 6. 5. 5. 5. 5. 7.
 6. 5. 8. 5. 7. 6. 5. 7. 5. 5. 5. 6. 6. 5. 6. 5. 5. 5. 6. 5. 5. 6. 6. 7.
 5. 5. 6. 5. 5. 5. 6. 5. 5. 5. 6. 6. 5. 6. 6. 5. 6. 5. 6. 6. 5. 6. 5. 6.
 6. 5. 5. 5. 5. 6. 5. 5. 5. 6. 6. 6. 6. 6. 6. 6. 6. 6. 5. 6. 5. 5. 5. 6.
 6. 5. 6. 5. 5. 6. 6. 7. 6. 6. 6. 6. 6. 6. 6. 6. 6. 5. 5. 6. 5. 5. 5. 6.
 6. 5. 6. 6. 5. 6. 6. 5. 6. 6. 6. 6. 6. 6. 5. 6. 5. 6. 5. 6. 6. 6. 5. 6.
 5. 6. 6. 5. 6. 6. 6. 5.]
Mean Absolute Error: 0.51

In [25]:
features = list(data.columns.values.tolist()) 

In [26]:
features

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [29]:
del features[-1]

In [30]:
features

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [31]:
# displaying coefficients of each feature
coeffecients = pd.DataFrame(regressor.coef_,features) 
coeffecients.columns = ['Coeffecient'] 
print(coeffecients)

                      Coeffecient
fixed acidity            0.050461
volatile acidity        -0.208543
citric acid             -0.052847
residual sugar           0.034397
chlorides               -0.073745
free sulfur dioxide      0.034851
total sulfur dioxide    -0.092201
density                 -0.036327
pH                      -0.057715
sulphates                0.152636
alcohol                  0.315335


In [33]:
df_test = pd.DataFrame({'Actual': y_test, 'Predicted': test_pred})
df_test1 = df_test.head(25)
df_test1

Unnamed: 0,Actual,Predicted
1003,7,6.520759
1125,7,6.341575
1226,5,5.306819
688,5,5.146383
1027,5,5.625658
1589,5,5.01147
1385,5,4.794504
1237,6,6.067845
753,5,5.175744
1114,6,6.695777


In [44]:
# calculate r2 score
from sklearn.metrics import r2_score
r2 = regressor.score(X_train_scaled, y_train)

# adjusted r2 using formula adj_r2 = 1 - (1- r2) * (n-1) / (n - k - 1)
# k = number of predictors = X_train_scaled.shape[1] - 1
adj_r2 = 1 - (1-r2)*(len(X_train_scaled) - 1) / (len(X_train_scaled) - (X_train_scaled.shape[1] - 1) - 1)
print(r2, adj_r2)

0.38162887513784427 0.3767521312509188
