In [1]:
import pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
df = pandas.read_csv('../Datasets/winequality-red.csv',sep=";")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
X = df.values[:,:-1]
Y = df.values[:,-1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.30, random_state=42)
print('X',X.shape)
print('Xtrain',Xtrain.shape)
print('Xtest',Xtest.shape)

X (1599, 11)
Xtrain (1119, 11)
Xtest (480, 11)


In [4]:
mcorr = np.corrcoef(df.values.transpose())
mcorr = np.abs(np.round(mcorr,3))
dfmcorr = pandas.DataFrame(mcorr,index=df.columns,columns=df.columns)
dfmcorr

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,0.256,0.672,0.115,0.094,0.154,0.113,0.668,0.683,0.183,0.062,0.124
volatile acidity,0.256,1.0,0.552,0.002,0.061,0.011,0.076,0.022,0.235,0.261,0.202,0.391
citric acid,0.672,0.552,1.0,0.144,0.204,0.061,0.036,0.365,0.542,0.313,0.11,0.226
residual sugar,0.115,0.002,0.144,1.0,0.056,0.187,0.203,0.355,0.086,0.006,0.042,0.014
chlorides,0.094,0.061,0.204,0.056,1.0,0.006,0.047,0.201,0.265,0.371,0.221,0.129
free sulfur dioxide,0.154,0.011,0.061,0.187,0.006,1.0,0.668,0.022,0.07,0.052,0.069,0.051
total sulfur dioxide,0.113,0.076,0.036,0.203,0.047,0.668,1.0,0.071,0.066,0.043,0.206,0.185
density,0.668,0.022,0.365,0.355,0.201,0.022,0.071,1.0,0.342,0.149,0.496,0.175
pH,0.683,0.235,0.542,0.086,0.265,0.07,0.066,0.342,1.0,0.197,0.206,0.058
sulphates,0.183,0.261,0.313,0.006,0.371,0.052,0.043,0.149,0.197,1.0,0.094,0.251


In [5]:
vcorr = mcorr[:-1,-1]

idx=np.argsort(vcorr)

names = df.columns[:-1][idx][::-1]
vcorr = vcorr[idx][::-1]

for i,id in enumerate(idx):
    print(id,names[i],vcorr[i])

3 alcohol 0.476
5 volatile acidity 0.391
8 sulphates 0.251
0 citric acid 0.226
4 total sulfur dioxide 0.185
7 density 0.175
6 chlorides 0.129
2 fixed acidity 0.124
9 pH 0.058
1 free sulfur dioxide 0.051
10 residual sugar 0.014


In [6]:
# Modelo con una variable
model = LinearRegression()
model.fit( np.reshape(Xtrain[:,10],(1119,1)), ytrain )
ypred = model.predict( np.reshape(Xtest[:,10],(480,1)) )
print(df.columns[10])
print('MSE',mean_squared_error(ytest,ypred))

alcohol
MSE 0.5165020523532148


In [7]:
# Modelo con dos variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1]], ytrain )
ypred = model.predict( Xtest[:,[10,1]] )
print(df.columns[[10,1]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity'], dtype='object')
MSE 0.446870168793296


In [8]:
# Modelo con 3 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9]] )
print(df.columns[[10,1,9]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates'], dtype='object')
MSE 0.4285649950695498


In [9]:
# Modelo con 4 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2]] )
print(df.columns[[10,1,9,2]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid'], dtype='object')
MSE 0.4320410860150909


In [10]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6]] )
print(df.columns[[10,1,9,2,6]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide'],
      dtype='object')
MSE 0.4268731742603873


In [11]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6,7]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6,7]] )
print(df.columns[[10,1,9,2,6,7]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide', 'density'],
      dtype='object')
MSE 0.426500524426931


In [12]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6,7,4]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6,7,4]] )
print(df.columns[[10,1,9,2,6,7,4]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide', 'density', 'chlorides'],
      dtype='object')
MSE 0.41987972275751095


In [13]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6,7,4,0]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6,7,4,0]] )
print(df.columns[[10,1,9,2,6,7,4,0]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide', 'density', 'chlorides', 'fixed acidity'],
      dtype='object')
MSE 0.415916086491634


In [14]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6,7,4,0,8]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6,7,4,0,8]] )
print(df.columns[[10,1,9,2,6,7,4,0,8]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide', 'density', 'chlorides', 'fixed acidity', 'pH'],
      dtype='object')
MSE 0.4130459292923016


In [15]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6,7,4,0,8,5]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6,7,4,0,8,5]] )
print(df.columns[[10,1,9,2,6,7,4,0,8,5]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide', 'density', 'chlorides', 'fixed acidity', 'pH',
       'free sulfur dioxide'],
      dtype='object')
MSE 0.4117854658657059


In [16]:
# Modelo con 5 variable
model = LinearRegression()
model.fit( Xtrain[:,[10,1,9,2,6,7,4,0,8,5,3]], ytrain )
ypred = model.predict( Xtest[:,[10,1,9,2,6,7,4,0,8,5,3]] )
print(df.columns[[10,1,9,2,6,7,4,0,8,5,3]])
print('MSE',mean_squared_error(ytest,ypred))

Index(['alcohol', 'volatile acidity', 'sulphates', 'citric acid',
       'total sulfur dioxide', 'density', 'chlorides', 'fixed acidity', 'pH',
       'free sulfur dioxide', 'residual sugar'],
      dtype='object')
MSE 0.411234871750419
