<h1 align="center">Trabalho 3 - Aprendizagem de Máquina</h1>

## Andre Brun
### Daniel Boll & Mateus Karvat
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns; sns.set()

import time

# Analitics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr 

# Linear Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Classifiers
from sklearn.svm import SVR

# Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Configurations
style.use('ggplot')
%matplotlib qt
np.set_printoptions(precision=3, suppress=True)
pd.set_option("display.precision", 3)

In [None]:
data = pd.read_csv('./winequality-red.csv')

# Manteremos uma cópia dos dados originais
# para garantia
raw_data = data.copy()

In [None]:
data.head()

In [None]:
data['quality'].value_counts()

In [None]:
X = data.values[:, :-1]
y = data.values[:, -1]
attributes_size = np.shape(X)[1]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [None]:
each_r = np.zeros((attributes_size, 1))
n = np.shape(X)[0]
k = np.shape(X)[1]
constant = ((n - 1) / (n - k - 1))
for i in range(np.shape(X)[1]):
    rsquared = pearsonr(X[:, i], y)[0] ** 2
    each_r[i] = (1 - (1 - rsquared) * constant)

In [63]:
each_r

array([[ 0.009],
       [ 0.147],
       [ 0.045],
       [-0.007],
       [ 0.01 ],
       [-0.004],
       [ 0.028],
       [ 0.024],
       [-0.004],
       [ 0.057],
       [ 0.221]])

In [None]:
plt.figure(figsize=(18, 15))
plt.subplots_adjust(hspace=0.40, wspace=0.3)

for i in range(attributes_size):
    coluna = X[:, i]

    current_figure = plt.subplot(3, 4, i+1)
    plt.plot(y, coluna, 'b.') 
    current_figure.set_title(f"{data.columns[i].title()}\n R² ajustado = {np.round(each_r[i][0], 3)}")

plt.suptitle("Correlação de cada atributo com a qualidade", size=30)
plt.show()

*TODO*: Análise coeficiente

In [None]:
attributes = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates','alcohol', 'quality']

correlations = data[attributes].corr()

mask = np.zeros_like(data[attributes].corr(), dtype=np.bool) 
mask[np.triu_indices_from(mask)] = True 

f, ax = plt.subplots(figsize=(12, 9))
plt.title("Correlação entre atributos", fontsize=25)

sns.heatmap(correlations, linewidths=.25, vmax=.7, square=True, cmap="YlGn", linecolor="w", annot=True, annot_kws={"size": 13}, mask=mask, cbar_kws={"shrink": .9})

plt.tight_layout()

*TODO*: Análise de correlação entre atributos

----
Classificadores: 
- MultLinReg
- SVM (SRV)
- MLP
- Random Forest

---
### Multiple Linear Regression

In [72]:
mlr = LinearRegression().fit(x_train, y_train)
y_pred = mlr.predict(x_test)

mlr_msq = np.sqrt(mean_squared_error(y_test, y_pred))
mlr_sse = np.sum((y_test - y_pred)**2)

In [75]:
print(" "*10+"MLR")
print("="*23)
print("Root Mean Squared Error")
print("-"*23)
print(f"   {mlr_msq}")
print("="*23)
print("*"*23)
print("="*23)
print("Sum of Squared Errors")
print("-"*23)
print(f"   {mlr_sse}")
print("="*23)

          MLR
Root Mean Squared Error
-----------------------
   0.6429444274500508
***********************
Sum of Squared Errors
-----------------------
   198.4212176587553


---
## SVM (Support Vector Machine) 
### SVR

In [73]:
svr = SVR(kernel="linear").fit(x_train, y_train)
y_pred = svr.predict(x_test)
svr_msq = np.sqrt(mean_squared_error(y_test, y_pred))
svr_sse = np.sum((y_test - y_pred)**2)

In [76]:
print(" "*10+"SVR")
print("="*23)
print("Root Mean Squared Error")
print("-"*23)
print(f"   {svr_msq}")
print("="*23)
print("*"*23)
print("="*23)
print("Sum of Squared Errors")
print("-"*23)
print(f"   {svr_sse}")
print("="*23)

          SVR
Root Mean Squared Error
-----------------------
   0.6406950494931516
***********************
Sum of Squared Errors
-----------------------
   197.03527029361533


---
## MLP (Multi Layer Perceptron)


In [74]:
mlp = MLPRegressor(
    hidden_layer_sizes = (256, 168, 64, 32),
    solver="lbfgs",
    activation="tanh",
    max_iter=1500,
    alpha=0,
    ).fit(x_train, y_train)
y_pred = mlp.predict(x_test)

mlp_msq = np.sqrt(mean_squared_error(y_test, y_pred))
mlp_sse = np.sum((y_test - y_pred)**2)

In [77]:
print(" "*10+"MLP")
print("="*23)
print("Root Mean Squared Error")
print("-"*23)
print(f"   {mlp_msq}")
print("="*23)
print("*"*23)
print("="*23)
print("Sum of Squared Errors")
print("-"*23)
print(f"   {mlp_sse}")
print("="*23)

          MLP
Root Mean Squared Error
-----------------------
   0.6859204040910275
***********************
Sum of Squared Errors
-----------------------
   225.83366435923125


---
## Random Forest Regressor

In [78]:
rfr = RandomForestRegressor(n_estimators=300, n_jobs=-1).fit(x_train, y_train)
y_pred = rfr.predict(x_test)

rfr_msq = np.sqrt(mean_squared_error(y_test, y_pred))
rfr_sse = np.sum((y_test - y_pred)**2)

In [79]:
print(" "*10+"RFR")
print("="*23)
print("Root Mean Squared Error")
print("-"*23)
print(f"   {rfr_msq}")
print("="*23)
print("*"*23)
print("="*23)
print("Sum of Squared Errors")
print("-"*23)
print(f"   {rfr_sse}")
print("="*23)

          RFR
Root Mean Squared Error
-----------------------
   0.5812441158603002
***********************
Sum of Squared Errors
-----------------------
   162.16546666666665


---

In [80]:
table = [
    [mlr_msq, svr_msq, mlp_msq, rfr_msq],
    [mlr_sse, svr_sse, mlp_sse, rfr_sse]
]
result = pd.DataFrame(table,
    index=["MSQ", "SSE"],
    columns=["MLR", "SVR", "MLP", "RFR"]
    )
result.style.highlight_min(color = "forestgreen", axis=1)

Unnamed: 0,MLR,SVR,MLP,RFR
MSQ,0.643,0.641,0.686,0.581
SSE,198.421,197.035,225.834,162.165
