In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import normaltest, skew


# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Load the red-wine quality CSV and preview its first rows.
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
data.head()

In [2]:
data.quality.unique()

In [3]:
# Pairwise Pearson correlation of all columns, drawn as an annotated heatmap
# on an explicitly created 20x10 figure.
corr_mat = data.corr(method='pearson')
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    corr_mat,
    vmax=1,
    square=True,
    annot=True,
    cmap='cubehelix',
    ax=ax,
)

In [4]:
sns.histplot(x='alcohol', data=data, kde=True, hue="quality")


In [5]:
# `sns.distplot` is deprecated in seaborn. A density-scaled `histplot` with a
# KDE overlay is its documented replacement and is already used in this notebook.
sns.histplot(np.log(data['fixed acidity']), kde=True, stat='density', kde_kws=dict(cut=3))



In [6]:
# Show floats with two decimals. No '%' suffix here: this option is global, so
# a suffix would be appended to every float displayed by later cells, not only
# to the "missing, %" column (whose name already carries the unit).
pd.options.display.float_format = '{:.2f}'.format
def missing_values(n):
    """Summarise missing values per column of the DataFrame ``n``.

    Parameters
    ----------
    n : pandas.DataFrame
        Frame to inspect. The argument itself is used, not a global frame.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``n`` with two columns:
        ``"missing, %"`` (share of null rows, in percent) and
        ``"missing, num"`` (count of null rows), sorted by the percentage
        in descending order.
    """
    null_counts = n.isnull().sum()
    # max(..., 1) keeps an empty frame at 0% instead of dividing by zero.
    row_count = max(len(n), 1)
    report = pd.DataFrame({
        "missing, %": null_counts * 100 / row_count,
        "missing, num": null_counts,
    })
    return report.sort_values(by="missing, %", ascending=False)
missing_values(data)

In [7]:
#target_correlation(df_red, target)

def explore_features(df, features, alpha=0.05):
    """Show a box-plot and a histogram for each feature, annotated with stats.

    For every column name in ``features`` a two-panel figure is displayed:

    * left  - box-plot annotated with mean, median, standard deviation and
      whether any value lies strictly outside the 1.5*IQR fences;
    * right - histogram with KDE, annotated with the ``normaltest`` p-value,
      the resulting verdict at level ``alpha`` and the sample skewness.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to explore.
    features : iterable of str
        Column names of ``df`` to plot.
    alpha : float, default 0.05
        Significance level; a p-value below it is reported as "Normal: No".

    Returns
    -------
    None
        Each figure is shown with ``plt.show()`` and then closed.
    """
    for feature in features:
        # Statistics are computed on observed values only: a NaN would make
        # the normality p-value NaN, and `NaN < alpha` is False, which would
        # be reported as "Normal: Yes".
        values = df[feature].dropna()

        fig, ax = plt.subplots(ncols=2, figsize=(16,3))
        fig.suptitle(f'Feature: {feature}', fontsize=14)

        # Boxplot
        sns.boxplot(data=df, x=feature, ax=ax[0], showmeans=True)
        ax[0].set_title('Location and Variability: Box-Plot')

        # Checking for outliers: strictly beyond the 1.5*IQR fences, matching
        # the points the box-plot draws as fliers. The strict comparison also
        # keeps a constant column (IQR == 0) from flagging every value.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        beyond_fences = (values < lower_fence) | (values > upper_fence)
        has_outliers = 'Yes' if beyond_fences.any() else 'No'

        # Presenting the stats
        ax[0].annotate(text=f'Mean: {values.mean():.3f}\n'
                            f'Median: {values.median():.3f}\n'
                            f'StdDev: {values.std():.3f}\n'
                            f'Outliers: {has_outliers}',
                       xy=(1,1), xytext=(-12,-12),
                       xycoords='axes fraction', textcoords='offset points',
                       ha='right', va='top',
                       bbox=dict(facecolor='grey', alpha=0.2, pad=12))

        # Histogram
        sns.histplot(data=df, x=feature, ax=ax[1], kde=True)
        ax[1].set_title('Distribution: Histogram')
        ax[1].yaxis.label.set_visible(False)

        # Normality stats
        _, pvalue = normaltest(values)
        skewness = skew(values)

        # Normality test: reject normality when the p-value is below alpha
        normal = 'No' if pvalue < alpha else 'Yes'

        # Presenting the stats
        ax[1].annotate(text=f'Normal: {normal}\n'
                            f'p-value: {pvalue:.3f}\n'
                            f'Skew: {skewness:.3f}',
                       xy=(1,1), xytext=(-12,-12),
                       xycoords='axes fraction', textcoords='offset points',
                       ha='right', va='top',
                       bbox=dict(facecolor='grey', alpha=0.2, pad=12))

        plt.show()
        # Release the figure so a long feature list does not accumulate
        # open figures.
        plt.close(fig)
    

In [8]:
features=['fixed acidity', 'volatile acidity', 'citric acid', 
            'residual sugar', 'chlorides', 'free sulfur dioxide', 
            'total sulfur dioxide', 'density', 'pH', 
            'sulphates', 'alcohol']

In [9]:
explore_features(data,features)

In [10]:
target = 'quality'

In [11]:
fig, ax = plt.subplots(figsize=(16,2))

# The one-row correlation table gets its own name so the raw frame in `data`
# is not overwritten; rebinding `data` here would break any later use of the
# dataset and make this cell fail when re-run.
target_corr = data.corr().loc[[target],:].drop(target, axis=1)
sns.heatmap(data=target_corr, cmap='cividis', annot=True, ax=ax)
ax.set_title(f'Correlation (Pearson) of the features x {target}', fontsize=14)
ax.set_xlabel('Features')
ax.tick_params(axis='x', labelrotation=45)

plt.show()

In [12]:
from sklearn import metrics

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix


Regression models are not evaluated with accuracy the way classification models are. Instead, metrics such as the mean squared error or the coefficient of determination (R²) are computed. These metrics show how closely the predicted values match the known values, or how closely the data fit the regression line.

Using regression models

In [13]:
!pip install pyforest

In [14]:
pip install lazypredict

In [15]:
# Importing important libraries
import pyforest
import lazypredict
from lazypredict.Supervised import LazyRegressor
from pandas.plotting import scatter_matrix

# Scikit-learn packages
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Setting up max columns displayed to 100
pd.options.display.max_columns = 100

In [16]:
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv') 
data.head()

In [17]:
data.info()

In [18]:
data.describe()

In [19]:
# Removing nan and infinite values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

In [20]:
data.head()

In [21]:
# Imported explicitly: none of the import cells above brings
# `train_test_split` into scope, so the split below would otherwise depend on
# a name injected from outside this notebook.
from sklearn.model_selection import train_test_split

# Features are every column except the target column 'quality'.
x = data.drop(columns='quality')
y = data.quality
# Hold out 25% of the rows; random_state fixes the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=3, test_size=0.25)

In [22]:
# Run LazyRegressor on the train/test split created above.
# `fit` returns two objects, bound to `models` and `predictions`; they are
# displayed in the following cells (presumably a per-model score table and the
# per-model predictions - confirm against the lazypredict documentation).
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(x_train, x_test, y_train, y_test)
#print(models)

In [23]:
models

In [24]:
predictions

In [25]:
# Double Checking Results
# Current scikit-learn exposes HistGradientBoostingRegressor directly from
# sklearn.ensemble; only older releases require the experimental flag first.
try:
    from sklearn.ensemble import HistGradientBoostingRegressor
except ImportError:
    # Older scikit-learn: explicitly require the experimental feature,
    # after which the estimator can be imported normally from ensemble.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingRegressor

In [26]:
 #Evaluation Functions
def rmse(model, y_test, y_pred, x_train, y_train):
    """Print the R-squared and the root mean squared error of predictions.

    Both metrics are computed from ``y_test`` and ``y_pred`` only, so the
    function no longer depends on a global ``x_test`` and the two printed
    numbers always describe the same set of predictions.

    Parameters
    ----------
    model : fitted estimator
        Kept for call compatibility; not used by the computation.
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    x_train, y_train : array-like
        Kept for call compatibility; not used.

    Returns
    -------
    None
        The two metrics are printed.
    """
    r_squared = metrics.r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Local name differs from the function name to avoid shadowing it.
    root_mse = np.sqrt(mse)
    print("R-squared: " + str(r_squared))
    # The printed value is the square root of the MSE, so label it as RMSE.
    print("Root Mean Squared Error: " + str(root_mse))


In [27]:
# Create model line scatter plot
def scatter_plot(y_test, y_pred, model_name):
    """Draw a residual plot of ``y_pred`` regressed on ``y_test``.

    Parameters
    ----------
    y_test : array-like
        True target values, plotted on the x-axis.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    model_name : str
        Appended to the figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10,6))
    # x and y are passed by keyword: current seaborn releases do not accept
    # them positionally.
    sns.residplot(x=y_test, y=y_pred, lowess=True, color='#4682b4',
                  line_kws={'lw': 2, 'color': 'r'})
    plt.title(str('Quality vs Residuals for '+ model_name))
    plt.xlabel('Quality',fontsize=16)
    plt.ylabel('Residuals',fontsize=16)
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.show()

In [28]:
# Histogram-based Gradient Boosting Regression Tree
hist = HistGradientBoostingRegressor()
hist.fit(x_train, y_train)
y_pred = hist.predict(x_test)

In [29]:
rmse(hist, y_test,y_pred, x_train, y_train)

In [30]:
# Title text fixed: "Histogram" was misspelled in the rendered plot title.
scatter_plot(y_test, y_pred, 'Histogram-based Gradient Boosting Regression Tree')

In [31]:
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
mod = GradientBoostingRegressor()
mod.fit(x_train, y_train)
# Predict with the model fitted in this cell (`mod`), not with the earlier
# histogram-based model, so the metrics that follow describe this estimator.
y_pred = mod.predict(x_test)

In [32]:
rmse(mod, y_test,y_pred, x_train, y_train)

In [33]:
# define the model
mod = GradientBoostingRegressor()
# fit the model on the training split
mod.fit(x_train, y_train)
# make a single prediction; the row is wrapped in a DataFrame carrying the
# training column names so features are matched by name, as during fitting
row=[7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.9968,3.20,0.68,9.8]
yhat = mod.predict(pd.DataFrame([row], columns=x_train.columns))
# summarize prediction: the regressor returns a float, which '%d' would
# silently truncate to an integer
print('Prediction: %.2f' % yhat[0])

In [34]:
scatter_plot(y_test, y_pred, ' Gradient Boosting Regression Tree')

In [35]:
# Double Checking Multiple Linear Regression

from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split, then predict the held-out
# rows. `regressor` and `y_pred` are module-level names reused by the
# evaluation cells that follow.
regressor = LinearRegression()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

In [36]:
rmse(regressor, y_test,y_pred, x_train, y_train)

In [37]:
# Title text fixed: "Multiple" was misspelled, and the leading space produced a
# double space after "for" in the rendered title.
scatter_plot(y_test, y_pred, 'Multiple Linear Regression')

In [38]:
# Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
# Rebinds `regressor` (previously the linear model) to a decision tree fitted
# on the training split, predicts the held-out rows and prints the metrics.
# NOTE(review): no random_state is passed, so results may differ between runs.
regressor = DecisionTreeRegressor()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)
rmse(regressor, y_test,y_pred, x_train, y_train)

In [41]:
# Note: this R-squared is measured on the training split, not on held-out data.
score = regressor.score(x_train, y_train)
print("R-squared:", score)
ypred = regressor.predict(x_test)

mse = mean_squared_error(y_test, ypred)
print("MSE: " + str( mse))
# RMSE is the square root of the MSE (the previous `mse*(1/2.0)` halved it).
print("RMSE: ", np.sqrt(mse))

In [None]:
## Random Forest
from sklearn.ensemble import RandomForestRegressor
# Rebinds `regressor` to a random forest fitted on the training split; the
# following cells score it and predict with it.
# NOTE(review): no random_state is passed, so results may differ between runs.
regressor = RandomForestRegressor()
regressor.fit(x_train, y_train)
#rmse(regressor, y_test,y_pred, x_train, y_train)

In [None]:
score=regressor.score(x_train, y_train)
print("R-squared: ", score)

In [None]:
# Evaluate the current `regressor` on the held-out split.
y_pred = regressor.predict(x_test)
mse=mean_squared_error(y_test, y_pred)
print('MSE: ', mse)
# RMSE is the square root of the MSE; this replaces a commented-out line that
# computed half the MSE instead.
print('RMSE: ', np.sqrt(mse))

In [None]:
df=pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv') 
df.head()

In [None]:
# Make prediction
# Imported explicitly so this cell does not depend on the name being injected
# from elsewhere.
from sklearn.model_selection import train_test_split

df=pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
x=df.drop(columns='quality')
y=df.quality
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=3,test_size=0.25)
model = RandomForestRegressor(n_estimators = 10, random_state = 0)
model.fit(x_train, y_train)
# Predict with the model fitted just above, not with the `regressor` left over
# from an earlier cell. `y_test` is a Series and is left untouched: assigning
# the prediction array to a single label of it does not add a column.
y_pred = model.predict(x_test)

# Index the predictions by the test rows so the left merge aligns them with
# the original frame; rows used for training get NaN in 'y_pred'.
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_test.index.copy())
df_out = pd.merge(df, y_hats_df, how = 'left', left_index = True, right_index = True)

In [None]:
df_out