In [264]:
print("Hello World")

Hello World


In [265]:
# Import necessary libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

In [266]:
# Load train and validation datasets
df_train = pd.read_csv("data_files/train.csv")
df_validation = pd.read_csv("data_files/validation.csv")

In [267]:
df_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.8,0.77,0.0,1.8,0.066,34.0,52.0,0.9976,3.62,0.68,9.9,5
1,7.3,0.58,0.3,2.4,0.074,15.0,55.0,0.9968,3.46,0.59,10.2,5
2,9.8,0.25,0.49,2.7,0.088,15.0,33.0,0.9982,3.42,0.9,10.0,6
3,11.4,0.625,0.66,6.2,0.088,6.0,24.0,0.9988,3.11,0.99,13.3,6
4,7.1,0.72,0.0,1.8,0.123,6.0,14.0,0.99627,3.45,0.58,9.8,5


In [268]:
df_validation.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,8.4,0.635,0.36,2.0,0.089,15.0,55.0,0.99745,3.31,0.57,10.4,4
1,8.3,0.49,0.36,1.8,0.222,6.0,16.0,0.998,3.18,0.6,9.5,6
2,9.0,0.58,0.25,2.0,0.104,8.0,21.0,0.99769,3.27,0.72,9.6,5
3,8.1,0.78,0.1,3.3,0.09,4.0,13.0,0.99855,3.36,0.49,9.5,5
4,7.5,0.38,0.48,2.6,0.073,22.0,84.0,0.9972,3.32,0.7,9.6,4


In [269]:
# Dividing training data into input features and target features
X_train = df_train.drop('quality', axis = 1)
y_train = df_train['quality']

In [270]:
# Dividing validation data into input features and target features
X_validation = df_validation.drop('quality', axis = 1)
y_validation = df_validation['quality']

In [271]:
# Function for LinearRegression Model
def linear_regression(X_train, y_train, X_validation, y_validation):
    # Defining the model
    model = LinearRegression()

    # Fitting the model
    model.fit(X_train, y_train)

    # Predicting
    y_pred = model.predict(X_validation)

    print(f"Using Linear Regression Model: ")

    # Error
    print(f"MSE: {mean_squared_error(y_validation, y_pred)}")
    print(f"RMSE: {root_mean_squared_error(y_validation, y_pred)}")

    # R2 Score
    print(f"R2 Score: {r2_score(y_validation, y_pred)}")

In [272]:
# Not wrangling or EDA
print("Without any data wrangling or EDA:")
linear_regression(X_train, y_train, X_validation, y_validation)

Without any data wrangling or EDA:
Using Linear Regression Model: 
MSE: 0.42766180439680423
RMSE: 0.6539585647400027
R2 Score: 0.34485415639050987


In [273]:
# Function to plot histplot of all features of a dataframe
def hist_plot_all(df, data):
    print(f"Visualize Histplot for {data} data")
    for name in df:
        print(f"{name}: ")
        sns.histplot(df[name], kde = True)
        plt.show()



In [274]:
# Function to plot boxplot of all features of a dataframe
def box_plot_all(df, data):
    print(f"Visualizing boxplots of {data} data: ")
    for name in df:
        print(f"{name}: ")
        sns.boxplot(df[name], orient = "h")
        plt.show()

In [275]:
# hist_plot_all(X_train, "train")

In [276]:
# hist_plot_all(X_validation, "validation")

In [277]:
# box_plot_all(X_train, "train")

In [278]:
# box_plot_all(X_validation, "validation")

In [279]:
# Calculate skewness of all features in a dataframe
def skew_calc(df):
    skew_vals_rev = {df[name].skew():name for name in df}
    skew_vals_rev_sorted = dict(sorted(skew_vals_rev.items()))
    skew_vals = {value:key for key, value in skew_vals_rev_sorted.items()}
    print(f"Skewness of all features in given dataframe: ")
    for key, value in skew_vals.items():
        print(f"{key} = {value}")

In [280]:
# Calculate skewness of features in X_train, print in increasing order
skew_calc(X_train)

Skewness of all features in given dataframe: 
density = -0.0721368587006527
pH = 0.15605144433933577
citric acid = 0.29487725679585225
volatile acidity = 0.7215297851296966
alcohol = 0.9518607069483428
fixed acidity = 1.0062538933982945
free sulfur dioxide = 1.3886754658770064
total sulfur dioxide = 1.4384967043965162
sulphates = 2.35648257662985
residual sugar = 4.117849533942754
chlorides = 5.14337824959792


In [281]:
# Function to generate a list of features which have skewness not in range -1 to 1
def skewed_list_fun(df):
    skewed_list = [name for name in df if df[name].skew() > 1 or df[name].skew() < -1]
    return skewed_list

In [282]:
skewed_list = skewed_list_fun(X_train)
print(skewed_list)

['fixed acidity', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates']


In [283]:
skew_calc(X_train[skewed_list])

Skewness of all features in given dataframe: 
fixed acidity = 1.0062538933982945
free sulfur dioxide = 1.3886754658770064
total sulfur dioxide = 1.4384967043965162
sulphates = 2.35648257662985
residual sugar = 4.117849533942754
chlorides = 5.14337824959792


In [284]:
# Initial log transformation to reduce skewness
for name in skewed_list:
    X_train[f"{name}_log"] = np.log1p(X_train[name])
    X_validation[f"{name}_log"] = np.log1p(X_validation[name])

In [285]:
# Skew calculation after log function, X_train
train_skewed_list = skewed_list_fun(X_train)
skew_calc(X_train[train_skewed_list])

Skewness of all features in given dataframe: 
fixed acidity = 1.0062538933982945
free sulfur dioxide = 1.3886754658770064
total sulfur dioxide = 1.4384967043965162
sulphates_log = 1.572284601796296
residual sugar_log = 2.0984027379861296
sulphates = 2.35648257662985
residual sugar = 4.117849533942754
chlorides_log = 4.674366660308375
chlorides = 5.14337824959792


In [286]:
# Skew calculation after log function, X_validation
valid_skewed_list = skewed_list_fun(X_validation)
skew_calc(X_validation[valid_skewed_list])

Skewness of all features in given dataframe: 
total sulfur dioxide = 1.2129466935375377
sulphates_log = 2.055768975593893
residual sugar_log = 2.0782371427498028
sulphates = 3.153049050487913
residual sugar = 4.121244495152995
chlorides_log = 7.066773403190184
chlorides = 8.109402367221042


In [287]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,fixed acidity_log,residual sugar_log,chlorides_log,free sulfur dioxide_log,total sulfur dioxide_log,sulphates_log
0,6.8,0.77,0.0,1.8,0.066,34.0,52.0,0.9976,3.62,0.68,9.9,2.054124,1.029619,0.063913,3.555348,3.970292,0.518794
1,7.3,0.58,0.3,2.4,0.074,15.0,55.0,0.9968,3.46,0.59,10.2,2.116256,1.223775,0.07139,2.772589,4.025352,0.463734
2,9.8,0.25,0.49,2.7,0.088,15.0,33.0,0.9982,3.42,0.9,10.0,2.379546,1.308333,0.084341,2.772589,3.526361,0.641854
3,11.4,0.625,0.66,6.2,0.088,6.0,24.0,0.9988,3.11,0.99,13.3,2.517696,1.974081,0.084341,1.94591,3.218876,0.688135
4,7.1,0.72,0.0,1.8,0.123,6.0,14.0,0.99627,3.45,0.58,9.8,2.091864,1.029619,0.116004,1.94591,2.70805,0.457425


In [288]:
X_validation.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,fixed acidity_log,residual sugar_log,chlorides_log,free sulfur dioxide_log,total sulfur dioxide_log,sulphates_log
0,8.4,0.635,0.36,2.0,0.089,15.0,55.0,0.99745,3.31,0.57,10.4,2.24071,1.098612,0.08526,2.772589,4.025352,0.451076
1,8.3,0.49,0.36,1.8,0.222,6.0,16.0,0.998,3.18,0.6,9.5,2.230014,1.029619,0.200489,1.94591,2.833213,0.470004
2,9.0,0.58,0.25,2.0,0.104,8.0,21.0,0.99769,3.27,0.72,9.6,2.302585,1.098612,0.09894,2.197225,3.091042,0.542324
3,8.1,0.78,0.1,3.3,0.09,4.0,13.0,0.99855,3.36,0.49,9.5,2.208274,1.458615,0.086178,1.609438,2.639057,0.398776
4,7.5,0.38,0.48,2.6,0.073,22.0,84.0,0.9972,3.32,0.7,9.6,2.140066,1.280934,0.070458,3.135494,4.442651,0.530628


In [289]:
skewed_list

['fixed acidity',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'sulphates']

In [290]:
# Removing list of previously skewed features and rename the dataframes
X_train_log = X_train.drop(skewed_list, axis = 1)
X_validation_log = X_validation.drop(skewed_list, axis = 1)

In [291]:
X_train_log.head()

Unnamed: 0,volatile acidity,citric acid,density,pH,alcohol,fixed acidity_log,residual sugar_log,chlorides_log,free sulfur dioxide_log,total sulfur dioxide_log,sulphates_log
0,0.77,0.0,0.9976,3.62,9.9,2.054124,1.029619,0.063913,3.555348,3.970292,0.518794
1,0.58,0.3,0.9968,3.46,10.2,2.116256,1.223775,0.07139,2.772589,4.025352,0.463734
2,0.25,0.49,0.9982,3.42,10.0,2.379546,1.308333,0.084341,2.772589,3.526361,0.641854
3,0.625,0.66,0.9988,3.11,13.3,2.517696,1.974081,0.084341,1.94591,3.218876,0.688135
4,0.72,0.0,0.99627,3.45,9.8,2.091864,1.029619,0.116004,1.94591,2.70805,0.457425


In [292]:
X_validation_log.head()

Unnamed: 0,volatile acidity,citric acid,density,pH,alcohol,fixed acidity_log,residual sugar_log,chlorides_log,free sulfur dioxide_log,total sulfur dioxide_log,sulphates_log
0,0.635,0.36,0.99745,3.31,10.4,2.24071,1.098612,0.08526,2.772589,4.025352,0.451076
1,0.49,0.36,0.998,3.18,9.5,2.230014,1.029619,0.200489,1.94591,2.833213,0.470004
2,0.58,0.25,0.99769,3.27,9.6,2.302585,1.098612,0.09894,2.197225,3.091042,0.542324
3,0.78,0.1,0.99855,3.36,9.5,2.208274,1.458615,0.086178,1.609438,2.639057,0.398776
4,0.38,0.48,0.9972,3.32,9.6,2.140066,1.280934,0.070458,3.135494,4.442651,0.530628


In [293]:
# After filtering log and non-log features of same name, ones with still high skew
print(f"Skew other than -1 to 1: {skewed_list_fun(X_train_log)}")
for name in skewed_list_fun(X_train_log):
    print(f"{name} : {X_train_log[name].skew()}")

Skew other than -1 to 1: ['residual sugar_log', 'chlorides_log', 'sulphates_log']
residual sugar_log : 2.0984027379861296
chlorides_log : 4.674366660308375
sulphates_log : 1.572284601796296


In [294]:
# After some EDA, reducing skewness using log 
linear_regression(X_train_log, y_train, X_validation_log, y_validation)

Using Linear Regression Model: 
MSE: 0.4202819677159197
RMSE: 0.6482915761568399
R2 Score: 0.35615951328301454


In [295]:
print(f"Skew crossing -1 to 1 range for X_train_log: {skewed_list_fun(X_train_log)}")
print(f"Skew crossing -1 to 1 range for X_validation_log: {skewed_list_fun(X_validation_log)}")

Skew crossing -1 to 1 range for X_train_log: ['residual sugar_log', 'chlorides_log', 'sulphates_log']
Skew crossing -1 to 1 range for X_validation_log: ['residual sugar_log', 'chlorides_log', 'sulphates_log']


In [296]:
print(f"Skew of chlorides_log: {X_train_log['chlorides_log'].skew()}")
X_train_log['chlorides_log'] = np.log1p(X_train_log['chlorides_log'])
print(f"Skew of chlorides_log after log function (again): {X_train_log['chlorides_log'].skew()}")
''' Value hasn't really changed so we use another method '''

Skew of chlorides_log: 4.674366660308375
Skew of chlorides_log after log function (again): 4.308872110013141


" Value hasn't really changed so we use another method "

In [297]:
from scipy import stats
skew_calc(X_train_log)

Skewness of all features in given dataframe: 
free sulfur dioxide_log = -0.08503794279569414
density = -0.0721368587006527
total sulfur dioxide_log = -0.06364767066361943
pH = 0.15605144433933577
citric acid = 0.29487725679585225
fixed acidity_log = 0.4647473972322844
volatile acidity = 0.7215297851296966
alcohol = 0.9518607069483428
sulphates_log = 1.572284601796296
residual sugar_log = 2.0984027379861296
chlorides_log = 4.308872110013141


In [298]:
# Box Cox only works for positives, for training - On features with extreme skew
X_train_log['chlorides_log'] = stats.boxcox(X_train_log['chlorides_log'])[0]
X_train_log['residual sugar_log'] = stats.boxcox(X_train_log['residual sugar_log'])[0]
X_train_log['sulphates_log'] = stats.boxcox(X_train_log['sulphates_log'])[0]

In [299]:
skew_calc(X_train_log) # Skewness of all features between -1 to 1 now

Skewness of all features in given dataframe: 
chlorides_log = -0.35271509901878
residual sugar_log = -0.12142250668345902
free sulfur dioxide_log = -0.08503794279569414
density = -0.0721368587006527
total sulfur dioxide_log = -0.06364767066361943
sulphates_log = -0.018920864587467414
pH = 0.15605144433933577
citric acid = 0.29487725679585225
fixed acidity_log = 0.4647473972322844
volatile acidity = 0.7215297851296966
alcohol = 0.9518607069483428


In [300]:
skew_calc(X_validation_log)

Skewness of all features in given dataframe: 
free sulfur dioxide_log = -0.12222308372900223
total sulfur dioxide_log = 0.02539383443247844
density = 0.08037243743089047
pH = 0.21089196121493933
citric acid = 0.43551943208901694
fixed acidity_log = 0.43992593673903413
volatile acidity = 0.4515967325978047
alcohol = 0.7804606875056638
sulphates_log = 2.055768975593893
residual sugar_log = 2.0782371427498028
chlorides_log = 7.066773403190184


In [301]:
# Box Cox only works for positives, for validation
X_validation_log['chlorides_log'] = stats.boxcox(X_validation_log['chlorides_log'])[0]
X_validation_log['residual sugar_log'] = stats.boxcox(X_validation_log['residual sugar_log'])[0]
X_validation_log['sulphates_log'] = stats.boxcox(X_validation_log['sulphates_log'])[0]

In [302]:
skew_calc(X_validation_log) # Skewness of all features between -1 to 1 now

Skewness of all features in given dataframe: 
chlorides_log = -0.2200941123170831
free sulfur dioxide_log = -0.12222308372900223
residual sugar_log = -0.056895803660988474
sulphates_log = -0.04697428770404776
total sulfur dioxide_log = 0.02539383443247844
density = 0.08037243743089047
pH = 0.21089196121493933
citric acid = 0.43551943208901694
fixed acidity_log = 0.43992593673903413
volatile acidity = 0.4515967325978047
alcohol = 0.7804606875056638


In [303]:
# After all skewed features are in range -1 to 1
linear_regression(X_train_log, y_train, X_validation_log, y_validation)

Using Linear Regression Model: 
MSE: 0.63909132888801
RMSE: 0.7994318788289656
R2 Score: 0.020959965320264562


In [304]:
''' Box plot after skewness was limited to -1 to 1 '''
# box_plot_all(X_train_log, "train")

' Box plot after skewness was limited to -1 to 1 '

In [305]:
''' Hist plot after skewness was limited to - 1 to 1 '''
# hist_plot_all(X_train_log, 'train') 

' Hist plot after skewness was limited to - 1 to 1 '

In [306]:
# Using log to decrease skew isn't really helping here
# Substituting the outliers with mean, median, other values