In [41]:
# Import necessary libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score, root_mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import plot_tree
from scipy import stats

In [42]:
# Load train and validation datasets
df_train = pd.read_csv("data_files/train.csv")
df_validation = pd.read_csv("data_files/validation.csv")

In [43]:
def knn_reg(X_train, y_train, X_validation, y_validation):
    # Defining the model
    model = KNeighborsRegressor(n_neighbors=12, weights='distance', algorithm='auto', p=1, metric='minkowski')

    # Fitting the model
    model.fit(X_train, y_train)

    # Predicting
    y_pred = model.predict(X_validation)

    print(f"Using KNN Regressor Model: ")

    # Evaluation - MSE, RMSE
    print(f"MSE: {mean_squared_error(y_validation, y_pred)}")
    print(f"RMSE: {root_mean_squared_error(y_validation, y_pred)}")
 
    # R2 Score
    print(f"R2 Score: {r2_score(y_validation, y_pred)}")

In [44]:
# Not wrangling or EDA
# print("Without any data wrangling or EDA:")
# knn_reg(X_train, y_train, X_validation, y_validation)

In [45]:
# Function to replace outliers with median
def replace_with_median(df, name):
    Q1 = df[name].quantile(0.25)
    Q3 = df[name].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    name_median = df[name].median()

    df[name] = np.where((df[name] < lower_limit) | (df[name] > upper_limit), name_median, df[name])
    return df

In [46]:
# High no. of outliers
df_train_to_check = ['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates']
df_validation_to_check = ['residual sugar', 'chlorides', 'density', 'sulphates']

In [47]:
def skew_calc(df):
    skew_vals_rev = {df[name].skew():name for name in df}
    skew_vals_rev_sorted = dict(sorted(skew_vals_rev.items()))
    skew_vals = {value:key for key, value in skew_vals_rev_sorted.items()}
    print(f"Skewness of all features in given dataframe: ")
    for key, value in skew_vals.items():
        print(f"{key} = {value}")

In [48]:
def calculate_z_scores(df, column):
    mean_col = df[column].mean()
    std_dev = df[column].std()
    z_scores = ((df[column] - mean_col) / std_dev).abs()
    return z_scores

In [49]:
skew_calc(df_train)

Skewness of all features in given dataframe: 
density = -0.0721368587006527
quality = 0.0656214905859264
pH = 0.15605144433933577
citric acid = 0.29487725679585225
volatile acidity = 0.7215297851296966
alcohol = 0.9518607069483428
fixed acidity = 1.0062538933982945
free sulfur dioxide = 1.3886754658770064
total sulfur dioxide = 1.4384967043965162
sulphates = 2.35648257662985
residual sugar = 4.117849533942754
chlorides = 5.14337824959792


In [50]:
skew_calc(df_validation)

Skewness of all features in given dataframe: 
density = 0.08037243743089047
pH = 0.21089196121493933
citric acid = 0.43551943208901694
volatile acidity = 0.4515967325978047
quality = 0.4596632384305239
alcohol = 0.7804606875056638
free sulfur dioxide = 0.9038166574998514
fixed acidity = 0.930965965089329
total sulfur dioxide = 1.2129466935375377
sulphates = 3.153049050487913
residual sugar = 4.121244495152995
chlorides = 8.109402367221042


In [51]:
to_log_train = ['fixed acidity', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates', 'residual sugar', 'chlorides']
to_log_valid = ['total sulfur dioxide', 'sulphates', 'residual sugar', 'chlorides']

In [52]:
for name in to_log_train:
    df_train[name] = np.log1p(df_train[name])
for name in to_log_valid:
    df_validation[name] = np.log1p(df_validation[name])

In [53]:
skew_calc(df_train)

Skewness of all features in given dataframe: 
free sulfur dioxide = -0.08503794279569414
density = -0.0721368587006527
total sulfur dioxide = -0.06364767066361943
quality = 0.0656214905859264
pH = 0.15605144433933577
citric acid = 0.29487725679585225
fixed acidity = 0.4647473972322844
volatile acidity = 0.7215297851296966
alcohol = 0.9518607069483428
sulphates = 1.572284601796296
residual sugar = 2.0984027379861296
chlorides = 4.674366660308375


In [54]:
skew_calc(df_validation)

Skewness of all features in given dataframe: 
total sulfur dioxide = 0.02539383443247844
density = 0.08037243743089047
pH = 0.21089196121493933
citric acid = 0.43551943208901694
volatile acidity = 0.4515967325978047
quality = 0.4596632384305239
alcohol = 0.7804606875056638
free sulfur dioxide = 0.9038166574998514
fixed acidity = 0.930965965089329
sulphates = 2.055768975593893
residual sugar = 2.0782371427498028
chlorides = 7.066773403190184


In [55]:
# Use box cox 
to_boxcox_train = ['sulphates', 'residual sugar', 'chlorides']
to_boxcox_valid = ['sulphates', 'residual sugar', 'chlorides']
''' for name in to_boxcox_train:
    X_train[name] = stats.boxcox(X_train[name])[0]
for name in to_boxcox_valid:
    X_validation[name] = stats.boxcox(X_validation[name])[0] '''

' for name in to_boxcox_train:\n    X_train[name] = stats.boxcox(X_train[name])[0]\nfor name in to_boxcox_valid:\n    X_validation[name] = stats.boxcox(X_validation[name])[0] '

In [56]:
threshold = 2
filtered_train = df_train.copy()
for column in to_boxcox_train:
    z_scores = calculate_z_scores(filtered_train, column)
    filtered_train = filtered_train[z_scores < threshold]
    
df_train = filtered_train
skew_calc(df_train)

Skewness of all features in given dataframe: 
free sulfur dioxide = -0.15973714777546028
total sulfur dioxide = -0.10501003295812325
density = -0.0134444930001619
quality = 0.09715628032586926
chlorides = 0.24017030635684658
pH = 0.25159999766302715
citric acid = 0.3448131344419056
sulphates = 0.44080597396456334
fixed acidity = 0.4579953319597294
volatile acidity = 0.7715254514158074
residual sugar = 0.8859488931793944
alcohol = 0.904246782392275


In [57]:
threshold = 2
for column in to_boxcox_train:
    z_scores = calculate_z_scores(df_validation, column)
    df_validation = df_validation[z_scores < threshold]

skew_calc(df_validation)

Skewness of all features in given dataframe: 
density = 0.029668236847751925
total sulfur dioxide = 0.031609031026596845
chlorides = 0.30719482360316397
residual sugar = 0.3496173007248085
pH = 0.384507225535403
citric acid = 0.38991841799625626
volatile acidity = 0.41699044393676876
quality = 0.5065230710999892
sulphates = 0.5403217973817979
alcohol = 0.8703309057690954
free sulfur dioxide = 0.937955290510649
fixed acidity = 0.9422408734808283


In [58]:
skew_calc(df_train)

Skewness of all features in given dataframe: 
free sulfur dioxide = -0.15973714777546028
total sulfur dioxide = -0.10501003295812325
density = -0.0134444930001619
quality = 0.09715628032586926
chlorides = 0.24017030635684658
pH = 0.25159999766302715
citric acid = 0.3448131344419056
sulphates = 0.44080597396456334
fixed acidity = 0.4579953319597294
volatile acidity = 0.7715254514158074
residual sugar = 0.8859488931793944
alcohol = 0.904246782392275


In [59]:
skew_calc(df_validation)

Skewness of all features in given dataframe: 
density = 0.029668236847751925
total sulfur dioxide = 0.031609031026596845
chlorides = 0.30719482360316397
residual sugar = 0.3496173007248085
pH = 0.384507225535403
citric acid = 0.38991841799625626
volatile acidity = 0.41699044393676876
quality = 0.5065230710999892
sulphates = 0.5403217973817979
alcohol = 0.8703309057690954
free sulfur dioxide = 0.937955290510649
fixed acidity = 0.9422408734808283


In [60]:
# Replace outliers with median
for name in df_train_to_check:
    df_train = replace_with_median(df_train, name)

for name in df_validation_to_check:
    df_validation_ = replace_with_median(df_validation, name)

In [61]:
skew_calc(df_train)

Skewness of all features in given dataframe: 
total sulfur dioxide = -0.10501003295812325
free sulfur dioxide = -0.09981536238868703
chlorides = 0.007386370134731044
density = 0.025886775731184843
pH = 0.07958402403740351
quality = 0.09715628032586926
residual sugar = 0.1705413015737153
volatile acidity = 0.2571422861521256
citric acid = 0.3448131344419056
fixed acidity = 0.3973990709744102
sulphates = 0.41541212925968224
alcohol = 0.904246782392275


In [62]:
skew_calc(df_validation)

Skewness of all features in given dataframe: 
residual sugar = -0.05027517631439294
total sulfur dioxide = 0.031609031026596845
density = 0.05412610674559419
chlorides = 0.07833985493316707
pH = 0.384507225535403
citric acid = 0.38991841799625626
volatile acidity = 0.41699044393676876
quality = 0.5065230710999892
sulphates = 0.5202279085105761
alcohol = 0.8703309057690954
free sulfur dioxide = 0.937955290510649
fixed acidity = 0.9422408734808283


In [63]:
# Dividing training data into input features and target features
X_train = df_train.drop('quality', axis = 1)
y_train = df_train['quality']

# Dividing validation data into input features and target features
X_validation = df_validation.drop('quality', axis = 1)
y_validation = df_validation['quality']

In [64]:
knn_reg(X_train, y_train, X_validation, y_validation)

Using KNN Regressor Model: 
MSE: 0.45993961959449514
RMSE: 0.6781884838262112
R2 Score: 0.27617307165462324
