# Task 1: Dataset Understanding

## Loading the dataset

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Wine Quality" dataset (repository id 186).
wine_quality = fetch_ucirepo(id=186)

# Features and target are exposed as pandas DataFrames.
X, y = wine_quality.data.features, wine_quality.data.targets

# Dataset-level metadata first, then the per-variable description table.
print(wine_quality.metadata)
print(wine_quality.variables)

{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

In [None]:
# Preview the first five feature rows.
X.head(5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [None]:
# Preview the first five target rows.
y.head(5)

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5


## Number of samples

In [None]:
# Number of rows (samples) in the feature frame.
print(X.shape[0])

6497


## Number of features

In [None]:
# Number of feature columns, followed by their names.
print(X.shape[1])
X.columns.tolist()

11


['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

## Target variable

In [None]:
# Name(s) of the target column(s).
y.columns.tolist()

['quality']

## Check for missing or inconsistent values

In [None]:
# Per-column count of missing feature values.
X.isnull().sum(axis=0)

Unnamed: 0,0
fixed_acidity,0
volatile_acidity,0
citric_acid,0
residual_sugar,0
chlorides,0
free_sulfur_dioxide,0
total_sulfur_dioxide,0
density,0
pH,0
sulphates,0


In [None]:
# Same check through isna(), which pandas documents as an alias of isnull().
X.isna().sum(axis=0)

Unnamed: 0,0
fixed_acidity,0
volatile_acidity,0
citric_acid,0
residual_sugar,0
chlorides,0
free_sulfur_dioxide,0
total_sulfur_dioxide,0
density,0
pH,0
sulphates,0


In [None]:
# Per-column count of missing target values.
y.isnull().sum(axis=0)

Unnamed: 0,0
quality,0


In [None]:
# Same check through isna(), which pandas documents as an alias of isnull().
y.isna().sum(axis=0)

Unnamed: 0,0
quality,0


## Briefly summarize the dataset characteristics.


The dataset contains **11 feature columns** and **1 target column** (`quality`). The **feature columns** hold **float values** and the **target column** holds **integer values**. There is **no missing data**: no NULL or NA entries appear anywhere in the features or the target.

The whole dataset has **6497 rows**. Note that the repository metadata reports 4898 instances, whereas the loaded frame has 6497 rows.

In [None]:
# Per-variable description table (name, role, type, ...) supplied with the dataset.
print(wine_quality.variables)

                    name     role         type demographic  \
0          fixed_acidity  Feature   Continuous        None   
1       volatile_acidity  Feature   Continuous        None   
2            citric_acid  Feature   Continuous        None   
3         residual_sugar  Feature   Continuous        None   
4              chlorides  Feature   Continuous        None   
5    free_sulfur_dioxide  Feature   Continuous        None   
6   total_sulfur_dioxide  Feature   Continuous        None   
7                density  Feature   Continuous        None   
8                     pH  Feature   Continuous        None   
9              sulphates  Feature   Continuous        None   
10               alcohol  Feature   Continuous        None   
11               quality   Target      Integer        None   
12                 color    Other  Categorical        None   

               description units missing_values  
0                     None  None             no  
1                     None  Non

# Task 2: Model Selection

## Linear Regression Models

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# NOTE(review): `clf` is not referenced by any later cell visible here; Experiment 02 builds its own `model2`.
clf = linear_model.Lasso(alpha=0.1) # Lasso: linear regression with L1 regularization, penalty strength alpha

## Random Forest Regressor Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Experiment 03 configuration: 50 trees, maximum depth 10, fixed seed.
regr1 = RandomForestRegressor(n_estimators=50,max_depth=10, random_state=0)
# Experiment 04 configuration: 100 trees, maximum depth 15, fixed seed.
regr2 = RandomForestRegressor(n_estimators=100,max_depth=15, random_state=0)  # as per the experiment table

# Task 3: Experiment Design

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## Experiment 01 - Linear Regression Model
- Hyperparameters : Default
- Preprocessing Steps : None
- Feature Selection Method : All features
- Train-Test Split : 80/20

In [None]:
# 80/20 train-test split; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: linear regression on every feature with default hyperparameters.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Score the baseline model on the held-out 20% and report it to two decimals.
r2_score_value = model.score(X_test, y_test)
print(f"R^2 Score: {r2_score_value:.2f}")

R^2 Score: 0.26


In [None]:
# Predict on the held-out split, then compare against the true targets.
y_pred = model.predict(X_test)

# Mean squared error between true and predicted quality, reported to two decimals.
mse_value = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse_value:.2f}")

Mean Squared Error (MSE): 0.55


## Experiment 02 - Linear Regression Model
- Hyperparameters : Regularization enabled
- Preprocessing Steps : Standardization
- Feature Selection Method : Correlation based
- Train-Test Split : 80/20

### Preprocessing

#### Feature Correlation

In [None]:
# Working copy of the features with the target appended, so one corr() call
# covers feature-target correlations.
# NOTE: the appended column is named 'quantity' although it holds the quality
# target; later cells rely on this exact name.
X_t = X.assign(quantity=y)
X_t

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quantity
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [None]:
# Pairwise correlations between all columns, including the appended target.
corr_matrix = X_t.corr()

# Target column of the matrix, ordered by signed value: strong negative
# correlations therefore end up at the bottom of the listing, not near the top.
corr_with_target = corr_matrix.loc[:, 'quantity'].sort_values(ascending=False)
corr_with_target

Unnamed: 0,quantity
quantity,1.0
alcohol,0.444319
citric_acid,0.085532
free_sulfur_dioxide,0.055463
sulphates,0.038485
pH,0.019506
residual_sugar,-0.03698
total_sulfur_dioxide,-0.041385
fixed_acidity,-0.076743
chlorides,-0.200666


In [None]:
# Correlation-based selection has to rank by *absolute* correlation: a strongly
# negative coefficient is as informative as a strongly positive one. The earlier
# hand-written drop list removed features with negative coefficients (e.g.
# chlorides at -0.20) while keeping near-zero ones such as pH (0.02).
N_SELECTED_FEATURES = 6  # same number of features the original selection kept

# Absolute correlation of each feature with the target (target row excluded).
abs_corr = corr_with_target.drop(labels='quantity').abs()
selected_features = abs_corr.nlargest(N_SELECTED_FEATURES).index.tolist()

# Keep the target column as well; the next cell splits it off into y_t.
# Selecting by column list (rather than dropping) keeps this cell re-runnable.
X_t = X_t[selected_features + ['quantity']]
X_t

Unnamed: 0,citric_acid,residual_sugar,free_sulfur_dioxide,pH,sulphates,alcohol,quantity
0,0.00,1.9,11.0,3.51,0.56,9.4,5
1,0.00,2.6,25.0,3.20,0.68,9.8,5
2,0.04,2.3,15.0,3.26,0.65,9.8,5
3,0.56,1.9,17.0,3.16,0.58,9.8,6
4,0.00,1.9,11.0,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...
6492,0.29,1.6,24.0,3.27,0.50,11.2,6
6493,0.36,8.0,57.0,3.15,0.46,9.6,5
6494,0.19,1.2,30.0,2.99,0.46,9.4,6
6495,0.30,1.1,20.0,3.34,0.38,12.8,7


In [None]:
# Separate the working frame back into the target vector and the feature matrix.
y_t = X_t['quantity']
X_t = X_t.drop(labels='quantity', axis='columns')

In [None]:
# 80/20 split of the selected features, using the same seed as Experiment 01.
X_train, X_test, y_train, y_test = train_test_split(
    X_t,
    y_t,
    test_size=0.2,
    random_state=42,
)

#### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so the test split does not
# contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Apply the training-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Regularized linear model for Experiment 02: Lasso with alpha=0.1, trained on
# the standardized training features.
model2 = linear_model.Lasso(alpha=0.1)
model2.fit(X_train_scaled,y_train)

In [None]:
# Score the Lasso model on the standardized held-out split.
r2_score_value = model2.score(X_test_scaled, y_test)
print(f"R^2 Score: {r2_score_value:.2f}")

R^2 Score: 0.18


In [None]:
# Predict on the standardized held-out split (the model was trained on scaled inputs).
y_pred = model2.predict(X_test_scaled)

# Mean squared error between true and predicted quality, reported to two decimals.
mse_value = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse_value:.2f}")

Mean Squared Error (MSE): 0.61


## Experiment 03 - Random Forest Model
- Hyperparameters : 50 trees, depth = 10
- Preprocessing Steps : None
- Feature Selection Method : All Features
- Train-Test Split : 80/20

In [None]:
# Back to the full feature set: re-split X and y (80/20, same seed), replacing
# the Experiment 02 split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fresh, unfitted forest for Experiment 03 (50 trees, max depth 10, fixed seed);
# same configuration as the `regr1` created in Task 2.
regr1 = RandomForestRegressor(n_estimators=50,max_depth=10, random_state=0)

In [None]:
# y_train is a single-column DataFrame here; flatten it to 1-D so scikit-learn
# does not warn about receiving a column-vector target (the tail of that
# warning is what the recorded output of this cell shows).
regr1.fit(X_train, y_train.to_numpy().ravel())

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Score the random forest on the held-out split.
r2_score_value = regr1.score(X_test, y_test)
print(f"R^2 Score: {r2_score_value:.2f}")

R^2 Score: 0.42


In [None]:
# Predict on the held-out split with the fitted forest.
y_pred = regr1.predict(X_test)

# Mean squared error between true and predicted quality, reported to two decimals.
mse_value = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse_value:.2f}")

Mean Squared Error (MSE): 0.43


## Experiment 04 - Random Forest Model
- Hyperparameters : 100 trees, depth = 15
- Preprocessing Steps : None
- Feature Selection Method : Selected features
- Train-Test Split : 80/20