In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score, r2_score, mean_squared_error

import warnings
# Suppresses every warning category for all code run after this point in the
# kernel, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from WineQT.csv (the path is relative to the kernel's working directory).
df = pd.read_csv(filepath_or_buffer='WineQT.csv')

In [None]:
# Preview ten randomly chosen rows. The fixed seed keeps the preview identical
# on every Restart & Run All; 42 is the seed the split and the models in this
# notebook already use.
df.sample(n=10, random_state=42)

In [None]:
# Print a summary of df: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Check correlation between numerical features.
# select_dtypes restricts the frame to numeric columns first, so a text column
# cannot make DataFrame.corr raise (pandas >= 2.0 no longer skips non-numeric
# columns by default).
plt.figure(figsize=(15, 10))
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True,       # write the coefficient inside each cell
    fmt='.2f',        # two decimals per coefficient
    cmap='coolwarm',
)
plt.title('Correlation matrix of numeric features')
plt.show()

In [None]:
# Target is the 'quality' column; features are the remaining columns.
# NOTE(review): WineQT.csv exports normally carry an 'Id' row-number column.
# It is dropped here when present so the model cannot learn from row order;
# errors='ignore' makes the extra label a no-op when the column is absent.
# Confirm the column list against df.info().
y = df['quality']  # still raises KeyError when the target column is missing
X = df.drop(columns=['quality', 'Id'], errors='ignore')

In [None]:
# Hold out 20% of the rows for evaluation; the seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline forest: 100 trees and a fixed seed, all other settings left at the
# library defaults. The fit call stays last so the cell displays the estimator.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)

In [None]:
# Predictions of the baseline forest on the held-out rows.
y_pred = rf.predict(X=X_test)

In [None]:
# Report each metric as "<label> <value>", one per line, for the current y_pred.
for metric_label, metric_fn in (('MSE:', mean_squared_error), ('R2 score:', r2_score)):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Second forest with every constructor argument written out explicitly,
# grouped by what the argument controls.
rf_parameters = RandomForestRegressor(
    # --- ensemble ---
    n_estimators=100,                  # number of trees in the forest
    bootstrap=True,                    # fit each tree on a bootstrap sample of the training rows
    oob_score=False,                   # do not compute an out-of-bag score
    warm_start=False,                  # fit() does not reuse trees from a previous fit() call
    # --- split search ---
    criterion='squared_error',         # split quality measured by mean squared error
    max_features="sqrt",               # how many features are examined at each split
    min_impurity_decrease=0.0,         # impurity decrease a split must reach to be made
    # --- tree size ---
    max_depth=None,                    # no cap on tree depth
    max_leaf_nodes=None,               # no cap on the number of leaves
    min_samples_split=2,               # samples an internal node needs before it may split
    min_samples_leaf=1,                # samples every leaf must keep
    min_weight_fraction_leaf=0.0,      # weighted fraction of samples every leaf must keep
    ccp_alpha=0.0,                     # cost-complexity pruning strength
    # --- runtime ---
    n_jobs=None,                       # parallel jobs for fit and predict
    random_state=42,                   # seed for the forest's randomness
    verbose=0,                         # verbosity while fitting and predicting
)

# Kept as the last expression so the cell displays the fitted estimator.
rf_parameters.fit(X=X_train, y=y_train)

In [None]:
# Overwrite y_pred with the predictions of the explicitly configured forest.
y_pred = rf_parameters.predict(X=X_test)

In [None]:
# Report each metric as "<label> <value>", one per line, for the current y_pred.
for metric_label, metric_fn in (
    ("Mean Squared Error:", mean_squared_error),
    ("R^2 Score:", r2_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Per-feature importance scores of the fitted forest `rf`.
# NOTE(review): this reads the first model (`rf`), not `rf_parameters`, which is
# fitted later in the notebook — confirm which model the importance report
# below is meant to describe.
importances = rf.feature_importances_

In [None]:
# Pair every training column with its importance score, then order the table
# from most to least important and print it.
feature_names = X_train.columns
importances_df = pd.DataFrame(
    data={'Feature': feature_names, 'Importance': importances},
)
importances_df = importances_df.sort_values(by='Importance', ascending=False)

print(importances_df)

In [None]:
# Bar chart of the importance table: one bar per feature, in table order.
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=importances_df, x='Importance', y='Feature')
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()