### Install pandas, numpy, matplotlib, scikit-learn and seaborn

In [None]:

# Install the listed packages into the running kernel's environment.
# `--quiet` suppresses most of pip's progress output.
%pip install pandas numpy matplotlib scikit-learn seaborn --quiet
# Separator line followed by a confirmation message.
print('-' * 60 + "\nAll libaries are installed")

### Import the data and show it

In [None]:

import pandas as pd

# Read the wine data from the CSV file (path is relative to the notebook).
df = pd.read_csv("../data/winequality-red.csv")

print("-" * 60 + "\nShow the first 5 rows:\n")
print(df.head())

print("-" * 60 + "\nShow rows and columns:\n(Rows, Columns)\n")
print(df.shape)

print("-" * 60 + "\nShow all the columns:\n")
# A bare expression is only echoed when it is the last statement of a cell,
# so the column index must be printed explicitly to appear under its heading.
print(df.columns)

print("-" * 60 + "\nShow a description of the data:\n")
print(df.describe())

# Short glossary of the feature columns. Adjacent string literals inside the
# parentheses are concatenated by Python, so no line-continuation is needed.
print("The first 11 columns are chemical molecules inside of wine which are:")
print(
    "fixed acidity = stable acids in wine that do not evaporate and contribute to the wine's taste and structure\n"
    "volatile acidity = gaseous acids, primarily acetic acid, which can give wine a vinegar-like smell\n"
    "citric acid = a weak organic acid found in small amounts in wine grapes, playing a role in the fermentation process\n"
    "residual sugar = natural sugars left in the wine after fermentation has completed\n"
    "chlorides = chloride ions\n"
    "free sulfur dioxide = sulfur dioxide that is available to protect the wine from oxidation and microbial spoilage\n"
    "total sulfur dioxide,"
    "density,"
    "pH,"
    "sulphates,"
    "alcohol"
)
print("\nThen all of the wine was graded with a quality in the 12th column which is a score between 0 - 10")

------------------------------------------------------------
Show the first 5 rows:

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

 

### Check for NaN values


In [None]:
# Count the missing values in each column (isnull is the alias of isna).
missing_mask = df.isnull()
check_nans = missing_mask.sum()
print(check_nans)

### Find and delete duplicates


In [None]:
print("-" * 60 + "\nLooking for duplicates:\n")
# Boolean mask: True for every row that repeats an earlier row.
dups = df.duplicated()
# Report the count; dumping the mask only shows a truncated True/False column
# that does not tell the reader how many duplicates exist.
print(f"{int(dups.sum())} duplicated rows found in {len(df)} rows")

print("-" * 60 + "\nDeleting duplicates:")
# Keep the first occurrence of each row and drop the repeats.
df = df.drop_duplicates()
print("\nDone!")

### Finding outliers and deleting them


In [None]:
print("-" * 60 + "\nFinding and deleting outliers:\n")
print(df.size)

# Only the measured features are screened. `quality` is the label that is
# classified later on; treating it as a feature would discard the rarest
# (lowest and highest) scores as "outliers".
target_col = "quality"
feature_cols = [c for c in df.columns if c != target_col]

# IQR fences (1.5 * IQR beyond the quartiles) for every feature, computed once
# from the frame as it is before any row is removed. Filtering column by
# column inside a loop would compute later quartiles on already-trimmed data,
# making the result depend on the column order.
Q1 = df[feature_cols].quantile(0.25)
Q3 = df[feature_cols].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# A row is kept only if every feature lies inside its own fences (inclusive).
measurements = df[feature_cols]
inside_fences = (
    measurements.ge(lower_bound, axis="columns")
    & measurements.le(upper_bound, axis="columns")
)
# NOTE: `df` is reassigned, so re-running this cell filters the data again.
df = df[inside_fences.all(axis=1)]

print("Done!\n" + "-" * 60 + "\nAfter outlier filtering:\n")
print(df.size)

### Show data with bar chart

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, AutoLocator

def set_granularity(col, deg, dataFrame, ax):
    """Choose y-axis ticks and limits for a column with a narrow value range.

    Args:
        col: Name of the column in ``dataFrame`` that is plotted on ``ax``.
        deg: Number of tick units per 1.0; the tick unit is ``1 / deg``
            (e.g. ``deg=1000`` gives a unit of 0.001).
        dataFrame: Frame holding the plotted data.
        ax: Matplotlib axes whose y-axis locator and limits are set.

    When the data range is below five tick units, major ticks are placed
    every unit and the limits are padded by 1.5 units. Otherwise matplotlib's
    automatic locator is used and the limits are padded by half the range.
    """
    dmin, dmax = dataFrame[col].min(), dataFrame[col].max()
    yrange = dmax - dmin
    unit = 1 / deg

    if yrange < 5 * unit:
        # The previous nested if/elif/else re-tested this same condition, so
        # only its first branch (spacing of one unit) could ever execute; the
        # unreachable 2-unit and 5-unit branches have been removed.
        tick_spacing = unit
        ax.yaxis.set_major_locator(MultipleLocator(tick_spacing))
        padding = tick_spacing * 1.5
    else:
        ax.yaxis.set_major_locator(AutoLocator())
        padding = yrange * 0.5

    ax.set_ylim(dmin - padding, dmax + padding)

In [None]:
def bar_subplots(dataFrame, columns):
    """Draw one bar chart per column, stacked vertically in a single figure.

    Args:
        dataFrame: Frame whose index supplies the x positions and whose
            columns supply the bar heights.
        columns: Iterable of column names to plot, one subplot each.

    Nothing is drawn when ``columns`` is empty.
    """
    columns = list(columns)
    n = len(columns)
    if n == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    # squeeze=False always yields a 2-D array of axes; without it a single
    # column returns a bare Axes object that cannot be zipped over.
    fig, axes = plt.subplots(n, 1, figsize=(8, 3 * n), squeeze=False)

    for ax, col in zip(axes.ravel(), columns):
        ax.bar(dataFrame.index, dataFrame[col])
        ax.set_title(col)
        ax.set_xlabel("Sample nr.")
        ax.set_ylabel(f"Amount of {col}")
        # These two columns get a y-axis fitted to their data range.
        if col in ("density", "pH"):
            set_granularity(col, 1000, dataFrame, ax)

    fig.tight_layout(h_pad=2.0)
    plt.show()

# Draw a bar chart for every column of df.
bar_subplots(df, df.columns)


### Show data with histogram chart


In [None]:
def hist_subplots(df, columns):
    """Draw one 7-bin histogram per column, stacked vertically in one figure.

    Args:
        df: Frame holding the data to plot.
        columns: Iterable of column names to plot, one subplot each.

    Each subplot gets its own colour from a fixed palette; the palette is
    reused from the start if there are more columns than colours. Nothing is
    drawn when ``columns`` is empty.
    """
    from itertools import cycle

    columns = list(columns)
    n = len(columns)
    if n == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    # squeeze=False always yields a 2-D array of axes; without it a single
    # column returns a bare Axes object that cannot be zipped over.
    fig, axes = plt.subplots(n, 1, figsize=(8, 3 * n), squeeze=False)
    colors = ['red','blue','green','orange','purple','brown','orange','gray','cyan','magenta','yellow','black']

    # cycle() keeps zip from silently dropping columns beyond the palette size.
    for ax, col, color in zip(axes.ravel(), columns, cycle(colors)):
        ax.hist(df[col], bins=7, edgecolor='black', color=color, alpha=0.7)
        ax.set_title(col, color=color)
        ax.set_ylabel("Total amount of sample")
        ax.set_xlabel(f"Amount of {col}")

    # Lay out once, after every subplot exists, instead of on each iteration.
    fig.tight_layout()
    plt.show()

# Draw a histogram for every column of df.
hist_subplots(df, df.columns)


### Good quality wine data graphs

In [None]:
# Keep the wines scoring at least 6.5 and renumber the rows from 0.
is_good = df['quality'] >= 6.5
good_quality_wine = df[is_good].reset_index(drop=True)

# Plot every column except the score itself.
feature_columns = good_quality_wine.columns[good_quality_wine.columns != 'quality']

bar_subplots(good_quality_wine, feature_columns)
hist_subplots(good_quality_wine, feature_columns)



### Machine learning

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

import seaborn as sns

# Confirmation message printed once the imports above have run.
print("sklearn libary are now imported!")

### Classifying

In [None]:
# Create a category:
# 1 = high quality
# 0 = low quality
# Binary label derived from the quality score:
#   1 = high quality (score at or above the threshold)
#   0 = low quality
good_quality_threshold = 6.5
is_good_quality = df["quality"] >= good_quality_threshold
# assign() returns a new frame with the label appended; df itself is untouched.
df_cls = df.assign(good_quality=is_good_quality.astype(int))
print("-" * 60 + f"\nClassifing the good quality wine above {good_quality_threshold}, so they are either 1 (high) or 0 (low)\n")
label_preview = df_cls[["quality", "good_quality"]].head(20)
print(label_preview)

In [None]:
# Get all columns execpt quality
x_clf =df[(df.loc[:,df.columns != 'quality']).columns]
y_clf = df_cls[['good_quality']]

X_train, X_test, Y_train, Y_test = train_test_split(
    x_clf, y_clf, test_size=0.2, random_state=42)

### Logistic regression

In [None]:
print("-" * 60 + "\nMaking and traing a ML with the model of Logictic regression")
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    solver="liblinear"
)

# scikit-learn expects 1-D targets; flattening works whether the split
# produced a Series or a single-column DataFrame.
y_train_1d = Y_train.to_numpy().ravel()
y_test_1d = Y_test.to_numpy().ravel()

# Train on the training data
log_reg.fit(X_train, y_train_1d)

# Predict on the held-out test data
y_pred_log = log_reg.predict(X_test)

# Score the positive class (1 = high quality). With average='micro' on a
# binary problem both metrics collapse to the accuracy, so the three printed
# numbers would always be identical.
precision = precision_score(y_true=y_test_1d, y_pred=y_pred_log, average='binary')
recall = recall_score(y_true=y_test_1d, y_pred=y_pred_log, average='binary')

print("\nResults:\n" + '-' * 20)
print(f"Accuracy: {accuracy_score(y_test_1d, y_pred_log):.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}\n")

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
cm_log = confusion_matrix(Y_test, y_pred_log)

# heatmap() returns the axes it drew on; label that axes object directly.
heatmap_ax = sns.heatmap(cm_log, annot=True, fmt="d", cmap="Blues")
heatmap_ax.set(
    title="Confusion Matrix – Logistic Regression",
    xlabel="Prediction",
    ylabel="True value",
)
plt.show()

### Random forest classifier

In [None]:
print("-" * 60 + "\nMaking and traing a ML with the model of Random Forest Classifier")
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

# scikit-learn expects 1-D targets; flattening works whether the split
# produced a Series or a single-column DataFrame.
y_train_1d = Y_train.to_numpy().ravel()
y_test_1d = Y_test.to_numpy().ravel()

rf_clf.fit(X_train, y_train_1d)

y_pred_rf_clf = rf_clf.predict(X_test)

# Score the positive class (1 = high quality). With average='micro' on a
# binary problem both metrics collapse to the accuracy, so the three printed
# numbers would always be identical.
precision = precision_score(y_true=y_test_1d, y_pred=y_pred_rf_clf, average='binary')
recall = recall_score(y_true=y_test_1d, y_pred=y_pred_rf_clf, average='binary')

print("\nResults:\n" + "-" * 20)
print(f"Accuracy: {accuracy_score(y_test_1d, y_pred_rf_clf):.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")

In [None]:
# Confusion matrix of the random-forest predictions on the test set.
cm_rf = confusion_matrix(Y_test, y_pred_rf_clf)

# heatmap() returns the axes it drew on; label that axes object directly.
heatmap_ax = sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Greens")
heatmap_ax.set(
    title="Confusion Matrix – Random Forest",
    xlabel="Prediction",
    ylabel="True value",
)
plt.show()

In [None]:
# Pair each feature name with the importance the fitted forest gave it.
features = x_clf.columns
importances = rf_clf.feature_importances_

df_importance = pd.DataFrame(
    list(zip(features, importances)),
    columns=['Feature', 'Importance'],
)
# Most important feature first.
ranked_importance = df_importance.sort_values(by='Importance', ascending=False)
print(ranked_importance)

### Recommendation

In [None]:
# Summarise every column of the good-quality wines as "mean +- standard deviation".
# The statistics use all rows; taking .head() first would base them on only
# the first five samples.
for col in good_quality_wine.columns:
    col_mean = good_quality_wine[col].mean()
    col_std = good_quality_wine[col].std()
    print(f"{col}: {col_mean:.2f} +- {col_std:.2f}")