In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def load_and_scale_data(path):
    df = pd.read_csv(path)
    X = df.drop("quality", axis=1)
    y = df["quality"]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return df, X_scaled, y, scaler

In [2]:
def engineer_features(df):
    df["sulfur_ratio"] = df["free sulfur dioxide"] / (df["total sulfur dioxide"] + 1)
    df["acid_balance"] = df["fixed acidity"] / (df["volatile acidity"] + 0.01)
    return df

In [3]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

def find_best_k(X):
    scores = {}
    for k in range(2, 11):
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(X)
        scores[k] = silhouette_score(X, labels)
    return scores

def train_clustering(X, k=5):
    model = AgglomerativeClustering(n_clusters=k)
    labels = model.fit_predict(X)
    return model, labels

In [4]:
import pandas as pd

def cluster_quality_analysis(y, labels):
    df = pd.DataFrame({
        "quality": y,
        "cluster": labels
    })
    return pd.crosstab(df["quality"], df["cluster"])

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def train_quality_model(X, clusters, y):
    X_augmented = X.copy()
    X_augmented["cluster"] = clusters

    X_train, X_test, y_train, y_test = train_test_split(
        X_augmented, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    return classification_report(y_test, preds), model

In [6]:
from src.data_preprocessing import load_and_scale_data
from src.feature_engineering import engineer_features
from src.clustering import find_best_k, train_clustering
from src.evaluation import cluster_quality_analysis
from src.supervised_model import train_quality_model
import pandas as pd

df, X_scaled, y, scaler = load_and_scale_data("data/winequality-red.csv")

df = engineer_features(df)

# Clustering optimization
scores = find_best_k(X_scaled)
best_k = max(scores, key=scores.get)

cluster_model, clusters = train_clustering(X_scaled, best_k)

print("Cluster vs Quality")
print(cluster_quality_analysis(y, clusters))

df["cluster"] = clusters

report, model = train_quality_model(df.drop("quality", axis=1), clusters, y)
print(report)

ModuleNotFoundError: No module named 'src'