# TASKS 8-10: Validation Techniques, Stratified Sampling, and Categorical/Text Attributes

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the white-wine quality data; the CSV uses ';' as its field separator.
file_path = "winequality-white.csv"  # Update path if needed
df = pd.read_csv(file_path, sep=";")

# Separate the target column ("quality") from the remaining feature columns.
y = df["quality"]
X = df.drop("quality", axis=1)

# Task 8: Validation Techniques

# 1. Holdout validation: fit on 80% of the rows, score on the held-out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse_holdout = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"RMSE - Holdout Validation: {rmse_holdout:.3f}")

# 2. K-Fold cross-validation with 5 shuffled folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# The scorer reports negated MSE per fold, so flip the sign before the root.
fold_neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf)
# Reported value is the average of the per-fold RMSEs.
rmse_kfold = np.sqrt(-fold_neg_mse).mean()
print(f"RMSE - K-Fold (k=5): {rmse_kfold:.3f}")

# 3. Leave-One-Out Cross-Validation (LOOCV)
# Every LOOCV fold holds out exactly one sample, so each fold's "MSE" is a
# single squared error. Taking the square root per fold and then averaging
# would yield the mean absolute error, not the RMSE. Average the squared
# errors over all folds first, then take one square root.
loo = LeaveOneOut()
rmse_loocv = np.sqrt(np.mean(-cross_val_score(model, X, y, cv=loo, scoring="neg_mean_squared_error")))
print(f"RMSE - LOOCV: {rmse_loocv:.3f}")

# Task 9: Stratified Sampling
# One stratified 80/20 shuffle split, stratified on the quality label.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(X, y))  # n_splits=1: take the only pair
X_train_strat, y_train_strat = X.iloc[train_index], y.iloc[train_index]
X_test_strat, y_test_strat = X.iloc[test_index], y.iloc[test_index]

# Compare the label proportions of the full data against both partitions.
original_distribution = y.value_counts(normalize=True)
train_distribution = y_train_strat.value_counts(normalize=True)
test_distribution = y_test_strat.value_counts(normalize=True)
print("Original class distribution:", original_distribution, sep="\n")
print("\nTraining set class distribution:", train_distribution, sep="\n")
print("\nTest set class distribution:", test_distribution, sep="\n")

# Task 10: Handling Categorical and Text Attributes
# Inspect the feature dtypes instead of hard-coding the answer: any column
# that is neither numeric nor boolean is reported as a candidate for encoding.
non_numeric_columns = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()

if non_numeric_columns:
    print(f"\nNon-numeric (possibly categorical or text) attributes detected: {non_numeric_columns}. Consider:")
else:
    print("\nNo categorical or text attributes detected. If present, consider:")
print("- Label Encoding for ordinal categories")
print("- One-Hot Encoding for nominal categories")
print("- TF-IDF or Word Embeddings for text processing")


RMSE - Holdout Validation: 0.754
RMSE - K-Fold (k=5): 0.754
RMSE - LOOCV: 0.585
Original class distribution:
quality
6    0.448755
5    0.297468
7    0.179665
8    0.035729
4    0.033279
3    0.004083
9    0.001021
Name: proportion, dtype: float64

Training set class distribution:
quality
6    0.448698
5    0.297601
7    0.179684
8    0.035733
4    0.033180
3    0.004084
9    0.001021
Name: proportion, dtype: float64

Test set class distribution:
quality
6    0.448980
5    0.296939
7    0.179592
8    0.035714
4    0.033673
3    0.004082
9    0.001020
Name: proportion, dtype: float64

No categorical or text attributes detected. If present, consider:
- Label Encoding for ordinal categories
- One-Hot Encoding for nominal categories
- TF-IDF or Word Embeddings for text processing
