In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from math import sqrt, log

# Define RMSLE (Root Mean Squared Logarithmic Error)
def rmsle(y_true, y_pred):
    return sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

# Load the dataset into a DataFrame
file_path = "winequality-white.csv"  # Replace with the correct file path
df = pd.read_csv(C:\atish\JN)

# Print column names to check for discrepancies
print("Columns in the dataset:")
print(df.columns)

# Strip whitespace from column names (if any)
df.columns = df.columns.str.strip()

# Verify column names after cleaning
print("\nCleaned Columns in the dataset:")
print(df.columns)

# 2. Check for missing values and drop them
print("\nMissing Values in Data:")
print(df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# 3. Check data types for all features
print("\nData Types of Features:")
print(df.dtypes)

# 4. Extract dependent and independent variables
if "alcohol" in df.columns:
    X = df.drop(columns=["alcohol"])  # All columns except 'alcohol'
    y = df["alcohol"]                 # 'alcohol' is the dependent feature
else:
    raise KeyError("The 'alcohol' column is missing from the dataset. Check column names!")

# 5. Split the data into train and test sets (20% test size)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Create a DataFrame to store model performance
results = pd.DataFrame(columns=["Model", "RMSE", "MAPE", "RMSLE"])

# Helper function to evaluate and store results
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on test data
    
    # Calculate metrics
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    rmsle_val = rmsle(y_test, y_pred)
    
    # Append results to the DataFrame
    results.loc[len(results)] = [name, rmse, mape, rmsle_val]

# 7. Build models and evaluate performance
# Linear Regression
lin_reg = LinearRegression()
evaluate_model("Linear Regression", lin_reg, X_train, y_train, X_test, y_test)

# Support Vector Machine (SVM)
svm = SVR()
evaluate_model("SVM", svm, X_train, y_train, X_test, y_test)

# Ridge Regression
ridge = Ridge(alpha=1.0)
evaluate_model("Ridge Regression", ridge, X_train, y_train, X_test, y_test)

# Lasso Regression
lasso = Lasso(alpha=0.1)
evaluate_model("Lasso Regression", lasso, X_train, y_train, X_test, y_test)

# Decision Tree Regressor
dtree = DecisionTreeRegressor(random_state=42)
evaluate_model("Decision Tree", dtree, X_train, y_train, X_test, y_test)

# Display the final comparison DataFrame
print("\nModel Performance Comparison:")
print(results)


Columns in the dataset:
Index(['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'], dtype='object')

Cleaned Columns in the dataset:
Index(['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'], dtype='object')

Missing Values in Data:
fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"    0
dtype: int64

Data Types of Features:
fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"    object
dtype: object


KeyError: "The 'alcohol' column is missing from the dataset. Check column names!"