In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Step 1: Load the dataset
# Build a DataFrame whose columns are named after wine.feature_names and
# append the class labels as a 'target' column.
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names).assign(target=wine.target)

In [None]:
# Step 2: Initial EDA
# A notebook cell renders only its last bare expression, so every summary is
# passed to display() explicitly; otherwise only df.dtypes would be shown and
# the head/tail/null-count results would be computed and silently discarded.
display(df.head())          # first rows
display(df.tail())          # last rows
display(df.isnull().sum())  # missing-value count per column
display(df.dtypes)          # column dtypes


In [None]:
# Split data into train and test sets: 20% held out for testing, fixed seed
# so the split is reproducible.
X, y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=3,
    test_size=0.2,
)

In [None]:
# Step 3: Initial Model Training
# Candidate classifiers keyed by the label used when printing results.
# The same estimator instances are re-fitted by each later comparison loop.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=200)),
    ('Decision Tree', DecisionTreeClassifier(random_state=3)),
    ('Random Forest', RandomForestClassifier(random_state=3)),
])

In [None]:
# Baseline: fit every candidate on the raw (unscaled) features and report
# its accuracy on the held-out test split.
print("Initial Model Results:")
for name in models:
    model = models[name]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.2f}")

# Note: problem caused by the unscaled data; the result is only partial.
# NOTE(review): presumably this refers to a LogisticRegression convergence
# warning on unscaled features — confirm.

In [None]:
# Step 4: Add Scaling
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits so the test data does not influence the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Re-fit every candidate on the standardized features and report test accuracy.
print("\nModel Results After Scaling:")
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Step 5: Handle outliers in the 'ash' column (example column)
# Rows are kept only when 'ash' lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
# bounds inclusive. The quartiles are computed on the full DataFrame.
q1, q3 = df['ash'].quantile(0.25), df['ash'].quantile(0.75)
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
df_outliers_removed = df[df['ash'].between(lower_bound, upper_bound)]

In [None]:
# Rebuild the feature matrix / target from the outlier-filtered data and
# split again. NOTE: this overwrites X, y and the train/test variables from
# the earlier split; all later cells use these filtered versions.
X = df_outliers_removed.drop(columns=['target'])
y = df_outliers_removed['target']
# Use the same seed (3) as the first split and the models so that the
# before/after-outlier comparison is not confounded by a different shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
# Re-fit the existing scaler on the new training split, then apply it to the
# new test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Re-fit every candidate on the scaled, outlier-filtered training data and
# report both accuracy and the confusion matrix on the matching test split.
print("\nModel Results After Handling Outliers:")
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(matrix)

In [None]:
# Step 8: Fine-tuning Parameters
# Logistic regression with a larger iteration budget (max_iter=500 instead of
# the 200 used in the comparison loops), trained on the scaled features.
print("\nFine-tuning Logistic Regression:")
log_reg = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)
log_reg_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {log_reg_accuracy:.2f}")

In [None]:
# Depth-limited decision tree (max_depth=3). It is fitted on the unscaled
# feature frame, not on the standardized arrays.
print("\nFine-tuning Decision Tree:")
dec_tree = DecisionTreeClassifier(random_state=3, max_depth=3).fit(X_train, y_train)
y_pred = dec_tree.predict(X_test)
dec_tree_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {dec_tree_accuracy:.2f}")

In [None]:
# Random forest with 308 trees, fitted on the unscaled feature frame.
print("\nFine-tuning Random Forest:")
rand_forest = RandomForestClassifier(random_state=3, n_estimators=308).fit(X_train, y_train)
y_pred = rand_forest.predict(X_test)
rand_forest_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {rand_forest_accuracy:.2f}")

In [None]:
# Step 9: Confusion Matrix Heatmap for Logistic Regression
# Rows of the matrix are the actual classes, columns the predicted classes.
log_reg_test_pred = log_reg.predict(X_test_scaled)
cm = confusion_matrix(y_test, log_reg_test_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Logistic Regression')
plt.show()

In [None]:
# Step 10: Feature Importances for Decision Tree
# Pair each feature name with the fitted tree's importance score and sort
# from most to least important.
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': dec_tree.feature_importances_}
).sort_values(by='Importance', ascending=False)
ax = feature_importances.plot.barh(x='Feature', y='Importance', legend=False)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances - Decision Tree')
# barh draws the first row at the bottom; flip the axis so the most
# important feature appears at the top.
ax.invert_yaxis()
plt.show()

In [None]:
# Step 11: Visualizing Tree 7 in Random Forest
plt.figure(figsize=(20, 10))
plot_tree(
    rand_forest.estimators_[6],  # zero-based index 6 is the seventh tree
    # Pass a plain list: some scikit-learn releases validate feature_names as
    # a list and reject a pandas Index with InvalidParameterError.
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title('Tree 7 in Random Forest')
plt.show()