<a href="https://colab.research.google.com/github/A-Burnhard/Wine_data_analysis/blob/main/wine_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [None]:
# Read the wine dataset from "wine.data" into a DataFrame.
# Column names are supplied explicitly, so pandas treats every row of the file
# as data (header=None is what pandas already does when `names` is given).
cols = [
    "class",
    "Alcohol",
    "Malicacid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "0D280_0D315_of_diluted_wines",
    "Proline",
]
wine_data = pd.read_csv("wine.data", header=None, names=cols)
# Last expression of the cell: show the first rows as a sanity check.
wine_data.head()

In [None]:
# Separate the target column from the predictors, then hold out 20% of the rows
# for validation. The fixed random_state keeps the split reproducible.
wine_labels = wine_data["class"]
wine_features = wine_data.drop(columns="class")
(
    wine_train_features,
    wine_val_features,
    wine_train_labels,
    wine_val_labels,
) = train_test_split(
    wine_features,
    wine_labels,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Fit a gini decision tree capped at depth 3 on the training split, then score
# it on the validation split.
wine_tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    random_state=42,
).fit(wine_train_features, wine_train_labels)  # fit() returns the estimator itself

wine_val_predictions = wine_tree.predict(wine_val_features)
wine_classification_report = classification_report(
    wine_val_labels,
    wine_val_predictions,
)
# Heading and report on separate lines, exactly as two consecutive prints would do.
print(
    "Wine Decision Tree Classification Report:",
    wine_classification_report,
    sep="\n",
)


In [None]:
#determine confused class pairs
# Rows are the true validation labels, columns are the tree's predictions.
# pd.crosstab only emits rows/columns for values that actually occur, so a class
# that is never predicted would silently vanish and leave a non-square matrix.
# Reindexing both axes to every class seen in either vector keeps the matrix
# square, with explicit zero counts for the missing combinations.
all_classes = sorted(
    set(wine_val_labels.tolist()) | set(wine_val_predictions.tolist())
)
wine_confusion_matrix = (
    pd.crosstab(wine_val_labels, wine_val_predictions)
    .reindex(index=all_classes, columns=all_classes, fill_value=0)
    # Name both axes; the bare prediction array would otherwise show up as "col_0".
    .rename_axis(index="Actual", columns="Predicted")
)
print("Wine Confusion Matrix:")
print(wine_confusion_matrix)

In [None]:
#selecting a confused pair and identifying the corresponding leaf nodes
# Each tuple is (true class, predicted class): the filter below keeps validation
# rows whose label is the first value and whose prediction is the second.
confused_classes = [(1,2)]
for class1, class2 in confused_classes:
    misclassified_mask = (wine_val_labels == class1) & (wine_val_predictions == class2)
    misclassified_samples = wine_val_features[misclassified_mask]
    print(f"Leaf nodes corresponding to misclassification between class {class1} and class {class2}:")
    if misclassified_samples.empty:
        # DecisionTreeClassifier.apply() validates its input and rejects a frame
        # with zero rows, so report the empty case instead of raising.
        print("No validation samples were misclassified for this class pair.")
        continue
    # apply() returns, for each sample, the index of the leaf it ends up in.
    leaf_nodes = wine_tree.apply(misclassified_samples)
    #print leaf nodes
    print(leaf_nodes)

In [None]:
# Compare an unrestricted-depth tree trained on every attribute against trees
# trained on three disjoint attribute subsets, all scored on the same validation split.
def _fit_and_report(train_features, val_features):
    """Fit a gini tree with no depth limit and evaluate it on the validation split.

    Uses the notebook-level `wine_train_labels` / `wine_val_labels` as targets.
    Returns a tuple of (fitted tree, validation predictions, classification
    report text).
    """
    tree = DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)
    tree.fit(train_features, wine_train_labels)
    predictions = tree.predict(val_features)
    report = classification_report(wine_val_labels, predictions)
    return tree, predictions, report


attribute_subsets = [
    # Subset 1
    ["Alcohol", "Malicacid", "Ash", "Alcalinity_of_ash", "Magnesium"],
    # Subset 2
    ["Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins"],
    # Subset 3
    ["Color_intensity", "Hue", "0D280_0D315_of_diluted_wines", "Proline"],
]

# Baseline: all attributes.
(
    original_tree,
    original_val_predictions,
    original_classification_report,
) = _fit_and_report(wine_train_features, wine_val_features)
print("Original Decision Tree Classification Report:")
print(original_classification_report)

# One tree per attribute subset, trained and scored on the same row split.
for subset in attribute_subsets:
    subset_train_features = wine_train_features[subset]
    subset_val_features = wine_val_features[subset]

    (
        subset_tree,
        subset_val_predictions,
        subset_classification_report,
    ) = _fit_and_report(subset_train_features, subset_val_features)

    print(f"Subset: {subset}")
    print("Subset Decision Tree Classification Report:")
    print(subset_classification_report)