<a href="https://colab.research.google.com/github/A-Burnhard/Wine_data_analysis/blob/main/wine_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the wine dataset. The names below are supplied to read_csv as column
# labels: the class label first, followed by the 13 measured attributes.
cols = [
    "class",
    "Alcohol",
    "Malicacid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "0D280_0D315_of_diluted_wines",
    "Proline",
]
# header=None spells out what passing `names` already implies: every row of
# the file is treated as data, none as a header.
wine_data = pd.read_csv("wine.data", header=None, names=cols)
wine_data.head()

In [3]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for validation (fixed seed so the partition is reproducible).
wine_labels = wine_data["class"]
wine_features = wine_data.drop(columns=["class"])
(
    wine_train_features,
    wine_val_features,
    wine_train_labels,
    wine_val_labels,
) = train_test_split(wine_features, wine_labels, test_size=0.2, random_state=42)



**Constructing decision trees**

In [20]:
from sklearn.metrics import classification_report
# Number of train/validation partitions to draw; one tree is fitted per partition.
num_trees = 3

# Each entry is a (fitted tree, text classification report) pair.
trees = []

for split_seed in range(num_trees):
    # The loop index doubles as the split seed, so every tree is trained on a
    # different but reproducible 80/20 partition of the data.
    wine_train_features, wine_val_features, wine_train_labels, wine_val_labels = train_test_split(
        wine_features, wine_labels, test_size=0.2, random_state=split_seed
    )

    # Depth-limited Gini tree; the estimator's own seed is held constant so
    # differences between trees come from the partition only.
    wine_tree = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=42)
    wine_tree.fit(wine_train_features, wine_train_labels)

    # Score the tree on the rows it was not trained on.
    wine_val_predictions = wine_tree.predict(wine_val_features)
    wine_classification_report = classification_report(wine_val_labels, wine_val_predictions)

    trees.append((wine_tree, wine_classification_report))

# NOTE: once the loop finishes, wine_train_*/wine_val_*, wine_tree and
# wine_val_predictions all refer to the LAST partition and tree only.

# Print every stored report. The loop targets deliberately avoid the name
# `classification_report`: rebinding it here would replace the imported
# scikit-learn function with a string and break every later call to it.
for tree_number, (_tree, report_text) in enumerate(trees, start=1):
    print(f"Decision Tree {tree_number} Classification Report:")
    print(report_text)
    print()

Decision Tree 1 Classification Report:
              precision    recall  f1-score   support

           1       0.93      1.00      0.97        14
           2       1.00      0.94      0.97        16
           3       1.00      1.00      1.00         6

    accuracy                           0.97        36
   macro avg       0.98      0.98      0.98        36
weighted avg       0.97      0.97      0.97        36


Decision Tree 2 Classification Report:
              precision    recall  f1-score   support

           1       0.93      0.93      0.93        14
           2       0.75      0.92      0.83        13
           3       1.00      0.67      0.80         9

    accuracy                           0.86        36
   macro avg       0.89      0.84      0.85        36
weighted avg       0.88      0.86      0.86        36


Decision Tree 3 Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.89      0.94        18
          

**Visualizing classification report**

In [None]:
# Visualizing Classification report for the three trees
import matplotlib.pyplot as plt

# Layout shared by the three grouped bar charts below.
bar_width = 0.25
index = [1, 2, 3]
labels = ['Class 1', 'Class 2', 'Class 3']


def _parse_class_metrics(report_text, n_classes=3):
    """Pull per-class precision, recall and F1 out of a text classification report.

    The text report starts with a header line and a blank line, followed by
    one row per class laid out as "<label> <precision> <recall> <f1> <support>".

    Returns a 3-tuple of lists (precision, recall, f1), each with one float
    per class in the order the rows appear in the report. The values carry
    only the two decimals printed in the text report.
    """
    class_rows = report_text.split("\n")[2:2 + n_classes]
    # Drop the leading class label from every row; keep the numeric columns.
    numeric_columns = [row.split()[1:] for row in class_rows]
    return tuple(
        [float(columns[metric_pos]) for columns in numeric_columns]
        for metric_pos in range(3)
    )


def _plot_metric_by_class(metric_values, metric_name):
    """Draw one grouped bar chart of `metric_name`: one group per class, one bar per tree.

    `metric_values` holds one list of per-class values for each tree. Bar
    positions and tick labels come from the notebook-level `index`,
    `bar_width` and `labels`.
    """
    plt.figure(figsize=(10, 5))

    for tree_pos, per_class_values in enumerate(metric_values):
        bar_positions = [x + tree_pos * bar_width for x in index]
        plt.bar(bar_positions, per_class_values, width=bar_width, label=f"Tree {tree_pos + 1}")

    plt.xlabel('Classes')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} for Decision Trees')
    # Centre each tick under its group of bars for any number of trees
    # (for three trees this offset equals one bar width).
    group_centre_offset = bar_width * (len(metric_values) - 1) / 2
    plt.xticks([x + group_centre_offset for x in index], labels)
    plt.legend()
    plt.show()


# Per-tree lists of per-class metrics, in the same order as `trees`.
precision = []
recall = []
f1_score = []

for _tree, wine_classification_report in trees:
    tree_precision, tree_recall, tree_f1 = _parse_class_metrics(wine_classification_report)
    precision.append(tree_precision)
    recall.append(tree_recall)
    f1_score.append(tree_f1)

# One figure per metric.
_plot_metric_by_class(precision, 'Precision')
_plot_metric_by_class(recall, 'Recall')
_plot_metric_by_class(f1_score, 'F1-Score')


**Determining confused class pairs**

In [6]:
# Cross-tabulate true labels (rows) against predicted labels (columns) to see
# which classes get confused with each other.
# NOTE: wine_val_labels / wine_val_predictions are last assigned inside the
# tree-building loop, so this matrix describes only the final partition and tree.
wine_confusion_matrix = pd.crosstab(index=wine_val_labels, columns=wine_val_predictions)
print("Wine Confusion Matrix:", wine_confusion_matrix, sep="\n")

Wine Confusion Matrix:
col_0   1  2  3
class          
1      16  2  0
2       0  9  0
3       0  0  9


**Visualizing Confusion Matrix**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap: integer counts in each
# cell, predicted class on the x-axis and true class on the y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(wine_confusion_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Class')
ax.set_ylabel('True Class')
ax.set_title('Wine Confusion Matrix')
plt.show()


In [None]:
# (true class, predicted class) pairs to inspect. For each pair, report the
# leaf nodes of wine_tree that the misclassified validation rows end up in.
confused_classes = [(1, 2)]
for class1, class2 in confused_classes:
    # Rows whose true label is class1 but which the tree predicted as class2.
    is_confused = (wine_val_labels == class1) & (wine_val_predictions == class2)
    misclassified_samples = wine_val_features[is_confused]

    print(f"Leaf nodes corresponding to misclassification between class {class1} and class {class2}:")
    if misclassified_samples.empty:
        # Passing a zero-row frame to the tree raises a ValueError in
        # scikit-learn's input validation, so report the empty case instead.
        print("(no misclassified samples for this pair)")
        continue

    # apply() returns, for each row, the index of the leaf it lands in.
    leaf_nodes = wine_tree.apply(misclassified_samples)
    print(leaf_nodes)

In [None]:
# Compare a tree trained on every attribute with trees trained on disjoint
# groups of attributes.
attribute_subsets = [
    ["Alcohol", "Malicacid", "Ash", "Alcalinity_of_ash", "Magnesium"],  # Subset 1
    ["Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins"],  # Subset 2
    ["Color_intensity", "Hue", "0D280_0D315_of_diluted_wines", "Proline"],  # Subset 3
]


def _fit_and_report(train_features, val_features):
    """Fit a Gini tree with no depth limit and evaluate it on the validation rows.

    Targets come from the notebook-level wine_train_labels / wine_val_labels.
    Returns (fitted tree, validation predictions, text classification report).
    """
    fitted_tree = DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)
    fitted_tree.fit(train_features, wine_train_labels)
    val_predictions = fitted_tree.predict(val_features)
    report_text = classification_report(wine_val_labels, val_predictions)
    return fitted_tree, val_predictions, report_text


# Baseline: all attributes.
original_tree, original_val_predictions, original_classification_report = _fit_and_report(
    wine_train_features, wine_val_features
)
print("Original Decision Tree Classification Report:")
print(original_classification_report)

# One tree per attribute group, trained and scored on the same rows as the baseline.
for subset in attribute_subsets:
    subset_train_features = wine_train_features[subset]
    subset_val_features = wine_val_features[subset]

    subset_tree, subset_val_predictions, subset_classification_report = _fit_and_report(
        subset_train_features, subset_val_features
    )

    print(f"Subset: {subset}")
    print("Subset Decision Tree Classification Report:")
    print(subset_classification_report)