# **Classification Using Decision Tree**

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


# Load the labelled dataset. The "quality" column is the target; every other
# column is used as a feature.
data = pd.read_csv("wine_data.csv")

y = data["quality"]
X = data.drop(columns="quality")

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [34]:
# Task 1: fit a size-limited decision tree on the training partition and
# report accuracy and confusion matrices for both partitions.
# random_state is fixed so that refitting yields the same tree.
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_leaf=20,
    min_samples_split=5,
).fit(X_train, y_train)

# Training-partition metrics (optimistic: the model has seen these rows).
y_train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_cm = confusion_matrix(y_train, y_train_pred)

# Validation-partition metrics (held-out rows).
y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_cm = confusion_matrix(y_val, y_val_pred)

print("Task 1: Train-Test Split Results")
print(f"Training Accuracy: {train_accuracy:.4f}")
print("Training Confusion Matrix:\n", train_cm)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Confusion Matrix:\n", val_cm)





Task 1: Train-Test Split Results
Training Accuracy: 0.7967
Training Confusion Matrix:
 [[417  93]
 [128 449]]
Validation Accuracy: 0.7279
Validation Confusion Matrix:
 [[ 92  36]
 [ 38 106]]


In [35]:

# Task 2: score the already-fitted `clf` on the separately supplied test file.
# NOTE(review): assumes the test file has the same feature columns as
# wine_data.csv — confirm against the data.
dummy_test = pd.read_csv("wine_data_test.csv")
y_dummy_test = dummy_test["quality"]
X_dummy_test = dummy_test.drop(columns="quality")

y_dummy_pred = clf.predict(X_dummy_test)

dummy_cm = confusion_matrix(y_dummy_test, y_dummy_pred)
dummy_accuracy = accuracy_score(y_dummy_test, y_dummy_pred)

print("\nTask 2: Dummy Test Set Results")
print(f"Dummy Test Accuracy: {dummy_accuracy:.4f}")
print("Dummy Test Confusion Matrix:\n", dummy_cm)



Task 2: Dummy Test Set Results
Dummy Test Accuracy: 0.7800
Dummy Test Confusion Matrix:
 [[36 12]
 [10 42]]


In [36]:

# Task 3: 10-fold stratified cross-validation over the full dataset (X, y).
# Shuffling with a fixed seed keeps the fold assignment reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
fold_accuracies = []

print("\nTask 3: 10-Fold Stratified Cross-Validation Results")
for i, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    # skf.split yields positional indices, hence .iloc rather than .loc.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Build a fresh estimator for every fold and bind it ONLY to `clf_cv`.
    # Rebinding the global `clf` here would replace the Task 1 model with a
    # tree trained on the last fold's subset, so re-running the Task 2 cell
    # afterwards would silently evaluate a different model.
    clf_cv = DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=20,
        random_state=0,
    )
    clf_cv.fit(X_train_fold, y_train_fold)
    y_pred_fold = clf_cv.predict(X_test_fold)
    acc = accuracy_score(y_test_fold, y_pred_fold)
    fold_accuracies.append(acc)
    print(f"Fold {i} Accuracy: {acc:.4f}")

# Unweighted mean of the per-fold accuracies.
print(f"Average Cross-Validation Accuracy: {sum(fold_accuracies)/len(fold_accuracies):.4f}")


Task 3: 10-Fold Stratified Cross-Validation Results
Fold 1 Accuracy: 0.6838
Fold 2 Accuracy: 0.7132
Fold 3 Accuracy: 0.7794
Fold 4 Accuracy: 0.6765
Fold 5 Accuracy: 0.7941
Fold 6 Accuracy: 0.7132
Fold 7 Accuracy: 0.7132
Fold 8 Accuracy: 0.7206
Fold 9 Accuracy: 0.6471
Fold 10 Accuracy: 0.6963
Average Cross-Validation Accuracy: 0.7137


# **Classification Using SVM**

In [32]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


# Load the labelled dataset; "quality" is the target, the rest are features.
data = pd.read_csv("wine_data.csv")

X = data.drop("quality", axis=1)
y = data["quality"]

# Same 80/20 stratified split and seed as the decision-tree section, so the
# two models are compared on identical partitions.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# NOTE(review): the features are passed to the SVM unscaled. SVMs are
# generally sensitive to feature scale; consider wrapping the model in a
# Pipeline with StandardScaler and re-recording the results.
model = SVC(kernel='linear', random_state=0)
model.fit(X_train, y_train)

# Predict once per partition and reuse the result for both metrics instead of
# calling model.predict twice on the same data.
svm_train_pred = model.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, svm_train_pred))
print("Training Confusion Matrix:\n", confusion_matrix(y_train, svm_train_pred))

svm_val_pred = model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, svm_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, svm_val_pred))

# Separately supplied labelled test file.
test_data = pd.read_csv("wine_data_test.csv")
X_test = test_data.drop("quality", axis=1)
y_test = test_data["quality"]

svm_test_pred = model.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, svm_test_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, svm_test_pred))

# 10-fold stratified CV over the full dataset. cross_val_score fits clones of
# `model` on each fold, so the estimator fitted above is left untouched.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cv_scores = cross_val_score(model, X, y, cv=cv)

print("\nCross-validation scores:", cv_scores)
print("Average cross-validation accuracy:", cv_scores.mean())

Training Accuracy: 0.7460901563937442
Training Confusion Matrix:
 [[392 118]
 [158 419]]

Validation Accuracy: 0.6838235294117647
Validation Confusion Matrix:
 [[87 41]
 [45 99]]

Test Accuracy: 0.75
Test Confusion Matrix:
 [[36 12]
 [13 39]]

Cross-validation scores: [0.69117647 0.70588235 0.78676471 0.69852941 0.77941176 0.70588235
 0.77205882 0.69117647 0.69852941 0.71111111]
Average cross-validation accuracy: 0.7240522875816994
