In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_fscore_support
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [None]:
# LOGISTIC REGRESSION
# Fits a logistic regression on adult.csv, reports hold-out metrics and the
# ROC curve, then sweeps the regularization parameter C with 5-fold CV.

# '?' is treated as the missing-value marker: turn it into NaN and drop
# incomplete rows. `.copy()` gives an independent frame so the column
# assignments below do not trigger SettingWithCopyWarning.
data = pd.read_csv('adult.csv').replace('?', np.nan).dropna().copy()

# Integer-encode every string column (including the 'income' target).
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

X = data.drop('income', axis=1)
y = data['income']

# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only; fitting it on the full data leaks test information.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # probability of the positive class

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix for Logistic Regression:")
print(cm)

print("Classification Report for Logistic Regression:")
print(classification_report(y_test, y_pred))

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc:.2f}")

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'Logistic Regression, AUC={auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')
plt.legend()
plt.show()

# Sweep C with 5-fold cross-validation.
param_range = np.logspace(-4, 4, 10)
accuracy_scores = []
error_rates = []

for param in param_range:
    # A fresh scaler + classifier pipeline per C value: the scaler is re-fitted
    # inside every CV fold (no leakage) and the fitted `model` above is left
    # untouched instead of having its C attribute mutated.
    cv_model = make_pipeline(StandardScaler(), LogisticRegression(C=param))
    cv_scores = cross_val_score(cv_model, X, y, cv=5)
    accuracy = np.mean(cv_scores)

    accuracy_scores.append(accuracy)
    error_rates.append(1 - accuracy)

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(param_range, accuracy_scores, marker='o')
plt.xscale('log')  # C spans 1e-4..1e4; a linear axis would crowd the small values together
plt.title('Accuracy for Logistic Regression')
plt.xlabel('C (Inverse of Regularization Strength)')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(param_range, error_rates, marker='o')
plt.xscale('log')
plt.title('Error rate for Logistic Regression')
plt.xlabel('C (Inverse of Regularization Strength)')
plt.ylabel('Error rate')
plt.tight_layout()
plt.show()



In [None]:
# KNN
# No dataset was specified for this problem, so the Iris dataset is assumed.
# (`load_iris` comes from sklearn.datasets and is imported in the imports cell.)
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Evaluate hold-out accuracy and error for several neighbourhood sizes.
k_values = [1, 3, 5, 7, 9]
accuracies = []
errors = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    errors.append(1 - accuracy)

# On ties, list.index returns the first match, i.e. the smallest k wins.
best_k = k_values[accuracies.index(max(accuracies))]
print(f'Best performing k value: {best_k}')

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(k_values, accuracies, marker='o', linestyle='-', color='r')
plt.title('KNN Classifier Accuracy for different k values')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.subplot(1, 2, 2)
plt.plot(k_values, errors, marker='o', linestyle='-', color='r')
plt.title('KNN Classifier Error for different k values')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Error')
plt.tight_layout()
plt.show()


In [None]:
# DECISION TREES
# Grid over max_depth x min_samples_leaf on adult.csv, scored by hold-out
# accuracy, with accuracy and error plotted per configuration.

# '?' is treated as the missing-value marker (same cleaning as the logistic
# regression cell). Without this replacement dropna() would not remove those
# rows and '?' would be label-encoded as if it were a real category.
data = pd.read_csv('adult.csv').replace('?', np.nan).dropna().copy()

# Integer-encode every string column (including the 'income' target).
for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

X = data.drop('income', axis=1)
y = data['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

depths = [2, 4, 6, 8, 10]
min_samples = [1, 2, 4, 6, 8]
accuracies = []
errors = []
config_labels = []

for depth in depths:
    for min_sample in min_samples:
        clf = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=min_sample, random_state=42)
        clf.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, clf.predict(X_test))
        accuracies.append(accuracy)
        errors.append(1 - accuracy)
        config_labels.append(f'depth={depth}, min_samples={min_sample}')

# On ties, the first (shallowest / smallest-leaf) configuration is reported.
best_index = accuracies.index(max(accuracies))
print(f'Best performing configuration: {config_labels[best_index]}')

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(config_labels, accuracies, marker='o', linestyle='-', color='b')
plt.xticks(rotation=45, ha='right')
plt.title('Decision Tree Accuracy for different configurations')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(config_labels, errors, marker='o', linestyle='-', color='r')
plt.xticks(rotation=45, ha='right')
plt.title('Decision Tree Error for different configurations')
plt.ylabel('Error')

plt.tight_layout()
plt.show()


In [None]:
# RANDOM FORESTS
# Grid over n_estimators x max_depth on adult.csv, scored by hold-out
# accuracy, with accuracy and error plotted per configuration.

# '?' is treated as the missing-value marker (same cleaning as the logistic
# regression cell). Without this replacement dropna() would not remove those
# rows and '?' would be label-encoded as if it were a real category.
data = pd.read_csv('adult.csv').replace('?', np.nan).dropna().copy()

# Integer-encode every string column (including the 'income' target).
for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

X = data.drop('income', axis=1)
y = data['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

n_trees = [10, 50, 100, 150]
depths = [None, 5, 10, 15]  # None lets each tree grow until its leaves are pure
accuracies = []
errors = []
config_labels = []

for n in n_trees:
    for depth in depths:
        clf = RandomForestClassifier(n_estimators=n, max_depth=depth, random_state=42)
        clf.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, clf.predict(X_test))
        accuracies.append(accuracy)
        errors.append(1 - accuracy)
        config_labels.append(f'trees={n}, depth={depth}')

# On ties, the first configuration in grid order is reported.
best_index = accuracies.index(max(accuracies))
print(f'Best performing configuration: {config_labels[best_index]}')

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(config_labels, accuracies, marker='o', linestyle='-', color='b')
plt.xticks(rotation=45, ha='right')
plt.title('Random Forest Accuracy for different configurations')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(config_labels, errors, marker='o', linestyle='-', color='r')
plt.xticks(rotation=45, ha='right')
plt.title('Random Forest Error for different configurations')
plt.ylabel('Error')

plt.tight_layout()
plt.show()


In [None]:
# ADABOOST
# Grid over n_estimators x base-tree depth on adult.csv, scored by hold-out
# accuracy, with accuracy and error plotted per configuration.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# '?' is treated as the missing-value marker (same cleaning as the logistic
# regression cell), so convert it to NaN before dropping incomplete rows.
data = pd.read_csv('adult.csv').replace('?', np.nan).dropna().copy()

# The tree base learners need numeric input; fitting on the raw string columns
# raises ValueError, so integer-encode every string column first.
for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

X = data.drop('income', axis=1)
y = data['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

n_estimators = [50, 100, 150, 200]
base_models = [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
accuracies = []
errors = []
config_labels = []

for n in n_estimators:
    for base_model in base_models:
        # The base learner is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
        # in 1.4), while the first positional slot works on either side.
        clf = AdaBoostClassifier(base_model, n_estimators=n, random_state=42)
        clf.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, clf.predict(X_test))
        accuracies.append(accuracy)
        errors.append(1 - accuracy)
        config_labels.append(f'estimators={n}, base_depth={base_model.max_depth}')

# On ties, the first configuration in grid order is reported.
best_index = accuracies.index(max(accuracies))
print(f'Best performing configuration: {config_labels[best_index]}')

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(config_labels, accuracies, marker='o', linestyle='-', color='b')
plt.xticks(rotation=45, ha='right')
plt.title('AdaBoost Accuracy for different configurations')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(config_labels, errors, marker='o', linestyle='-', color='r')
plt.xticks(rotation=45, ha='right')
plt.title('AdaBoost Error for different configurations')
plt.ylabel('Error')

plt.tight_layout()
plt.show()


In [None]:
# STACKING
# Stacks KNN, random forest and decision tree base learners under a logistic
# regression meta-model on adult.csv and reports hold-out accuracy and error.

# '?' is treated as the missing-value marker (same cleaning as the logistic
# regression cell), so convert it to NaN before dropping incomplete rows.
data = pd.read_csv('adult.csv').replace('?', np.nan).dropna().copy()

# StandardScaler and the estimators need numeric input; scaling the raw string
# columns raises ValueError, so integer-encode every string column first.
for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

X = data.drop('income', axis=1)
y = data['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Seed the randomized base learners so the reported score is reproducible.
base_models = [
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42))
]

meta_model = LogisticRegression()

# cv=5: the meta-model is trained on 5-fold cross-validated predictions of the
# base models.
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
error = 1 - accuracy

print(f'Stacking model accuracy: {accuracy}')
print(f'Stacking model error: {error}')

plt.figure(figsize=(6, 5))
plt.bar(['Accuracy', 'Error'], [accuracy, error], color=['blue', 'red'])
plt.title('Stacking Model Performance')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.show()
