In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score

In [11]:
# Read the data into a pandas DataFrame with custom delimiter
df_train = pd.read_csv('adult.data', header=None, delimiter=", ", engine='python')
df_test = pd.read_csv('adult.test', header=None, delimiter=", ", engine='python')

# Replace '?' with NaN for easier handling
df_train.replace('?', np.nan, inplace=True)
df_test.replace('?', np.nan, inplace=True)

# Drop rows with any missing values and reset index
df_train = df_train.dropna().reset_index(drop=True)
df_test = df_test.dropna().reset_index(drop=True)

# Extract the data (excluding the target column)
training_data = df_train.iloc[:, :-1]
testing_data = df_test.iloc[:, :-1]

# Extract the target variable and transform it
training_target = df_train.iloc[:, -1]
training_target = (training_target == '>50K').astype(int)
testing_target = df_test.iloc[:, -1]
testing_target = (testing_target == '>50K.').astype(int)

# Select categorical columns for one-hot encoding
categorical_columns = [1, 3, 5, 6, 7, 8, 9, 13]
numerical_columns = [0, 2, 4, 10, 11, 12]

# Apply one-hot encoding to categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity
encoded_training_data = encoder.fit_transform(training_data.iloc[:, categorical_columns])
encoded_testing_data = encoder.fit_transform(testing_data.iloc[:, categorical_columns])

# Concatenate encoded data with numerical columns
training_data = np.concatenate([encoded_training_data, training_data.iloc[:, numerical_columns]], axis=1)
testing_data = np.concatenate([encoded_testing_data, testing_data.iloc[:, numerical_columns]], axis=1)

In [12]:
X_train = training_data  # Features
y_train = training_target  # Target variable
X_test = testing_data
y_test = testing_target

# Train LDA model
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
predictions_lda = lda.predict(X_test)

# Calculate accuracy metrics for LDA
accuracy_lda = accuracy_score(y_test, predictions_lda)
precision_lda, recall_lda, f1_lda, _ = precision_recall_fscore_support(y_test, predictions_lda, average='binary')
auc_roc_lda = roc_auc_score(y_test, predictions_lda)

print("LDA:")
print(f"- Testing accuracy: {accuracy_lda:.4f}")
print(f"- Precision: {precision_lda:.4f}")
print(f"- Recall: {recall_lda:.4f}")
print(f"- F1-score: {f1_lda:.4f}")
print(f"- AUC-ROC: {auc_roc_lda:.4f}")

LDA:
- Testing accuracy: 0.8389
- Precision: 0.7164
- Recall: 0.5700
- F1-score: 0.6349
- AUC-ROC: 0.7482


In [13]:
# Train QDA model
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
predictions_qda = qda.predict(X_test)

# Calculate accuracy metrics for LDA
accuracy_qda = accuracy_score(y_test, predictions_qda)
precision_qda, recall_qda, f1_qda, _ = precision_recall_fscore_support(y_test, predictions_qda, average='binary')
auc_roc_qda = roc_auc_score(y_test, predictions_qda)

print("QDA:")
print(f"- Testing accuracy: {accuracy_qda:.4f}")
print(f"- Precision: {precision_qda:.4f}")
print(f"- Recall: {recall_qda:.4f}")
print(f"- F1-score: {f1_qda:.4f}")
print(f"- AUC-ROC: {auc_roc_qda:.4f}")



QDA:
- Testing accuracy: 0.7529
- Precision: 0.3854
- Recall: 0.0100
- F1-score: 0.0195
- AUC-ROC: 0.5024


In [14]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
predictions_log_reg = log_reg.predict(X_test_scaled)

# Calculate accuracy metrics for Logistic Regression
accuracy_log_reg = accuracy_score(y_test, predictions_log_reg)
precision_log_reg, recall_log_reg, f1_log_reg, _ = precision_recall_fscore_support(y_test, predictions_log_reg, average='binary')
auc_roc_log_reg = roc_auc_score(y_test, predictions_log_reg)

print("Logistic Regression:")
print(f"- Testing Accuracy: {accuracy_log_reg:.4f}")
print(f"- Precision: {precision_log_reg:.4f}")
print(f"- Recall: {recall_log_reg:.4f}")
print(f"- F1-score: {f1_log_reg:.4f}")
print(f"- AUC-ROC: {auc_roc_log_reg:.4f}")

Logistic Regression:
- Testing Accuracy: 0.8476
- Precision: 0.7292
- Recall: 0.6041
- F1-score: 0.6608
- AUC-ROC: 0.7655


In [15]:
# Train SVM model
svm = SVC(kernel='rbf')  # You can specify different kernels (e.g., 'linear', 'poly', 'rbf')
svm.fit(X_train_scaled, y_train)
predictions_svm = svm.predict(X_test_scaled)

# Calculate accuracy metrics for SVM
accuracy_svm = accuracy_score(y_test, predictions_svm)
precision_svm, recall_svm, f1_svm, _ = precision_recall_fscore_support(y_test, predictions_svm, average='binary')
auc_roc_svm = roc_auc_score(y_test, predictions_svm)

print("SVM:")
print(f"- Testing Accuracy: {accuracy_svm:.4f}")
print(f"- Precision: {precision_svm:.4f}")
print(f"- Recall: {recall_svm:.4f}")
print(f"- F1-score: {f1_svm:.4f}")
print(f"- AUC-ROC: {auc_roc_svm:.4f}")

SVM:
- Testing Accuracy: 0.8457
- Precision: 0.7409
- Recall: 0.5719
- F1-score: 0.6455
- AUC-ROC: 0.7534


In [16]:
# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators and other hyperparameters
rf_classifier.fit(X_train_scaled, y_train)
predictions_rf = rf_classifier.predict(X_test_scaled)

# Calculate accuracy metrics for Random Forest
accuracy_rf = accuracy_score(y_test, predictions_rf)
precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, predictions_rf, average='binary')
auc_roc_rf = roc_auc_score(y_test, predictions_rf)

print("Random Forest:")
print(f"- Testing Accuracy: {accuracy_rf:.4f}")
print(f"- Precision: {precision_rf:.4f}")
print(f"- Recall: {recall_rf:.4f}")
print(f"- F1-score: {f1_rf:.4f}")
print(f"- AUC-ROC: {auc_roc_rf:.4f}")

Random Forest:
- Testing Accuracy: 0.8485
- Precision: 0.7239
- Recall: 0.6200
- F1-score: 0.6679
- AUC-ROC: 0.7715


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Train the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)  # Adjust hyperparameters as needed
gb_classifier.fit(X_train_scaled, y_train)
predictions_gb = gb_classifier.predict(X_test_scaled)

# Calculate accuracy metrics
accuracy_gb = accuracy_score(y_test, predictions_gb)
precision_gb, recall_gb, f1_gb, _ = precision_recall_fscore_support(y_test, predictions_gb, average='binary')
auc_roc_gb = roc_auc_score(y_test, predictions_gb)

print("Gradient Boosting:")
print(f"- Testing Accuracy: {accuracy_gb:.4f}")
print(f"- Precision: {precision_gb:.4f}")
print(f"- Recall: {recall_gb:.4f}")
print(f"- F1-score: {f1_gb:.4f}")
print(f"- AUC-ROC: {auc_roc_gb:.4f}")

Gradient Boosting:
- Testing Accuracy: 0.8663
- Precision: 0.7929
- Recall: 0.6168
- F1-score: 0.6938
- AUC-ROC: 0.7821


In [18]:
# Print all accuracy metrics
print("LDA:")
print(f"- Testing accuracy: {accuracy_lda:.4f}")
print(f"- Precision: {precision_lda:.4f}")
print(f"- Recall: {recall_lda:.4f}")
print(f"- F1-score: {f1_lda:.4f}")
print(f"- AUC-ROC: {auc_roc_lda:.4f}")
print()

print("QDA:")
print(f"- Testing accuracy: {accuracy_qda:.4f}")
print(f"- Precision: {precision_qda:.4f}")
print(f"- Recall: {recall_qda:.4f}")
print(f"- F1-score: {f1_qda:.4f}")
print(f"- AUC-ROC: {auc_roc_qda:.4f}")
print()

print("Logistic Regression:")
print(f"- Testing Accuracy: {accuracy_log_reg:.4f}")
print(f"- Precision: {precision_log_reg:.4f}")
print(f"- Recall: {recall_log_reg:.4f}")
print(f"- F1-score: {f1_log_reg:.4f}")
print(f"- AUC-ROC: {auc_roc_log_reg:.4f}")
print()

print("SVM:")
print(f"- Testing Accuracy: {accuracy_svm:.4f}")
print(f"- Precision: {precision_svm:.4f}")
print(f"- Recall: {recall_svm:.4f}")
print(f"- F1-score: {f1_svm:.4f}")
print(f"- AUC-ROC: {auc_roc_svm:.4f}")
print()

print("Random Forest:")
print(f"- Testing Accuracy: {accuracy_rf:.4f}")
print(f"- Precision: {precision_rf:.4f}")
print(f"- Recall: {recall_rf:.4f}")
print(f"- F1-score: {f1_rf:.4f}")
print(f"- AUC-ROC: {auc_roc_rf:.4f}")
print()

print("Gradient Boosting:")
print(f"- Testing Accuracy: {accuracy_gb:.4f}")
print(f"- Precision: {precision_gb:.4f}")
print(f"- Recall: {recall_gb:.4f}")
print(f"- F1-score: {f1_gb:.4f}")
print(f"- AUC-ROC: {auc_roc_gb:.4f}")

LDA:
- Testing accuracy: 0.8389
- Precision: 0.7164
- Recall: 0.5700
- F1-score: 0.6349
- AUC-ROC: 0.7482

QDA:
- Testing accuracy: 0.7529
- Precision: 0.3854
- Recall: 0.0100
- F1-score: 0.0195
- AUC-ROC: 0.5024

Logistic Regression:
- Testing Accuracy: 0.8476
- Precision: 0.7292
- Recall: 0.6041
- F1-score: 0.6608
- AUC-ROC: 0.7655

SVM:
- Testing Accuracy: 0.8457
- Precision: 0.7409
- Recall: 0.5719
- F1-score: 0.6455
- AUC-ROC: 0.7534

Random Forest:
- Testing Accuracy: 0.8485
- Precision: 0.7239
- Recall: 0.6200
- F1-score: 0.6679
- AUC-ROC: 0.7715

Gradient Boosting:
- Testing Accuracy: 0.8663
- Precision: 0.7929
- Recall: 0.6168
- F1-score: 0.6938
- AUC-ROC: 0.7821
