In [None]:
#Importing libraries
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, StratifiedKFold,  train_test_split, learning_curve,GridSearchCV 
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import time
from sklearn.svm import SVC
from scipy import stats
from scipy.stats import ttest_rel


In [None]:
#Reading the dataset
df = "adult.data"    # path of the training file
df1 = "adult.test"   # path of the test file

# First look at both files with pandas' default parsing (no column names supplied yet)
Data1, Data2 = (pd.read_csv(path) for path in (df, df1))

In [None]:
#Exploring the dataset by viewing the structure of its first five rows
Data1.head(n=5)


In [None]:
#Exploring the dataset by viewing the data type pandas inferred for each column
Data1.dtypes

In [None]:
#Structuring the data with column names
# The names are passed through `names=`, so pandas treats every line of the
# files as data and labels the fields with this list.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']

# The separator is a regular expression (comma followed by one whitespace
# character), which requires the python parsing engine. It is written as a raw
# string because '\s' is not a valid escape sequence in a normal string literal
# and triggers an invalid-escape warning on recent Python versions.
# "?" entries are read as missing values (NaN).
train_data = pd.read_csv(df, names=column_names, sep=r',\s', na_values=["?"], engine='python')
test_data = pd.read_csv(df1, names=column_names, sep=r',\s', na_values=["?"], engine='python')

In [None]:
#Viewing the first 200 rows of the training data after the column names were applied
train_data.head(n=200)


In [None]:
#Viewing the first 200 rows of the test data after the column names were applied
test_data.head(n=200)

In [None]:
#Removing the first row of the test data
# The file is re-read with its first line skipped instead of slicing the frame
# that is already in memory. Slicing with `test_data.iloc[1:]` removes one more
# (real) record every time the cell is executed again; re-reading always yields
# the same frame no matter how often the cell runs.
# Because the skipped line never reaches the parser, pandas also infers each
# column's dtype from the remaining rows only.
# NOTE(review): the resulting index starts at 0 (the sliced frame started at 1);
# nothing else in this notebook relies on the index labels - confirm.
test_data = pd.read_csv(df1, names=column_names, sep=r',\s', na_values=["?"],
                        engine='python', skiprows=1)

In [None]:
#Viewing the first rows of the training data (the training frame was not modified by the previous cell)
train_data.head()

In [None]:
#Counting the missing values in every column of the training data
print(train_data.isna().sum())

In [None]:
#Counting the missing values in every column of the test data
print(test_data.isna().sum())

In [None]:
#Removing every row that contains at least one missing value, for both train and test data
# The raw frames (train_data / test_data) are left untouched; the cleaned rows
# are stored under new names.
data = train_data.dropna(axis=0, how='any')
data2 = test_data.dropna(axis=0, how='any')


In [None]:
#Label encoding the training data
# Every categorical column (including the target 'income') is replaced by
# integer codes; LabelEncoder numbers the sorted distinct values of a column
# starting from 0.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                       'relationship', 'race', 'sex', 'native-country', 'income']

converter = LabelEncoder()
encoded_columns = {}
for column in categorical_columns:
    # fit_transform re-fits the encoder on this column and returns a new array
    encoded_columns[column] = converter.fit_transform(data[column])

# Whole columns are replaced through assign() rather than written element-wise
# with `data.loc[:, column] = ...`: the .loc form writes into the existing
# object-dtype column (recent pandas keeps that dtype), so the integer codes
# would stay stored as Python objects. assign() returns a new frame whose
# encoded columns have an integer dtype and never modifies a possible view.
data = data.assign(**encoded_columns)


In [None]:
#Viewing the first rows of the label-encoded training data
data.head()

In [None]:
#Label encoding the test data with the categories of the training data
# An encoder fitted on the test split numbers the test split's own sorted
# categories, so its codes agree with the training codes only when both splits
# contain exactly the same set of values in a column. To guarantee that one
# code means the same category in both splits, every encoder is fitted on the
# cleaned training column and only *applied* to the test column.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                       'relationship', 'race', 'sex', 'native-country', 'income']

# Same rows the training encoding was fitted on (raw strings, nulls removed)
train_reference = train_data.dropna()
# Rebuilt from the raw test frame so the cell gives the same result when re-run
data2 = test_data.dropna()

converter = LabelEncoder()
encoded_columns = {}
for column in categorical_columns:
    converter.fit(train_reference[column])
    test_values = data2[column]
    if column == 'income':
        # NOTE(review): the labels in adult.test are presumably written with a
        # trailing '.' - confirm; rstrip is a no-op when there is none.
        test_values = test_values.str.rstrip('.')
    # transform raises ValueError if the test split holds a category that the
    # training split never showed, instead of silently inventing a code.
    encoded_columns[column] = converter.transform(test_values)

data2 = data2.assign(**encoded_columns)

In [None]:
#Viewing the first rows of the label-encoded test data
data2.head()

In [None]:
#Separating the target column from the feature columns for both splits
target_column = 'income'

x_train, y_train = data.drop(columns=target_column), data[target_column]
x_test, y_test = data2.drop(columns=target_column), data2[target_column]

In [None]:
#Casting the training target to 64-bit integers and displaying it
y_train = y_train.astype(np.int64)
y_train

In [None]:
#Casting the test target to 64-bit integers and displaying it
y_test = y_test.astype(np.int64)
y_test

In [None]:
#Checking class imbalance
# Counts how many training rows carry each of the two encoded target values.
value_to_count_1 = 1
value_to_count_0 = 0

count_1 = (y_train == value_to_count_1).sum()
count_0 = (y_train == value_to_count_0).sum()

# The counted series is the 'income' target; the messages previously named a
# placeholder column ('binary_column') that does not exist in this dataset.
print(f"Count of 1s in column 'income': {count_1}")
print(f"Count of 0s in column 'income': {count_0}")

In [None]:
#Stratified k-fold cross validation of a default KNN on the features as they are at this point

cv = 5  # number of folds, also reused by the following cell
knn = KNeighborsClassifier()
skf = StratifiedKFold(n_splits=cv)

# One accuracy value per fold (accuracy is the default scorer of a classifier)
scores = cross_val_score(estimator=knn, X=x_train, y=y_train, cv=skf)
print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

In [None]:
#Stratified k-fold cross validation of a default SVM on the features as they are at this point

svm = SVC()
# `cv` (number of folds) comes from the KNN cross-validation cell
skf = StratifiedKFold(n_splits=cv)

# One accuracy value per fold (accuracy is the default scorer of a classifier)
scores = cross_val_score(estimator=svm, X=x_train, y=y_train, cv=skf)
print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))


In [None]:
#Implementing Grid Search algorithm for KNN
# Standardise the features: the scaler is fitted on the training features and
# the same fitted transformation is applied to the test features. From here on
# x_train / x_test hold the scaled arrays.
# NOTE(review): the scaler is fitted on all training rows before GridSearchCV
# splits them into folds, so each validation fold contributes to the scaling
# statistics - confirm whether that matters for this comparison.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Defining the KNN model (n_neighbors is overridden by the values in the grid)
knn = KNeighborsClassifier(n_neighbors=5)

#Defining hyperparameters for tuning
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initializing GridSearchCV (5-fold cross validation for every combination)
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5)

#Fitting the model
grid_search_knn.fit(x_train, y_train)


best_knn = grid_search_knn.best_estimator_

# Evaluating on the test set and printing best parameters
y_pred_knn = best_knn.predict(x_test)
print("Best parameters for KNN:", grid_search_knn.best_params_)
print("Training set score: {:.3f}".format(best_knn.score(x_train, y_train)))
# The test accuracy is computed from the predictions made above; calling
# best_knn.score(x_test, y_test) would run the neighbour search over the whole
# test set a second time and return the same number.
print("Test set score: {:.3f}".format(accuracy_score(y_test, y_pred_knn)))

In [None]:
#Implementing KNN Algorithm
# The features are standardised again so that this cell does not depend on the
# grid-search cell having been run; standardising already standardised data
# leaves it essentially unchanged.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Create KNN model with the hyperparameters chosen for the final model
# perf_counter is the clock intended for measuring elapsed intervals
start_time = time.perf_counter()
knn_model = KNeighborsClassifier(n_neighbors=9, metric = 'manhattan',weights = 'uniform' )
knn_model.fit(x_train, y_train)
training_time = time.perf_counter() - start_time


# Making predictions
start_time = time.perf_counter()
y_pred = knn_model.predict(x_test)
prediction_time = time.perf_counter() - start_time

# Evaluating the model
print(f"Training time for KNN: {training_time} seconds")
print(f"Prediction time for KNN: {prediction_time} seconds")
print("Training set score: {:.3f}".format(knn_model.score(x_train, y_train)))
# Accuracy from the predictions already made; knn_model.score(x_test, y_test)
# would predict the whole test set again and return the same value.
print("Test set score: {:.3f}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

# Displaying the Confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred))
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Plot ROC-AUC curve for KNN
# A ROC curve is traced by sweeping a decision threshold over continuous
# scores. Feeding the hard 0/1 predictions gives a curve with a single
# operating point and an area that does not reflect the ranking quality, so the
# predicted probability of the positive class (column 1 = class 1) is used.
y_score_knn = knn_model.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score_knn)
roc_auc = auc(fpr, tpr)


plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal = performance of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Plotting Learning Curve for KNN

# The curve is computed for `knn_model`, the estimator configured with the
# final hyperparameters. The name `knn` refers to the untuned estimator that
# was handed to the grid search, so using it here would describe a different
# model than the one evaluated above. learning_curve works on clones of the
# estimator, the fitted knn_model itself is not modified.
train_sizes, train_scores, test_scores = learning_curve(knn_model, x_train, y_train, cv=5, n_jobs=-1, 
                                                            train_sizes=np.linspace(0.1, 1.0, 5), scoring='accuracy')
# Average the accuracy over the 5 folds for every training-set size
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.figure()
plt.title("KNN Learning Curve")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()

plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

plt.legend(loc="best")
plt.show()


In [None]:
# Normal-approximation confidence interval for the test accuracy
# y_pred holds the most recently assigned predictions; at this point of the
# notebook those are the KNN predictions.
confidence = 0.95
accuracy = accuracy_score(y_test, y_pred)
n = len(y_test)

# Standard error of a proportion estimated from n predictions
stderr = np.sqrt(accuracy * (1 - accuracy) / n)
# Two-sided critical value of the standard normal distribution
z_score = stats.norm.ppf((1 + confidence) / 2)
margin_of_error = z_score * stderr

ci_lower, ci_upper = accuracy - margin_of_error, accuracy + margin_of_error

print(f"Accuracy: {accuracy:.3f}")
print(f"95% Confidence Interval for accuracy: ({ci_lower:.3f}, {ci_upper:.3f})")

# Single point with an error bar spanning the interval
fig, ax = plt.subplots()
ax.errorbar(1, accuracy, yerr=margin_of_error, fmt='o', capsize=5, capthick=2,
            ecolor='red', label='Accuracy with 95% CI')
ax.set_xlim(0.5, 1.5)
ax.set_xticks([])
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy with 95% Confidence Interval')
ax.legend()
ax.grid(True)
plt.show()

In [None]:

#Implementing Grid Search Algorithm for SVM

# Defining the SVM model
svm = SVC()

# Defining hyperparameters for tuning
# gamma is the coefficient of the rbf kernel and has no effect on a linear
# kernel. Crossing it with kernel='linear' in one dictionary would fit four
# identical linear models for every C, so each kernel gets its own sub-grid
# (4 linear + 16 rbf candidates instead of 32).
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
]

# Initializing GridSearchCV (refit=True retrains the best candidate on all training rows)
grid_search = GridSearchCV(svm, param_grid, refit=True, verbose=2, cv=5)

# Fitting the model
grid_search.fit(x_train, y_train)

# Printing the best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluating on the test set
# The test set is predicted once and the accuracy derived from those
# predictions; every grid_search.score(x_test, y_test) call would predict the
# whole test set again.
y_pred = grid_search.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("Training set score: {:.3f}".format(grid_search.score(x_train, y_train)))
print("Test set score: {:.3f}".format(test_accuracy))



In [None]:
#Implementing SVM 

# The features are standardised again so that this cell does not depend on an
# earlier scaling cell having been run; standardising already standardised
# data leaves it essentially unchanged.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Creating the SVM model with the hyperparameters chosen for the final model
# perf_counter is the clock intended for measuring elapsed intervals
start_time = time.perf_counter()
svm = SVC(C= 100, gamma = 0.01, kernel = 'rbf')
svm.fit(x_train, y_train)
training_time = time.perf_counter() - start_time

# Making predictions
start_time = time.perf_counter()
y_pred = svm.predict(x_test)
prediction_time = time.perf_counter() - start_time

# Evaluating the model
print(f"Training time for SVM: {training_time} seconds")
print(f"Prediction time for SVM: {prediction_time} seconds")
print("Training set score: {:.3f}".format(svm.score(x_train, y_train)))
# Accuracy from the predictions already made; svm.score(x_test, y_test) would
# predict the whole test set again and return the same value.
print("Test set score: {:.3f}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

# Displaying the Confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred))
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Plotting ROC-AUC curve for SVM

# A ROC curve is traced by sweeping a decision threshold over continuous
# scores. Feeding the hard 0/1 predictions gives a curve with a single
# operating point, so the signed distance to the separating surface returned
# by decision_function is used as the score (larger = more likely class 1).
# `svm` must be the model fitted in the SVM cell.
y_score_svm = svm.decision_function(x_test)
fpr, tpr, _ = roc_curve(y_test, y_score_svm)
roc_auc = auc(fpr, tpr)


plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal = performance of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Learning curve of the SVM: accuracy against the number of training rows

# Five training-set sizes between 10% and 100% of the rows available in each
# of the 5 cross-validation splits
size_fractions = np.linspace(0.1, 1.0, 5)
train_sizes, train_scores, test_scores = learning_curve(
    svm, x_train, y_train,
    cv=5, n_jobs=-1, train_sizes=size_fractions, scoring='accuracy',
)

# Average over the folds for every training-set size
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.set_title("SVM Learning Curve")
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.grid()

ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

ax.legend(loc="best")
plt.show()


In [None]:
# Normal-approximation confidence interval for the test accuracy
# y_pred holds the most recently assigned predictions; at this point of the
# notebook those are the SVM predictions.
confidence = 0.95
accuracy = accuracy_score(y_test, y_pred)
n = len(y_test)

# Standard error of a proportion estimated from n predictions
stderr = np.sqrt(accuracy * (1 - accuracy) / n)
# Two-sided critical value of the standard normal distribution
z_score = stats.norm.ppf((1 + confidence) / 2)
margin_of_error = z_score * stderr

ci_lower, ci_upper = accuracy - margin_of_error, accuracy + margin_of_error

print(f"Accuracy: {accuracy:.3f}")
print(f"95% Confidence Interval for accuracy: ({ci_lower:.3f}, {ci_upper:.3f})")

# Single point with an error bar spanning the interval
fig, ax = plt.subplots()
ax.errorbar(1, accuracy, yerr=margin_of_error, fmt='o', capsize=5, capthick=2,
            ecolor='red', label='Accuracy with 95% CI')
ax.set_xlim(0.5, 1.5)
ax.set_xticks([])
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy with 95% Confidence Interval')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
#Implementing Stratified k-fold cross validation again to obtain accuracy values of improved model
# The estimators are created with the hyperparameters of the final KNN and SVM
# models built earlier in this notebook. Default-constructed estimators would
# only measure the effect of feature scaling, not of the tuning.
# NOTE(review): assumes "improved model" means the tuned configurations - confirm.
cv=5
skf = StratifiedKFold(n_splits=cv)

#KNN
knn = KNeighborsClassifier(n_neighbors=9, metric='manhattan', weights='uniform')
knn_cv_scores = cross_val_score(knn, x_train, y_train, cv=skf)
print("Cross-validation scores for KNN:", knn_cv_scores)

#SVM
svm = SVC(C=100, gamma=0.01, kernel='rbf')
svm_cv_scores = cross_val_score(svm, x_train, y_train, cv=skf)
print("Cross-validation scores for SVM:", svm_cv_scores)

# Both score arrays are kept under their own names so they can be compared
# fold by fold; `scores` keeps its previous meaning (the last scores computed).
scores = svm_cv_scores



In [None]:
# Paired t-test on the per-fold accuracies of the two models
# The values are entered by hand, one entry per cross-validation fold; entries
# at the same position are treated as a pair.
knn_scores = [0.81, 0.82, 0.82, 0.83, 0.82]
svm_scores = [0.84, 0.84, 0.84, 0.85, 0.84]

paired_result = ttest_rel(knn_scores, svm_scores)
t_stat = paired_result.statistic
p_value = paired_result.pvalue

print(f"t-statistic: {t_stat:.3f}, p-value: {p_value:.5f}")