## Importing Libraries

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


## Loading the Data

In [2]:
train = pd.read_csv("au_train.csv")
test = pd.read_csv("au_test.csv")


## Data Preprocessing

In [3]:
# Removing duplicates
train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)

In [4]:
# Removing whitespaces
object_column = train.select_dtypes(include=['object']).columns
for col in object_column:
    train[col] = train[col].str.strip()
    test[col] = test[col].str.strip()

In [5]:
# Replacing '?' with NaN
train.replace('?', np.nan, inplace=True)
test.replace('?', np.nan, inplace=True)

In [6]:
# Filling missing values
for col in ['workclass', 'occupation', 'native-country']:
    train[col].fillna(train[col].mode()[0], inplace=True)
    test[col].fillna(test[col].mode()[0], inplace=True)

In [7]:
# Removing irrelevant column
train.drop(['education'], axis=1, inplace=True)
test.drop(['education'], axis=1, inplace=True)

## Separating Features and Target

In [8]:
# Spliting features and target
y_train = train['class']
X_train = train.drop('class', axis=1)

In [9]:
y_test = test['class']
X_test = test.drop('class', axis=1)

## Encoding Target Variable

In [36]:
# Converting target to numeric
y_test_cleaned = y_test.str.strip().str.replace('.', '', regex=False)
y_train_cleaned = y_train.str.strip().str.replace('.', '', regex=False)

y_test_num = y_test_cleaned.map({'<=50K': 0, '>50K': 1})
y_train_num = y_train_cleaned.map({'<=50K': 0, '>50K': 1})


## Target Encoding for High-Cardinality Features

In [22]:
nominal_attributes = X_train.select_dtypes(include='object').columns.tolist()
def target_encoder(attribute, target):
    temp = X_train.copy()
    temp['income'] = target
    encoding_map = {
        value: round(temp[temp[attribute] == value]['income'].mean(), 2)
        for value in temp[attribute].unique()
    }
    return encoding_map

for col in nominal_attributes:
    if col not in ['race', 'sex']:  # Leave these for one-hot encoding
        encoding = target_encoder(col, y_train_num)
        X_train[col] = X_train[col].map(encoding).fillna(0)
        X_test[col] = X_test[col].map(encoding).fillna(0)

In [24]:
print("Columns before one-hot encoding:", X_train.columns.tolist())


Columns before one-hot encoding: ['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'occupation', 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White', 'sex_Male']


## One-hot encode 'race' and 'sex'


In [23]:
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
def oneHotEncode(X_train, X_test, attribute):
    train_enc = one_hot_encoder.fit_transform(X_train[[attribute]]).toarray()
    test_enc = one_hot_encoder.transform(X_test[[attribute]]).toarray()
    col_names = one_hot_encoder.get_feature_names_out([attribute])
    train_df = pd.DataFrame(train_enc, columns=col_names)
    test_df = pd.DataFrame(test_enc, columns=col_names)
    X_train = pd.concat([X_train.reset_index(drop=True), train_df], axis=1).drop(attribute, axis=1)
    X_test = pd.concat([X_test.reset_index(drop=True), test_df], axis=1).drop(attribute, axis=1)
    return X_train, X_test

for col in ['race', 'sex']:
    X_train, X_test = oneHotEncode(X_train, X_test, col)

KeyError: "None of [Index(['race'], dtype='object')] are in the [columns]"

## Feature Scaling

In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Training logistic regression model


In [51]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train_num)

In [54]:
print(X_train_scaled.shape) 
print(y_train_num.shape)  

(32537, 16)
(32537,)


## Evaluation on Training Data

In [55]:
y_train_pred = log_reg.predict(X_train_scaled)

cm_train = confusion_matrix(y_train_num, y_train_pred)
print("Confusion Matrix (Train):\n", cm_train)

Confusion Matrix (Train):
 [[22986  1712]
 [ 3242  4597]]


In [56]:
print("Accuracy (Train):", accuracy_score(y_train_num, y_train_pred))
print("Precision (Train):", precision_score(y_train_num, y_train_pred))
print("Recall (Train):", recall_score(y_train_num, y_train_pred))
print("F1 Score (Train):", f1_score(y_train_num, y_train_pred))

Accuracy (Train): 0.847742569997234
Precision (Train): 0.7286416230781423
Recall (Train): 0.5864268401581835
F1 Score (Train): 0.6498445009895392


## Evaluation on Testing Data

In [27]:
y_pred = log_reg.predict(X_test_scaled)

In [39]:
print("Confusion Matrix:\n", confusion_matrix(y_test_num, y_pred))

Confusion Matrix:
 [[11585   845]
 [ 1619  2227]]


In [57]:
print("Accuracy:", accuracy_score(y_test_num, y_pred))
print("Precision:", precision_score(y_test_num, y_pred))
print("Recall:", recall_score(y_test_num, y_pred))
print("F1 Score:", f1_score(y_test_num, y_pred))

Accuracy: 0.8486114524453182
Precision: 0.7249348958333334
Recall: 0.5790431617264691
F1 Score: 0.6438276958658572
