## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

##  Load the Dataset

In [3]:
df = pd.read_csv('adult.csv')
print(df.head())

   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

          occupation   relationship   race     sex  capital.gain  \
0                  ?  Not-in-family  White  Female             0   
1    Exec-managerial  Not-in-family  White  Female             0   
2                  ?      Unmarried  Black  Female             0   
3  Machine-op-inspct      Unmarried  White  Female             0   
4     Prof-specialty      Own-child  White  Female             0   

   capital.loss  hours.per.week native.country income  
0          4356              40  United-States  <=50K  
1          4356              18  United-States

## Inspect Dataset Info

In [5]:
print(df.info())

print("\n--- Describe ---")
print(df.describe(include='all'))

print("\n--- Null Values ---")
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

--- Describe ---
                 age workclass        fnlwgt education  education.num 

## Handle Missing Values

In [6]:
df = df.replace(' ?', np.nan)

print("\n--- Nulls After Replace ---")
print(df.isnull().sum())

df = df.dropna()
print("\n--- Nulls After Drop ---")

print(df.isnull().sum())
print("\nNew shape:", df.shape)


--- Nulls After Replace ---
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

--- Nulls After Drop ---
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

New shape: (32561, 15)


## Encode Categorical Features

In [7]:
le = LabelEncoder()

for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

print(df.head())
print("\nData types after encoding:\n", df.dtypes)

   age  workclass  fnlwgt  education  education.num  marital.status  \
0   90          0   77053         11              9               6   
1   82          4  132870         11              9               6   
2   66          0  186061         15             10               6   
3   54          4  140359          5              4               0   
4   41          4  264663         15             10               5   

   occupation  relationship  race  sex  capital.gain  capital.loss  \
0           0             1     4    0             0          4356   
1           4             1     4    0             0          4356   
2           0             4     2    0             0          4356   
3           7             4     4    0             0          3900   
4          10             3     4    0             0          3900   

   hours.per.week  native.country  income  
0              40              39       0  
1              18              39       0  
2              40   

## Split Dataset into Features and Target

In [8]:
X = df.drop('income', axis=1)
y = df['income']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (32561, 14)
Target shape: (32561,)


## Train and Test Sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (22792, 14)
X_test shape: (9769, 14)
y_train shape: (22792,)
y_test shape: (9769,)


## Train and Evaluate Models

In [12]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression ===
Accuracy: 0.8083734261439247

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.94      0.88      7429
           1       0.68      0.38      0.49      2340

    accuracy                           0.81      9769
   macro avg       0.75      0.66      0.68      9769
weighted avg       0.79      0.81      0.79      9769


=== Decision Tree ===
Accuracy: 0.8120585525642338

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88      7429
           1       0.61      0.60      0.61      2340

    accuracy                           0.81      9769
   macro avg       0.74      0.74      0.74      9769
weighted avg       0.81      0.81      0.81      9769


=== Random Forest ===
Accuracy: 0.8574060804585935

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      7429
