# Income Classification Using Machine Learning

This project uses the Adult Income dataset to predict whether a person earns more than 50K per year based on various demographic attributes. 

The steps include:
- Data loading and exploration
- Data preprocessing
- Handling missing values
- Feature transformation
- Handling class imbalance using SMOTE
- Building a LightGBM model and tuning it with GridSearchCV
- Evaluation using classification metrics


In [None]:
import pandas as pd

In [None]:
import seaborn as sns

In [None]:
import numpy as np

In [None]:
from matplotlib import pyplot as plt

# Load the Dataset

In [None]:
# Load the raw Adult Income CSV; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="adult income1.csv")

# Initial Exploration (df.head(), df.info(), df.describe())

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(5)

In [None]:
# Summarize column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the numeric columns.
df.describe()

# Missing Values + Data Cleaning

In [None]:
# Count NaN values per column.
# NOTE(review): '?' placeholders are plain strings, so they are not counted
# here; they are only converted to NaN in a later cell.
df.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Distinct label strings in the raw 'income' column.
df['income'].unique()

In [None]:
def _income_to_flag(label):
    """Return 1 for the '>50K' income label and 0 for any other value."""
    if label == '>50K':
        return 1
    return 0


# Binary target: 1 when income is '>50K', otherwise 0.
df['TARGET'] = df['income'].apply(_income_to_flag)

In [None]:
# Confirm the new TARGET column was added.
df.head(5)

In [None]:
# Category frequencies for 'occupation' (any '?' placeholder rows show up here).
df['occupation'].value_counts()

In [None]:
# Treat the '?' placeholder as missing in every column, not only 'occupation',
# so that any other column using the same placeholder is handled by the
# most-frequent imputer instead of being one-hot encoded as a real category.
df = df.replace('?', np.nan)

# EDA (Exploratory Data Analysis)

In [None]:
# Class balance of the binary target.
df['TARGET'].value_counts()

In [None]:
# Income Distribution

# Count once and build the legend from the counted index so each label always
# matches its wedge, regardless of which class happens to be more frequent.
target_counts = df['TARGET'].value_counts()
target_names = {0: 'Below 50', 1: 'Above 50'}
plt.pie(target_counts, explode=[0, 0.1], wedgeprops={'edgecolor': 'k', 'width': 0.9}, shadow=True,
        colors=['green', 'yellow'], autopct='%1.1f%%', startangle=30)
plt.legend(loc='upper left', labels=[target_names[value] for value in target_counts.index])
plt.show()

In [None]:
# Frequency of each value in 'sex'.
df['sex'].value_counts()

In [None]:
# Gender Distribution

# Take the legend labels from the counted index rather than hard-coding their
# order, so a wedge can never be labelled with the wrong category.
sex_counts = df['sex'].value_counts()
plt.pie(sex_counts, explode=[0, 0.1], wedgeprops={'edgecolor': 'k', 'width': 0.9}, shadow=True,
        colors=['red', 'orange'], autopct='%1.1f%%', startangle=50)
plt.legend(loc='upper left', labels=[str(value) for value in sex_counts.index])
plt.show()

In [None]:
# Correlation heatmap restricted to the numeric columns.
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, cmap='coolwarm')
plt.show()

In [None]:
# Pairwise scatter plots of a few numeric features, colored by the target class.
pairplot_columns = ['age', 'education.num', 'fnlwgt', 'TARGET']
sns.pairplot(df[pairplot_columns], hue='TARGET')

In [None]:
# Re-check the frame before building the feature matrix.
df.head(5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# One-hot encoder; handle_unknown='ignore' keeps categories unseen during fit
# from raising at transform time.
ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Imputer that fills missing entries with each column's most frequent value.
sm = SimpleImputer(strategy='most_frequent')

In [None]:
# Feature matrix: drop both label columns ('income', 'TARGET') plus the
# 'education' and 'fnlwgt' columns.
# NOTE(review): 'education' is presumably dropped as redundant with
# 'education.num' — confirm.
dropped_columns = ['education', 'fnlwgt', 'TARGET', 'income']
X = df.drop(columns=dropped_columns)

In [None]:
# Binary label vector (1 = '>50K', 0 = otherwise) taken from the TARGET column.
y = df['TARGET']

# Data Preprocessing (Pipeline)

In [None]:
from sklearn.pipeline import Pipeline 

In [None]:
# Categorical preprocessing: impute missing values first, then one-hot encode.
categorical_steps = [('imputer', sm), ('encoder', ohe)]
pipeline = Pipeline(steps=categorical_steps)

In [None]:
from sklearn.compose import make_column_transformer

In [None]:
# Categorical columns routed through the impute + one-hot pipeline.
columns = [
    'workclass',
    'occupation',
    'relationship',
    'native.country',
    'sex',
    'race',
    'marital.status',
]

In [None]:
# Column Transformer

# Apply the categorical pipeline to the listed columns and pass every other
# column through unchanged.
categorical_branch = (pipeline, columns)
col_transformer = make_column_transformer(categorical_branch, remainder='passthrough')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for final evaluation; the seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y — confirm that is intended.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Model Setup: LightGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM classifier with a fixed seed, class-balanced weights and logging silenced.
# NOTE(review): the modelling pipeline also oversamples with SMOTE; confirm
# that applying both imbalance corrections is intended.
lgbm = LGBMClassifier(random_state=42, class_weight='balanced', verbosity=-1)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE oversampler with a fixed seed so the synthetic samples are reproducible.
smt = SMOTE(random_state=42)

# Pipeline + SMOTE

In [None]:
from imblearn.pipeline import Pipeline as imb_pipe

In [None]:
# Full modelling pipeline: preprocess -> oversample with SMOTE -> classify.
model_steps = [
    ('preprocessor', col_transformer),
    ('smote', smt),
    ('classifier', lgbm),
]
pipe = imb_pipe(steps=model_steps)

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 10-fold stratified cross-validation with shuffling and a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Model Tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the 'classifier' pipeline step
# (3 x 3 x 2 x 3 = 54 combinations; the last two entries are fixed values).
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[10, 15, 20],
    classifier__learning_rate=[0.05, 0.1],
    classifier__num_leaves=[30, 40, 50],
    classifier__class_weight=['balanced'],
    classifier__boosting_type=['gbdt'],
)

In [None]:
# Exhaustive search over param_grid, scored by F1 on the stratified folds.
GSCV = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
)

In [None]:
# Run the grid search on the training split only; every parameter combination
# is fitted once per CV fold.
GSCV.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split using the fitted search object.
GSCV_pre = GSCV.predict(X_test)

In [None]:
# Best hyperparameter combination found by the search.
GSCV.best_params_

# Model Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments in this order the
# rows are the true classes and the columns are the predicted classes.
cm = confusion_matrix(y_test, GSCV_pre)

In [None]:
# Show the confusion matrix.
print(cm)

In [None]:
# Pass (y_true, y_pred) in the order scikit-learn documents. Accuracy is
# symmetric, so the value is unchanged, but the call now matches the signature.
asc = accuracy_score(y_test, GSCV_pre)

In [None]:
# Test-set accuracy rounded to two decimals.
print(round(asc, 2))

In [None]:
# scikit-learn expects (y_true, y_pred). Swapping them exchanges precision with
# recall and reports support for the predictions instead of the true labels.
cr = classification_report(y_test, GSCV_pre)

In [None]:
# Per-class precision, recall, F1 and support.
print(cr)