<a href="https://colab.research.google.com/github/Adreena33/Projectentri/blob/main/Census_income_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
# Step 2: Load dataset
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                 names=columns, na_values=' ?', skipinitialspace=True)

In [6]:
# Step 3: Basic Data Exploration
def data_explore(df):
    print("Shape:", df.shape)
    print("Data Types:\n", df.dtypes)
    print("\nNumerical Columns:", df.select_dtypes(include=['int64', 'float64']).columns.tolist())
    print("Categorical Columns:", df.select_dtypes(include='object').columns.tolist())

data_explore(df)


Shape: (32561, 15)
Data Types:
 age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

Numerical Columns: ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Categorical Columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']


In [8]:
# Step 4: Check and Handle Missing Values
def data_check(df):
    missing_percent = df.isna().mean() * 100
    print("\nMissing Values (%):\n", missing_percent[missing_percent > 0])

data_check(df)
# Drop rows with missing values
df.dropna(inplace=True)


Missing Values (%):
 Series([], dtype: float64)


In [9]:
# Step 5: Remove Duplicates
df.drop_duplicates(inplace=True)

In [10]:
# Step 6: Remove Outliers using IQR
def remove_outliers_iqr(df):
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    print("\nOutliers removed using IQR method.")
    return df

df = remove_outliers_iqr(df)


Outliers removed using IQR method.


In [11]:
# Step 7: Label Encoding for Categorical Columns
def label_encoding(df):
    label_encoder = LabelEncoder()
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        df[col] = label_encoder.fit_transform(df[col])
    print("\nCategorical columns encoded.")

label_encoding(df)


Categorical columns encoded.


In [12]:
# Step 8: Feature Scaling
scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [13]:
# Step 9: Show final shape
print("Final dataset shape after preprocessing:", df.shape)

Final dataset shape after preprocessing: (18991, 15)


In [15]:
# Step 10: Save preprocessed data to CSV
df.to_csv("preprocessed_census_data.csv", index=False)
print("File saved as 'preprocessed_census_data.csv'")

File saved as 'preprocessed_census_data.csv'
