In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
# Load adult dataset
adult_data = pd.read_csv(r"E:\Projects\Census income prediction\adult.csv")

# Load population data
population_data = pd.read_csv(r"E:\Projects\Census income prediction\popdata.csv")

# Display first few rows of the adult dataset
print(adult_data.head())

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [14]:
# Display summary statistics of the adult dataset
print(adult_data.describe())

                age        fnlwgt  educational-num  capital-gain  \
count  48842.000000  4.884200e+04     48842.000000  48842.000000   
mean      38.643585  1.896641e+05        10.078089   1079.067626   
std       13.710510  1.056040e+05         2.570973   7452.019058   
min       17.000000  1.228500e+04         1.000000      0.000000   
25%       28.000000  1.175505e+05         9.000000      0.000000   
50%       37.000000  1.781445e+05        10.000000      0.000000   
75%       48.000000  2.376420e+05        12.000000      0.000000   
max       90.000000  1.490400e+06        16.000000  99999.000000   

       capital-loss  hours-per-week  
count  48842.000000    48842.000000  
mean      87.502314       40.422382  
std      403.004552       12.391444  
min        0.000000        1.000000  
25%        0.000000       40.000000  
50%        0.000000       40.000000  
75%        0.000000       45.000000  
max     4356.000000       99.000000  


In [15]:
# Check for missing values
print(adult_data.isnull().sum())

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [17]:
#Data Preprocessing

# Perform label encoding for categorical variables
label_encoder = LabelEncoder()
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
for col in categorical_cols:
    adult_data[col] = label_encoder.fit_transform(adult_data[col])

In [18]:
# Perform feature scaling using StandardScaler
scaler = StandardScaler()
numeric_cols = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
adult_data[numeric_cols] = scaler.fit_transform(adult_data[numeric_cols])

In [19]:
# Split the data into features (X) and target variable (y)
X = adult_data.drop(columns=['income'])
y = adult_data['income']

In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Initialize and train the decision tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

In [22]:
# Predict on the test set
dt_predictions = dt_classifier.predict(X_test)

In [23]:
# Evaluate the model
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Classifier Accuracy:", dt_accuracy)

Decision Tree Classifier Accuracy: 0.8153342204933974


In [24]:
# Display classification report
print(classification_report(y_test, dt_predictions))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88      7479
           1       0.60      0.62      0.61      2290

    accuracy                           0.82      9769
   macro avg       0.74      0.75      0.75      9769
weighted avg       0.82      0.82      0.82      9769



In [25]:
# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_predictions))

Confusion Matrix:
[[6541  938]
 [ 866 1424]]


In [26]:
#Random Forest

In [27]:
# Initialize and train the random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
rf_predictions = rf_classifier.predict(X_test)

In [28]:
# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Classifier Accuracy:", rf_accuracy)

# Display classification report
print(classification_report(y_test, rf_predictions))

Random Forest Classifier Accuracy: 0.8638550516941345
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7479
           1       0.74      0.64      0.69      2290

    accuracy                           0.86      9769
   macro avg       0.82      0.79      0.80      9769
weighted avg       0.86      0.86      0.86      9769



In [29]:
# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

Confusion Matrix:
[[6967  512]
 [ 818 1472]]
