# Import Required Libraries

In [23]:
import pandas as pd
import numpy as np

# Load the Dataset

In [24]:
headers = ['Age', 'Workclass', 'Education', 'Marital Status', 'Occupation','Race', 'Sex', 'Hours per week worked', 'Country', 'Income']
df = pd.read_csv('Data.csv', names=headers)
df.head()

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Race,Sex,Hours per week worked,Country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,White,Male,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Black,Female,40,Cuba,<=50K


# Explore and Clean the Data

In [25]:
df.replace('?', np.nan, inplace=True)

print(df.isnull().sum())

df_clean = df.dropna()
print(f"Rows after dropping missing values: {df_clean.shape[0]}")

Age                      0
Workclass                0
Education                0
Marital Status           0
Occupation               0
Race                     0
Sex                      0
Hours per week worked    0
Country                  0
Income                   0
dtype: int64
Rows after dropping missing values: 32561


# Encode Categorical Variables

In [None]:
df.replace('?', np.nan, inplace=True)

print(df.isnull().sum())

df_clean = df.dropna()
print(f"Rows after dropping missing values: {df_clean.shape[0]}")

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Race,Sex,Hours per week worked,Country,Income
0,39,7,9,4,1,4,1,40,39,0
1,50,6,9,2,4,4,1,13,39,0
2,38,4,11,0,6,4,1,40,39,0
3,53,4,1,2,6,2,1,40,39,0
4,28,4,9,2,10,2,0,40,5,0


# Split Data into Train and Test Sets

In [26]:
from sklearn.model_selection import train_test_split
X = df_clean.drop('Income', axis=1)
y = df_clean['Income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a Classification Model

In [15]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate Model Performance

In [19]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")

Accuracy: 0.8179


In [17]:
import pickle
with open('trained_model.pkl', 'wb') as file:
    pickle.dump(clf, file)

In [18]:
with open('trained_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
print(loaded_model)

RandomForestClassifier(random_state=42)
