In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Read the raw census extract from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="adult_3.csv")


In [4]:
# Normalise the header: trim surrounding whitespace and lower-case every column
# name, then give 'educational-num' the underscore-style name 'education_num'.
df.columns = df.columns.str.strip().str.lower()
df = df.rename(columns={'educational-num': 'education_num'})

In [5]:
# Replace the string income labels with integer class codes.
# Later cells interpret code 1 as ">50K" and anything else as "<=50K".
# LabelEncoder is already imported in the notebook's first cell.
le = LabelEncoder()
le.fit(df['income'])
df['income'] = le.transform(df['income'])

In [6]:
# Target vector: the encoded income column.
y = df['income']
# Feature matrix: every remaining column, with categoricals one-hot encoded.
X_full = pd.get_dummies(df.drop(columns='income'))

In [7]:
# Fit a forest on all one-hot encoded features; it is used only to rank features
# by importance in the next cell.
#
# The ranking must not see rows that are later used for evaluation, otherwise
# the feature selection leaks test information and the reported test metrics
# are optimistic. train_test_split partitions by row position, so reusing the
# same test_size and random_state as the final split further down keeps exactly
# the rows that will become the training set.
X_rank_train, X_rank_holdout, y_rank_train, y_rank_holdout = train_test_split(
    X_full, y, test_size=0.2, random_state=42
)

model_all = RandomForestClassifier(random_state=42)
model_all.fit(X_rank_train, y_rank_train)


In [8]:
# Rank every one-hot encoded column by the ranking forest's importance score.
# NOTE: scikit-learn documents that impurity-based importances can be misleading
# for high-cardinality features, so treat this ranking as a heuristic.
importances = model_all.feature_importances_
features_list = X_full.columns

# Number of top-ranked features to keep
top_n = 5
# argsort is ascending: reverse it and keep the first top_n positions
indices = importances.argsort()[::-1][:top_n]

top_features = [features_list[i] for i in indices]
# Derive the label from top_n so the message stays correct if top_n changes
print(f"Top {top_n} Features:", top_features)


Top 5 Features: ['fnlwgt', 'age', 'capital-gain', 'hours-per-week', 'marital-status_Married-civ-spouse']


In [9]:
# Pin the selected feature set explicitly. The later prediction cells build
# their inputs in this same order, so the list is spelled out rather than
# recomputed.
top_features = [
    'fnlwgt',
    'age',
    'capital-gain',
    'hours-per-week',
    'marital-status_Married-civ-spouse',
]

# Reduced feature matrix containing only the selected columns
X = X_full.loc[:, top_features]


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Final classifier: a random forest with a fixed seed, fitted on the reduced
# feature set from the training partition only.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)


In [12]:
# Score the fitted classifier on the held-out rows: overall accuracy first,
# then the per-class precision / recall / F1 table.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(test_report)

Accuracy: 0.7943494728221927
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      7479
           1       0.57      0.51      0.54      2290

    accuracy                           0.79      9769
   macro avg       0.71      0.70      0.70      9769
weighted avg       0.79      0.79      0.79      9769



In [13]:
# Serialise the fitted classifier to disk so the app cells can load it.
# pickle is already imported in the notebook's first cell.
model_path = "salary_predictor_model.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)


In [14]:
import os

# Sanity check: confirm the pickled model was written as a regular file.
model_file_present = os.path.isfile("salary_predictor_model.pkl")
print("File exists?", model_file_present)


File exists? True


In [15]:
import pickle

import pandas as pd

# Load the trained model written earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("salary_predictor_model.pkl", "rb") as f:
    model = pickle.load(f)

# Simulated user inputs
age = 30
fnlwgt = 200000
capital_gain = 0
hours = 40
married = "Married-civ-spouse"  # Or "Not Married"

# Manual one-hot encoding: the model only knows the 'Married-civ-spouse' indicator
married_status = 1 if married == "Married-civ-spouse" else 0

# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# names instead of a bare array. scikit-learn then checks the names against the
# ones seen during fit, rather than silently trusting positional order (and
# emitting a feature-name warning that this notebook suppresses).
input_data = pd.DataFrame([{
    'fnlwgt': fnlwgt,
    'age': age,
    'capital-gain': capital_gain,
    'hours-per-week': hours,
    'marital-status_Married-civ-spouse': married_status,
}])

# Predict the class for the single sample
prediction = model.predict(input_data)[0]

# Map the encoded class back to its label (1 -> ">50K")
result = ">50K" if prediction == 1 else "<=50K"
print(f"Predicted Salary Class: {result}")


Predicted Salary Class: <=50K


In [16]:
!pip install streamlit




In [17]:
# Serve app.py with Streamlit. The command keeps the cell busy until it is
# interrupted (the captured output ends with ^C / "Stopping...").
# NOTE(review): no cell visible in this notebook writes app.py — presumably it
# is saved by hand from the app cell; confirm.
!streamlit run app.py


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.0.108:8501[0m
[0m
^C
[34m  Stopping...[0m


In [18]:
import streamlit as st
import pickle
import pandas as pd

# Streamlit front-end for the salary classifier.
# NOTE(review): this is app code meant to be launched with `streamlit run`;
# executing it directly in the kernel only produces the Streamlit warnings
# shown in the captured output.

# Load the trained model.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("salary_predictor_model.pkl", "rb") as f:
    model = pickle.load(f)

st.title("💼 Employee Salary Prediction App")
st.write("Enter employee details to predict if their income is >50K or <=50K")

# One widget per feature the model was trained on
age = st.slider("Age", 18, 90, 30)
fnlwgt = st.number_input("fnlwgt (Census weight)", min_value=10000, max_value=1000000, value=200000)
capital_gain = st.number_input("Capital Gain", 0, 100000, 0)
hours = st.slider("Hours per Week", 1, 80, 40)
married = st.selectbox("Marital Status", ["Married-civ-spouse", "Other"])

# One-hot encode marital status (as model expects 'marital-status_Married-civ-spouse')
married_status = 1 if married == "Married-civ-spouse" else 0

# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# names instead of a bare array: scikit-learn then validates the names against
# the ones seen during fit rather than silently relying on positional order.
input_data = pd.DataFrame([{
    'fnlwgt': fnlwgt,
    'age': age,
    'capital-gain': capital_gain,
    'hours-per-week': hours,
    'marital-status_Married-civ-spouse': married_status,
}])

if st.button("Predict Salary Class"):
    prediction = model.predict(input_data)[0]
    # Encoded class 1 corresponds to ">50K"
    result = ">50K" if prediction == 1 else "<=50K"
    st.success(f"💰 Predicted Salary Class: {result}")


2025-07-19 07:53:32.816 
  command:

    streamlit run /opt/anaconda3/lib/python3.13/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-07-19 07:53:32.818 Session state does not function when running a script without `streamlit run`


In [19]:
# Relaunch the Streamlit server for app.py; like any `streamlit run`, it holds
# the cell until interrupted (captured output ends with ^C / "Stopping...").
!streamlit run app.py


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.0.108:8501[0m
[0m
^C
[34m  Stopping...[0m


In [21]:
import pickle

# Reload the pickled classifier from disk
with open("salary_predictor_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Report the column names recorded at fit time, if the estimator kept any
if hasattr(model, "feature_names_in_"):
    print("Expected Features:", model.feature_names_in_)
else:
    print("This model does not store feature names.")


Expected Features: ['fnlwgt' 'age' 'capital-gain' 'hours-per-week'
 'marital-status_Married-civ-spouse']


In [25]:
import streamlit as st
import pickle
import numpy as np
import pandas as pd

# Restore the classifier pickled earlier in the notebook
with open("salary_predictor_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

st.title("💼 Employee Salary Predictor")
st.write("Enter details to predict if income is >50K or <=50K")

# One input widget per model feature, declared in the model's feature order
fnlwgt = st.number_input(
    "fnlwgt (Census weight)",
    min_value=10000,
    max_value=1000000,
    value=200000,
)
age = st.slider("Age", 18, 90, 30)
capital_gain = st.number_input("Capital Gain", 0, 100000, 0)
hours_per_week = st.slider("Hours Worked per Week", 1, 80, 40)
married = st.selectbox("Marital Status", ["Married-civ-spouse", "Other"])

# Manual one-hot flag: 1 only for the 'Married-civ-spouse' choice
married_status = int(married == "Married-civ-spouse")

# Single-row frame whose column names match the ones the model was fitted with
input_df = pd.DataFrame([{
    'fnlwgt': fnlwgt,
    'age': age,
    'capital-gain': capital_gain,
    'hours-per-week': hours_per_week,
    'marital-status_Married-civ-spouse': married_status,
}])

# Run the model only when the user asks for a prediction
if st.button("Predict Salary Class"):
    prediction = model.predict(input_df)[0]
    if prediction == 1:
        result = ">50K"
    else:
        result = "<=50K"
    st.success(f"💰 Predicted Salary Class: {result}")




In [26]:
# Start the Streamlit server for app.py once more; the cell stays busy until
# the server is interrupted (captured output ends with ^C / "Stopping...").
!streamlit run app.py


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.0.108:8501[0m
[0m
^C
[34m  Stopping...[0m


In [28]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.
