<a href="https://colab.research.google.com/github/Akash4523-babu/Employee-Salary-Prediction-Project/blob/main/Employee_Salary_Prediction_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Step 0: Install necessary libraries (if not already installed in your Colab/Jupyter environment)
# !pip install pandas scikit-learn joblib matplotlib seaborn

import pandas as pd
import numpy as np # Import numpy for checking array types
import joblib
import matplotlib.pyplot as plt
import seaborn as sns # Good for EDA plots

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# --- Step 1: Load Dataset ---
# Read the raw CSV into `data`. Only the read itself sits inside the try block so
# that unrelated errors are not mistaken for a missing file. A missing file is
# re-raised with a readable message instead of calling exit(): inside a notebook
# exit() tears down the kernel (losing all state) rather than failing this cell.
try:
    data = pd.read_csv("Dataset.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: Dataset.csv not found. Please ensure it's in the correct directory."
    ) from err

print("Dataset loaded successfully.")
print(data.head())
print(f"Initial data shape: {data.shape}")

# --- Step 2: Exploratory Data Analysis (EDA) and Cleaning ---

# Report, per column, how many cells hold the '?' placeholder that this dataset
# uses for missing values. Columns without any placeholder are not listed.
# The count comes from a single vectorised equality test per column; this avoids
# testing a string for membership in a numeric NumPy array (an elementwise
# str-vs-number comparison) and avoids scanning the column a second time.
print("\nMissing values (represented by '?'):")
for column in data.columns:
    placeholder_count = int(data[column].eq('?').sum())
    if placeholder_count:
        print(f"'{column}': {placeholder_count} '?' values")

# Turn the '?' placeholder into an explicit 'Others' category in the three
# categorical columns that are cleaned this way; all other values are kept as-is.
for placeholder_column in ('workclass', 'occupation', 'native-country'):
    data[placeholder_column] = data[placeholder_column].replace('?', 'Others')

# Drop rows whose category was excluded during the earlier EDA (values with very
# few rows, or considered outliers). Filters are applied column by column in the
# order listed; each step keeps every row whose value is not in the excluded set.
excluded_categories = {
    'workclass': ['Without-pay', 'Never-worked'],
    'occupation': ['Armed-Forces'],
    'education': ['Preschool', '1st-4th', '5th-6th'],
}
for filter_column, unwanted_values in excluded_categories.items():
    data = data[~data[filter_column].isin(unwanted_values)]

# Range-based outlier filtering (bounds are inclusive on both ends):
#   age             -> keep 18..75
#   educational-num -> keep 5..16
# capital-gain, capital-loss and hours-per-week are deliberately left unfiltered;
# they were only inspected with box plots and are handled by scaling later on.
age_in_range = data['age'].between(18, 75)
data = data[age_in_range]

education_in_range = data['educational-num'].between(5, 16)
data = data[education_in_range]

print(f"\nData shape after cleaning: {data.shape}")

# --- Step 3: Define Features (X) and Target (y) ---
# `income` is the label to predict; every remaining column is a model input.
target_column = 'income'
y = data[target_column]
X = data.drop(columns=[target_column])

# --- Step 4: Preprocessing (Numerical and Categorical Feature Handling) ---

# Column names of X grouped by how they are preprocessed.
# These must match the dataset's column names exactly.
numerical_features = [
    'age',
    'fnlwgt',
    'educational-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]

# Numerical columns are standardised; categorical columns are one-hot encoded
# straight from their string values. Categories not seen during fit are ignored
# at transform time (handle_unknown='ignore') instead of raising.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    # Any column not named above is forwarded unchanged rather than dropped.
    remainder='passthrough',
)

# --- Step 5: Train-Test Split ---
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# --- Step 6: Apply Preprocessing to Training and Test Data ---
# The preprocessor learns its scaling statistics and category vocabulary from the
# training rows only; the held-out rows are transformed with those fitted values.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --- DIAGNOSTIC STEP (Crucial to verify preprocessing output) ---
# Prints the container type, dtype and shape of the processed training matrix and
# warns (with a small sample) if the dtype is not numeric.
print("\n--- Preprocessing Output Diagnostic ---")
print(f"Type of X_train_processed: {type(X_train_processed)}")

if hasattr(X_train_processed, 'toarray'):
    # Objects exposing .toarray() are treated as sparse and densified for inspection.
    print("X_train_processed is a sparse matrix. Converting to dense for dtype check.")
    inspected_matrix = X_train_processed.toarray()
    matrix_label = "X_train_processed (dense)"
else:
    # Already a dense array: inspect it directly.
    inspected_matrix = X_train_processed
    matrix_label = "X_train_processed"

print(f"dtype of {matrix_label}: {inspected_matrix.dtype}")
print(f"Shape of {matrix_label}: {inspected_matrix.shape}")
if not np.issubdtype(inspected_matrix.dtype, np.number):
    print("WARNING: Data still contains non-numeric types after preprocessing!")
    print(inspected_matrix[:2, :5])  # first 2 rows, first 5 columns for inspection

print("--- End Preprocessing Output Diagnostic ---")

# --- Step 7: Train and Evaluate Models ---
# Each candidate is fitted on the preprocessed training matrix and scored on the
# preprocessed hold-out matrix. `results` maps model name -> hold-out accuracy.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000), # Increased max_iter for convergence
    "RandomForest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

results = {}
best_model = None
best_model_name = None
# Start below any attainable accuracy so the first evaluated model always becomes
# the initial best. With a baseline of 0 and a strict '>' comparison, a run in
# which no model scored above 0 would leave best_model as None and
# best_model_name undefined when the summary line is printed.
best_accuracy = float("-inf")

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed) # Predict on processed data
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Strict '>' keeps the earliest model when two accuracies tie.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

print(f"\n✅ Best model: {best_model_name} with accuracy {best_accuracy:.4f}")

# --- Step 8: Save Best Model and Preprocessor ---
# Persist the winning classifier and the fitted preprocessor side by side; each
# entry is (object to store, destination file, confirmation message).
artifacts = (
    (best_model, "best_model.pkl", "✅ Saved best model as best_model.pkl"),
    (preprocessor, "preprocessing.pkl", "✅ Saved preprocessor as preprocessing.pkl"),
)
for fitted_object, target_path, confirmation in artifacts:
    joblib.dump(fitted_object, target_path)
    print(confirmation)

print("\nModel training and saving complete. You can now use these .pkl files for deployment.")

Dataset loaded successfully.
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-

In [8]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib

# Set Streamlit page configuration
st.set_page_config(page_title="Employee Salary Classification", page_icon="💼", layout="centered")


# --- Load the trained model and preprocessor ---
@st.cache_resource
def load_artifacts():
    """Load the fitted classifier and preprocessor from disk.

    Streamlit re-executes this script on every widget interaction. Caching the
    loaded objects means both files are read once and then reused across reruns,
    instead of being unpickled again each time a slider moves.
    The cached objects persist until the cache is cleared or the app restarts, so
    restart the app after regenerating the .pkl files.

    Returns:
        tuple: ``(model, preprocessor)`` as stored in ``best_model.pkl`` and
        ``preprocessing.pkl``.

    Raises:
        FileNotFoundError: if either file cannot be found at its relative path.
    """
    # NOTE: joblib.load unpickles arbitrary Python objects; only load files that
    # were produced by the training notebook.
    return joblib.load("best_model.pkl"), joblib.load("preprocessing.pkl")


try:
    model, preprocessor = load_artifacts()
    st.success("Model and preprocessor loaded successfully!")
except FileNotFoundError:
    st.error("Error: 'best_model.pkl' or 'preprocessing.pkl' not found. Please ensure they are in the same directory as app.py.")
    st.stop() # Stop the app if essential files are missing
except Exception as e:
    st.error(f"An error occurred loading the model/preprocessor: {e}")
    st.stop()


st.title("💼 Employee Salary Classification App")
st.markdown("Predict whether an employee earns **>50K** or **≤50K** based on input features.")

# --- Sidebar inputs for user to provide details ---
# Numerical inputs. Widget bounds mirror the value ranges kept or observed in the
# training notebook (age 18-75 and educational-num 5-16 are the training filters).
with st.sidebar:
    st.header("Input Employee Details")

    age = st.slider("Age", min_value=18, max_value=75, value=30)
    hours_per_week = st.slider("Hours per week", min_value=1, max_value=99, value=40)
    capital_gain = st.number_input("Capital Gain", min_value=0, max_value=99999, value=0)
    capital_loss = st.number_input("Capital Loss", min_value=0, max_value=4356, value=0)
    fnlwgt = st.number_input(
        "Final Weight (fnlwgt)", min_value=12285, max_value=1490400, value=190000
    )
    educational_num = st.slider("Educational Number", min_value=5, max_value=16, value=9)

# Categorical inputs: each select box offers the raw string values that the
# preprocessor one-hot encodes. The first entry of every list is the default.
WORKCLASS_OPTIONS = [
    "Private", "Self-emp-not-inc", "Local-gov", "Others", "State-gov",
    "Self-emp-inc", "Federal-gov",
]
EDUCATION_OPTIONS = [
    "Bachelors", "Masters", "Doctorate", "HS-grad", "Assoc-acdm", "Some-college",
    "11th", "10th", "7th-8th", "Prof-school", "9th", "12th", "Assoc-voc",
]
MARITAL_STATUS_OPTIONS = [
    "Married-civ-spouse", "Never-married", "Divorced", "Separated",
    "Widowed", "Married-spouse-absent", "Married-AF-spouse",
]
OCCUPATION_OPTIONS = [
    "Prof-specialty", "Craft-repair", "Exec-managerial", "Adm-clerical", "Sales",
    "Other-service", "Machine-op-inspct", "Others", "Transport-moving",
    "Handlers-cleaners", "Farming-fishing", "Tech-support", "Protective-serv",
    "Priv-house-serv",
]
RELATIONSHIP_OPTIONS = [
    "Husband", "Not-in-family", "Own-child", "Unmarried", "Wife", "Other-relative",
]
RACE_OPTIONS = [
    "White", "Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other",
]
GENDER_OPTIONS = ["Male", "Female"]
NATIVE_COUNTRY_OPTIONS = [
    "United-States", "Mexico", "Others", "Philippines", "Germany", "Puerto-Rico",
    "Canada", "El-Salvador", "India", "Cuba", "England", "China", "South",
    "Jamaica", "Italy", "Dominican-Republic", "Japan", "Guatemala", "Poland",
    "Vietnam", "Columbia", "Haiti", "Portugal", "Taiwan", "Iran", "Nicaragua",
    "Greece", "Peru", "Ecuador", "France", "Ireland", "Thailand", "Hong",
    "Cambodia", "Trinadad&Tobago", "Laos", "Outlying-US(Guam-USVI-etc)",
    "Yugoslavia", "Scotland", "Honduras", "Hungary", "Holand-Netherlands",
]

workclass = st.sidebar.selectbox("Workclass", WORKCLASS_OPTIONS)
education = st.sidebar.selectbox("Education Level", EDUCATION_OPTIONS)
marital_status = st.sidebar.selectbox("Marital Status", MARITAL_STATUS_OPTIONS)
occupation = st.sidebar.selectbox("Job Role", OCCUPATION_OPTIONS)
relationship = st.sidebar.selectbox("Relationship", RELATIONSHIP_OPTIONS)
race = st.sidebar.selectbox("Race", RACE_OPTIONS)
gender = st.sidebar.selectbox("Gender", GENDER_OPTIONS)
native_country = st.sidebar.selectbox("Native Country", NATIVE_COUNTRY_OPTIONS)


# --- Build input DataFrame for single prediction ---
# One-row frame holding the sidebar values. Column names and their order follow
# the feature frame the preprocessor was fitted on in the notebook:
#   age, workclass, fnlwgt, education, educational-num, marital-status, occupation,
#   relationship, race, gender, capital-gain, capital-loss, hours-per-week,
#   native-country
feature_values = (
    ('age', age),
    ('workclass', workclass),
    ('fnlwgt', fnlwgt),
    ('education', education),
    ('educational-num', educational_num),
    ('marital-status', marital_status),
    ('occupation', occupation),
    ('relationship', relationship),
    ('race', race),
    ('gender', gender),
    ('capital-gain', capital_gain),
    ('capital-loss', capital_loss),
    ('hours-per-week', hours_per_week),
    ('native-country', native_country),
)
# Each value is wrapped in a single-element list so the frame has exactly one row.
input_data = {column_name: [value] for column_name, value in feature_values}

input_df = pd.DataFrame(input_data)

st.write("### 🔎 Input Data")
st.write(input_df)

# --- Predict button for single prediction ---
# On click: run the one-row frame through the fitted preprocessor, classify it,
# and show the predicted label. Any failure is reported in the UI, not raised.
predict_clicked = st.button("Predict Salary Class")
if predict_clicked:
    try:
        features_for_model = preprocessor.transform(input_df)
        predicted_label = model.predict(features_for_model)[0]
        st.success(f"✅ Prediction: Employee Salary is {predicted_label}")
    except Exception as e:
        st.error(f"An error occurred during prediction: {e}")
        st.info("Please check the input values and ensure the model is correctly loaded.")

# --- Batch prediction ---
# Upload a CSV, classify every row, preview the result and offer it for download.
st.markdown("---")
st.markdown("#### 📂 Batch Prediction")
uploaded_file = st.file_uploader("Upload a CSV file for batch prediction", type="csv")

# Feature columns the preprocessor was fitted on, in training order
# (the same columns the single-prediction form builds).
BATCH_FEATURE_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status',
    'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country',
]
# Columns in which the training notebook replaced the '?' placeholder with 'Others'.
BATCH_PLACEHOLDER_COLUMNS = ['workclass', 'occupation', 'native-country']

if uploaded_file is not None:
    try:
        # Reading happens inside the try block so an empty or malformed upload is
        # reported in the UI instead of surfacing as an unhandled traceback.
        batch_data = pd.read_csv(uploaded_file)
        st.write("Uploaded data preview:", batch_data.head())

        missing_columns = [
            column for column in BATCH_FEATURE_COLUMNS if column not in batch_data.columns
        ]
        if missing_columns:
            st.error(
                "The uploaded CSV is missing required columns: "
                + ", ".join(missing_columns)
            )
        else:
            # Work on a copy of just the feature columns so the user's original
            # values (and any extra columns) are preserved in the download.
            batch_features = batch_data[BATCH_FEATURE_COLUMNS].copy()

            # Apply the same placeholder cleaning as training. Without it a raw
            # '?' is an unseen category and is encoded as all zeros, not 'Others'.
            batch_features[BATCH_PLACEHOLDER_COLUMNS] = (
                batch_features[BATCH_PLACEHOLDER_COLUMNS].replace('?', 'Others')
            )

            # Preprocess the batch data
            processed_batch_data = preprocessor.transform(batch_features)

            # Make predictions
            batch_preds = model.predict(processed_batch_data)
            batch_data['PredictedClass'] = batch_preds

            st.write("✅ Predictions:")
            st.write(batch_data.head())

            # Provide download button for predictions
            csv = batch_data.to_csv(index=False).encode('utf-8')
            st.download_button(
                "Download Predictions CSV",
                csv,
                file_name='predicted_classes.csv',
                mime='text/csv'
            )
    except Exception as e:
        st.error(f"An error occurred during batch prediction: {e}")
        st.info("Please ensure the uploaded CSV has the correct columns and data format as the original dataset.")

Writing app.py


In [3]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInst