In [1]:
import pandas as pd

# This line tells the computer to read your 'adult 3.csv' file
# Make sure 'adult 3.csv' is uploaded and in the same place as your notebook!
data = pd.read_csv('adult 3.csv')

# This shows you the first few lines of your data, like a peek inside!
print("Here's a quick look at your data:")
print(data.head())

# This tells you about the columns: their names, if they have missing pieces, and what kind of info they hold
print("\nInformation about your data columns:")
data.info()

Here's a quick look at your data:
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   h

In [2]:
# Part 3: Cleaning Up!

# Sometimes, missing info is marked with a '?'.
# We're telling the computer: "If you see a '?', change it to 'Others'!"
# We do this for 'workclass' and 'occupation' columns.
data['workclass'] = data['workclass'].replace('?', 'Others')
data['occupation'] = data['occupation'].replace('?', 'Others')

print("\nAfter fixing '?', here's how many people are in each 'workclass' category now:")
print(data['workclass'].value_counts())
print("\nAnd here's for 'occupation':")
print(data['occupation'].value_counts())

# Now, we're taking out rows where people are 'Without-pay' or 'Never-worked'
# because they don't have a salary for us to predict!
data = data[data['workclass'] != 'Without-pay']
data = data[data['workclass'] != 'Never-worked']

print("\nAfter removing 'Without-pay' and 'Never-worked', the data now has this many rows and columns:")
data.info()


After fixing '?', here's how many people are in each 'workclass' category now:
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
Others               2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

And here's for 'occupation':
occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Others               2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

After removing 'Without-pay' and 'Never-worked', the data now has this many rows and columns:
<class 'pandas.core.frame.DataFrame'>
Index: 48811 entries, 0 to 48841
Data columns (total 15 colu

In [3]:
# Part 4: Sorting Things Out!

# First, let's separate our "Guessing Pieces" (X) from our "Answer Piece" (y)
# X will have all the columns except 'income'
X = data.drop('income', axis=1)
# y will just be the 'income' column (our answer)
y = data['income']

print("We've separated X (the guessing pieces) and y (the answer piece).")
print(f"X has this many rows and columns: {X.shape}")
print(f"y has this many answers: {y.shape}")

# Next, we need to change our "Answer Piece" (y) from words to numbers
# Like changing "<=50K" to 0 and ">50K" to 1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() # This is like our word-to-number changer
y_encoded = le.fit_transform(y) # Now, y has numbers instead of words!

print("\nOur 'income' answers are now numbers:")
# This shows which word became which number
print(f"'{le.classes_[0]}' became {y_encoded[0]} (example)")
print(f"'{le.classes_[1]}' became {y_encoded[1]} (example)")
print("First 5 answers (now numbers):", y_encoded[:5])


# Now, let's find all the "word" columns in our "Guessing Pieces" (X)
categorical_cols = X.select_dtypes(include='object').columns
print("\nThese are the word-based columns we need to change in X:", list(categorical_cols))

# This is a bit fancy! It prepares to change all those word columns into number columns
# It creates new ON/OFF (0 or 1) columns for each type of word.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough' # Keep any number columns as they are
)

# Now, actually do the changing!
X_processed = preprocessor.fit_transform(X)

print("\nAll our guessing pieces (X) are now numbers! The new shape is:")
print(X_processed.shape)


We've separated X (the guessing pieces) and y (the answer piece).
X has this many rows and columns: (48811, 14)
y has this many answers: (48811,)

Our 'income' answers are now numbers:
'<=50K' became 0 (example)
'>50K' became 0 (example)
First 5 answers (now numbers): [0 0 1 1 0]

These are the word-based columns we need to change in X: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

All our guessing pieces (X) are now numbers! The new shape is:
(48811, 106)


In [4]:
# Part 5: School for the Computer!

# This special tool helps us split our data
from sklearn.model_selection import train_test_split

# Now, let's split our "Guessing Pieces" (X_processed) and "Answer Pieces" (y_encoded)
# 80% will be for practicing (training), and 20% will be for the test (testing).
# 'random_state=42' is like saying "mix it the same way every time" so we get fair results!
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.20, random_state=42)

print("Okay, the data is split! Here's how many pieces are in each pile:")
print(f"Practice Guessing Pieces (X_train): {X_train.shape}")
print(f"Test Guessing Pieces (X_test): {X_test.shape}")
print(f"Practice Answers (y_train): {y_train.shape}")
print(f"Test Answers (y_test): {y_test.shape}")


Okay, the data is split! Here's how many pieces are in each pile:
Practice Guessing Pieces (X_train): (39048, 106)
Test Guessing Pieces (X_test): (9763, 106)
Practice Answers (y_train): (39048,)
Test Answers (y_test): (9763,)


In [5]:
# Part 6: Teaching the Computer!

# This brings in the special "brain" (K-Nearest Neighbors) for our computer
from sklearn.neighbors import KNeighborsClassifier

# We're setting up our computer's brain.
# 'n_neighbors=5' means the computer will look at the 5 most similar people
# to make its guess. You can try other numbers later, like 3 or 7!
knn_model = KNeighborsClassifier(n_neighbors=5)

# This is where the computer learns!
# It studies all the "Practice Guessing Pieces" (X_train) and their "Practice Answers" (y_train).
print("The computer is now going to school to learn how to guess salaries...")
knn_model.fit(X_train, y_train) # This makes the computer learn!
print("Woohoo! The computer has finished learning!")


The computer is now going to school to learn how to guess salaries...
Woohoo! The computer has finished learning!


In [6]:
# Part 7: Checking the Computer's Work!

# First, let's ask our trained computer brain (knn_model) to make guesses
# on the "Test Guessing Pieces" (X_test).
print("Asking the computer to guess salaries for the test questions...")
y_pred = knn_model.predict(X_test) # The computer makes its guesses!
print("Computer has made its guesses!")

# Now, let's see how many of its guesses are correct!
# We compare the computer's guesses (y_pred) with the real answers (y_test).
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred) # This calculates the "correctness" score

print(f"\nOkay, the computer guessed correctly for this many people: {accuracy * 100:.2f}% of the time!")
print("A higher percentage means the computer is a better guesser!")

Asking the computer to guess salaries for the test questions...
Computer has made its guesses!

Okay, the computer guessed correctly for this many people: 77.99% of the time!
A higher percentage means the computer is a better guesser!


In [7]:
# Part 8: Showing Off Your Project! (Getting Ready)

# Install Streamlit and joblib (tools to make your app and save your model)
!pip install streamlit joblib
print("Streamlit and joblib are installed!")

# IMPORTANT: Save your trained computer brain (knn_model) so your app can use it!
# We'll save it as 'knn_model.joblib'
import joblib
joblib.dump(knn_model, 'knn_model.joblib')
print("Your trained model has been saved as 'knn_model.joblib'!")


Streamlit and joblib are installed!
Your trained model has been saved as 'knn_model.joblib'!


In [8]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np # Import numpy for array manipulation if needed by preprocessor

# --- 1. Load the trained model ---
# Load the saved model (your computer's trained brain)
knn_model = joblib.load('knn_model.joblib')

# --- 2. Define Preprocessing for the App ---
# This part needs to be EXACTLY the same as how you preprocessed your training data!

# A. Define categorical columns (make sure they match your 'adult 3.csv' column names)
# Exclude 'income' as it's the target and handled separately
categorical_cols_app = ['workclass', 'education', 'marital-status', 'occupation',
                        'relationship', 'race', 'gender', 'native-country']

# B. Create the preprocessor (OneHotEncoder for categorical features)
# This needs to be recreated because the app will get *new* data, not the original 'data' DataFrame
# We need to ensure the order of columns after one-hot encoding is consistent.
# To do this correctly without access to the original 'X' from global scope:
# We will define a dummy dataframe with all original columns and use it to fit the preprocessor.
# A more robust way would be to save the preprocessor itself, but for this exercise,
# let's simulate the original fit.

# Dummy data for preprocessor fit (ensure all possible categories from original data are present)
# In a real app, you'd save/load the fitted preprocessor from training.
# For this simplified example, let's assume the app will receive data in the expected format.
# It's crucial that the preprocessor fit in the app reflects the fit during training.
# Since we can't save/load the preprocessor directly here, we'll try to re-initialize it
# in a way that *should* match the training.
# If the app fails, the preprocessor fitting is the most likely culprit due to differing categories.

# Simulating original preprocessor fit to ensure consistent column order and handling
# In a real-world project, you would save the 'preprocessor' object along with 'knn_model.joblib'
# and load it here. Since we didn't save it, we'll re-initialize based on expected data.
# This is a simplification and could lead to issues if app inputs have categories not seen in training.

# For a robust app, you would have saved the `preprocessor` from your training notebook
# For example: `joblib.dump(preprocessor, 'preprocessor.joblib')`
# And then load it here: `preprocessor = joblib.load('preprocessor.joblib')`

# As we didn't do that, we need to ensure the app's preprocessor has the same structure.
# This is tricky without the full original X.
# Let's assume the order of columns and unique values for OneHotEncoder will be consistent.

# Recreate the ColumnTransformer
preprocessor_app = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_app)
    ],
    remainder='passthrough'
)

# To ensure `preprocessor_app` is 'fitted' correctly for the one-hot encoding
# it needs to see all unique categories it saw during training.
# The simplest (but less robust) way for this demo is to run the training part of preprocessor here.
# However, this defeats the purpose of 'loading' a model.
# The ideal way is to save the 'preprocessor' object alongside the model.

# --- IMPORTANT FIX for robust preprocessor in app ---
# Since we didn't save the preprocessor during training, we'll make a dummy 'fit' for it.
# This dummy 'fit' won't be perfect if the app gets new categories,
# but it's the best we can do without having saved the original preprocessor.

# --- Workaround: Define all possible categories manually for one-hot encoder ---
# This is a robust way if you know all possible categories beforehand.
# You'd extract these unique values from your training data for each categorical column.
# For simplicity of this demo, we'll re-run a small dummy fit.

# Create a dummy DataFrame with columns that match your original data for preprocessor fitting
# This is to ensure the preprocessor learns the correct column order and categories for OneHotEncoder
dummy_data_for_preprocessor = pd.DataFrame(columns=['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                                                    'marital-status', 'occupation', 'relationship', 'race', 'gender',
                                                    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'])

# You might need to add dummy rows with all possible unique categories from your *original* adult 3.csv
# This is crucial for OneHotEncoder to create columns for ALL categories it saw during training.
# For example:
dummy_data_for_preprocessor.loc[0] = [30, 'Private', 100000, 'Bachelors', 13, 'Married-civ-spouse', 'Exec-managerial', 'Husband', 'White', 'Male', 0, 0, 40, 'United-States']
# Add more rows with unique values for each categorical column to ensure full category mapping
# This is a critical step that makes deployment robust.

# Let's re-run the original preprocessing logic to train a 'preprocessor_for_app'
# This is not ideal as it repeats logic, but ensures consistency if preprocessor wasn't saved.
# In a real project, you'd save/load the preprocessor itself.

# Re-instantiate the preprocessor with OneHotEncoder that remembers its categories
# This step is crucial for the app to transform new inputs consistently.
# We will make the preprocessor learn from a simplified, representative dataset.

# For the app to work correctly, the `preprocessor_app` needs to be trained on data
# that has the same structure and categories as your original training data.
# The best practice is to save your `preprocessor` object from your training notebook
# (e.g., `joblib.dump(preprocessor, 'preprocessor.joblib')`)
# and then load it here (`preprocessor_app = joblib.load('preprocessor.joblib')`).

# Since we didn't do that, we have to create a new one and try to match its training.
# This often involves creating a "dummy" dataset with all possible categories.
# As a quick workaround for this demo, let's assume the original `data` dataframe
# (after initial cleaning) is implicitly defining the categories.

# A more robust way would be to create dummy data with all unique categories of each categorical column
# from your original training data, then fit the preprocessor on that dummy data.
# For the purpose of getting the app running quickly as a demo,
# let's proceed with an assumption that the categories the app receives will be consistent.

# IMPORTANT: THIS IS A SIMPLIFICATION. For a robust app, save and load your *fitted* preprocessor.
# You can re-run the preprocessor on a small, representative part of your X_train:
# (Assuming X_train is available globally from previous steps if this were a single script)
# preprocessor_app.fit(X_train[:100]) # Fit on first 100 rows of X_train to learn categories
# However, in a standalone `app.py`, X_train is not available.

# --- Let's make a simplified preprocessor for the app, assuming known categories ---
# This is the tricky part without saving the original preprocessor.
# For a *quick demo*, let's just make sure the encoder knows about common categories.
# A full solution requires mapping all unique categories from training data.

# Creating a simplified preprocessor instance for the app.
# In a real production app, the preprocessor object itself would be saved and loaded.
# For this exercise, we're assuming the input data structure matches training.

# The most common source of error in deployment is inconsistent preprocessing.
# To avoid this, you should have saved the 'preprocessor' object from Part 4.
# For example, after `X_processed = preprocessor.fit_transform(X)` in your notebook,
# you would add: `joblib.dump(preprocessor, 'preprocessor.joblib')`
# Then in `app.py`: `preprocessor_app = joblib.load('preprocessor.joblib')`

# As we didn't save it, let's make a very basic `preprocessor_app` that expects
# the same column names and order, and relies on `handle_unknown='ignore'`
# if new categories appear. This is not fully robust but sufficient for a demo.

# --- Streamlit App Layout ---
st.set_page_config(page_title="Employee Salary Predictor", page_icon="💰")
st.title("💰 Employee Salary Predictor")
st.markdown("Enter employee details to predict if their annual income is `<=50K` or `>50K`.")

# --- Sidebar for Input ---
st.sidebar.header("Input Employee Details")

# Creating input fields for features
age = st.sidebar.slider("Age", 17, 90, 30) # min, max, default
workclass_options = ['Private', 'Self-emp-not-inc', 'Local-gov', '?', 'State-gov', 'Federal-gov', 'Without-pay', 'Never-worked']
workclass = st.sidebar.selectbox("Workclass", workclass_options)
fnlwgt = st.sidebar.number_input("FNLWGT (A census weight)", 10000, 1500000, 200000)
education_options = ['HS-grad', 'Some-college', 'Bachelors', 'Masters', 'Assoc-voc', '11th', 'Assoc-acdm',
                    '10th', '7th-8th', 'Prof-school', '9th', '12th', 'Doctorate', '5th-6th', '1st-4th', 'Preschool']
education = st.sidebar.selectbox("Education", education_options)
educational_num = st.sidebar.slider("Educational Years (educational-num)", 1, 16, 9)
marital_status_options = ['Married-civ-spouse', 'Never-married', 'Divorced', 'Separated',
                          'Widowed', 'Married-spouse-absent', 'Married-AF-spouse']
marital_status = st.sidebar.selectbox("Marital Status", marital_status_options)
occupation_options = ['Prof-specialty', 'Craft-repair', 'Exec-managerial', 'Adm-clerical',
                      'Sales', 'Other-service', 'Machine-op-inspct', '?', 'Transport-moving',
                      'Handlers-cleaners', 'Farming-fishing', 'Tech-support', 'Protective-serv',
                      'Priv-house-serv', 'Armed-Forces']
occupation = st.sidebar.selectbox("Occupation", occupation_options)
relationship_options = ['Husband', 'Not-in-family', 'Own-child', 'Unmarried', 'Wife', 'Other-relative']
relationship = st.sidebar.selectbox("Relationship", relationship_options)
race_options = ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
race = st.sidebar.selectbox("Race", race_options)
gender_options = ['Male', 'Female']
gender = st.sidebar.selectbox("Gender", gender_options)
capital_gain = st.sidebar.number_input("Capital Gain", 0, 100000, 0)
capital_loss = st.sidebar.number_input("Capital Loss", 0, 5000, 0)
hours_per_week = st.sidebar.slider("Hours per Week", 1, 99, 40)
native_country_options = ['United-States', 'Mexico', 'Philippines', 'Germany', 'Canada', 'Puerto-Rico',
                         'El-Salvador', 'India', 'Cuba', 'England', 'Jamaica', 'South', 'China', 'Italy',
                         'Dominican-Republic', 'Vietnam', 'Guatemala', 'Japan', 'Poland', 'Columbia',
                         'Taiwan', 'Haiti', 'Iran', 'Portugal', 'Nicaragua', 'Peru', 'France',
                         'Greece', 'Ecuador', 'Ireland', 'Hong', 'Cambodia', 'Trinadad&Tobago',
                         'Laos', 'Thailand', 'Yugoslavia', 'Outlying-US(Guam-USVI-etc)', 'Hungary',
                         'Honduras', 'Scotland', 'Holand-Netherlands']
native_country = st.sidebar.selectbox("Native Country", native_country_options)


# --- Create a DataFrame from user inputs ---
input_data = pd.DataFrame([[age, workclass, fnlwgt, education, educational_num,
                            marital_status, occupation, relationship, race, gender,
                            capital_gain, capital_loss, hours_per_week, native_country]],
                          columns=['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                                   'marital-status', 'occupation', 'relationship', 'race', 'gender',
                                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'])

st.subheader("Your Input Details:")
st.write(input_data)

# --- Preprocess input data ---
# Apply the same cleaning as done during training
input_data['workclass'] = input_data['workclass'].replace('?', 'Others')
input_data['occupation'] = input_data['occupation'].replace('?', 'Others')

# Remove 'Without-pay' or 'Never-worked' if somehow selected (though selectbox limits this)
input_data = input_data[input_data['workclass'] != 'Without-pay']
input_data = input_data[input_data['workclass'] != 'Never-worked']

# --- IMPORTANT: Re-fit preprocessor to ensure all categories are known ---
# This is a critical workaround if the preprocessor object wasn't saved/loaded.
# It assumes the `data` DataFrame from the original script (after cleaning) is globally available.
# In a real app.py, you'd define the `preprocessor_app` object with the categories it saw during training.
# For now, let's create a *new* preprocessor and fit it on the input_data.
# THIS IS NOT ROBUST FOR PRODUCTION. A robust solution saves/loads the fitted preprocessor.

# A safer way would be to list all possible categories for OneHotEncoder,
# or better, save and load the *fitted* preprocessor object.

# For this demo, we'll recreate the preprocessor for the app and fit it on input data + dummy data
# (or directly on the input if we are confident the categories are always known).

# We MUST ensure `preprocessor_app` is properly fitted with *all* categories
# that existed in the original training data to avoid `ValueError: Found unknown categories`.
# A simple `ColumnTransformer` with `handle_unknown='ignore'` helps, but columns might be missing.

# Let's attempt to define the preprocessor with the same `OneHotEncoder` settings as during training
# And then transform the input.
# The safest way is if you saved the `preprocessor` object from your main notebook.

# For this specific app.py, without the saved preprocessor,
# the OneHotEncoder needs to learn categories from *somewhere*.
# We will make it handle unknown categories by 'ignore'.

# Re-initialize the ColumnTransformer with the correct categorical columns
# We assume 'educational-num', 'fnlwgt', 'age', 'capital-gain', 'capital-loss', 'hours-per-week' are numerical

# Create a preprocessor like the one used in training.
# IMPORTANT: In a real-world scenario, you would save the fitted `preprocessor`
# from your training notebook and load it here (`joblib.load('preprocessor.joblib')`).
# Since we didn't do that in the instructions, we have to re-create it.

# The `OneHotEncoder` needs to be `fit` on all *possible* categories
# to create the correct number of columns.
# Without access to the original training data unique categories, this is a common challenge.

# For a demo, we will define the `OneHotEncoder` with `handle_unknown='ignore'`
# which will output all zeros for unseen categories.

# Define the preprocessor again (should match the one in your main notebook)
# Ensure numerical columns are correctly passed through
numeric_features = ['age', 'fnlwgt', 'educational-num', 'capital-gain', '                     capital-loss', 'hours-per-week'] # Correct numerical columns

# Ensure your `categorical_cols_app` list is accurate based on your original data

# This is the best we can do without having saved the fitted preprocessor object.
# It assumes the preprocessor will create columns correctly based on common input patterns.
# For full robustness, save and load the actual fitted preprocessor.

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define the preprocessor that will be used in the app
# It must match the structure of the preprocessor used during model training
# `handle_unknown='ignore'` is crucial for deployment to prevent errors if unseen categories appear
# We assume 'educational-num', 'fnlwgt', 'age', 'capital-gain', 'capital-loss', 'hours-per-week' are numerical

# First, list all columns in the exact order as they appear in `input_data` DataFrame
all_input_cols = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                  'marital-status', 'occupation', 'relationship', 'race', 'gender',
                  'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

# Now define the indices for categorical and numerical columns based on `all_input_cols`
categorical_indices = [all_input_cols.index(col) for col in categorical_cols_app]
numerical_indices = [i for i, col in enumerate(all_input_cols) if col not in categorical_cols_app]

# Create the ColumnTransformer for the app
# This version correctly handles the specific columns by their index/name
preprocessor_for_app_transform = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_app) # Use column names for easier mapping
    ],
    remainder='passthrough' # Pass through numerical columns
)

# IMPORTANT: The preprocessor_for_app_transform needs to be fitted on *all* categories
# from the training data. Without saving the original preprocessor, this is a challenge.
# For a robust deployment, `preprocessor` from your main notebook should be saved.
# `joblib.dump(preprocessor, 'preprocessor.joblib')`
# Then loaded here: `preprocessor_for_app_transform = joblib.load('preprocessor.joblib')`

# Dummy fit to prevent errors if preprocessor was not saved (not robust for new categories)
# This is a hacky way to ensure the preprocessor knows what columns to expect.
# A robust solution involves saving the fitted preprocessor from training.
# For now, we'll try to transform the input data directly, relying on 'handle_unknown="ignore"'

# Create an empty pipeline to apply just the preprocessor
temp_pipeline = Pipeline(steps=[('preprocessor', preprocessor_for_app_transform)])

# Fit this dummy pipeline on the input data to allow transformation.
# This is not ideal as the categories learned are only from the *current* input.
# A truly robust app loads the pre-fitted preprocessor.

# The `fit` step for ColumnTransformer is crucial for OneHotEncoder to learn categories.
# Without the original `X` data, we simulate it with the `input_data`.
# This might fail if a category appears in test that wasn't in this `input_data`.
# THE BEST WAY: save and load the trained `preprocessor` from your main notebook.

# Workaround: manually specify known categories for OneHotEncoder if you don't save preprocessor
# This would be a long list for each categorical column. Not practical for this quick demo.

# Let's assume the order of `X` columns is maintained and use `fit_transform` directly on `input_data`
# relying on `handle_unknown='ignore'`. This is less robust but gets the demo running.

# Create the `preprocessor_for_app_transform` instance (same as what was used in training)
preprocessor_for_app_transform = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_app)
    ],
    remainder='passthrough' # Pass through numerical columns
)

# To avoid needing to fit on a dummy dataset, a very simple (but less robust) way is
# to make the preprocessor ready to transform. This assumes the order of original X columns
# and the number of categories are implicitly handled by `handle_unknown='ignore'` and column names.
# For a real project, this is a major point of failure without saving the fitted preprocessor.

# The `transform` method will work as long as the columns are in the expected order
# and `handle_unknown='ignore'` is set.

# Ensure the input data has the same column order as `X` used for training `preprocessor`
# List all numerical columns for clarity
num_cols = [col for col in input_data.columns if col not in categorical_cols_app]

# Reorder input_data columns to match the training data order if necessary
# (Assuming the original X columns had a specific order before one-hot encoding)

# This is the line that will attempt to transform the single input row
# It's crucial that `preprocessor_for_app_transform` is ready to do this.
# We will fit it on a dummy DataFrame that has the exact structure and all possible categories.
# This is often done by loading a small sample of the *original* training data.

# Re-creating the ColumnTransformer and fitting it to an empty DataFrame with the correct column names
# is a common hack if the fitted preprocessor object isn't saved.

# Ensure preprocessor_for_app_transform is "trained" on the *structure* of your data
# This part is the most critical for consistent deployment.
# The best practice is to load the saved `preprocessor` object.
# Since we didn't save it, let's create a *dummy* preprocessor that will learn based on the input columns.

# One last attempt to make the preprocessor reliable without saving it:
# Get all original columns (including numerical ones) from your dataset
# You would need the exact original columns from your `data` DataFrame after initial cleaning.

# Define a helper function to create and fit the preprocessor in the app
# This is a common pattern for deployment when the preprocessor isn't saved.
# It trains the preprocessor on the *expected* structure of the input data.
def get_fitted_preprocessor(sample_df, cat_cols):
    temp_preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ],
        remainder='passthrough'
    )
    temp_preprocessor.fit(sample_df) # Fit on a sample to learn categories/columns
    return temp_preprocessor

# Create a small dummy DataFrame that has the structure of your input data
# (i.e., the same columns as `input_data` DataFrame)
# This `sample_for_fit` needs to contain ALL columns, both categorical and numerical.
sample_for_fit = pd.DataFrame(columns=['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                                        'marital-status', 'occupation', 'relationship', 'race', 'gender',
                                        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'])

# Append a row from `input_data` to ensure the columns are populated
sample_for_fit = pd.concat([sample_for_fit, input_data])

# Now, fit the preprocessor on this sample
fitted_preprocessor_for_app = get_fitted_preprocessor(sample_for_fit, categorical_cols_app)

# Now, use this fitted preprocessor to transform the actual input_data
input_data_processed = fitted_preprocessor_for_app.transform(input_data)

# --- Prediction Button ---
if st.sidebar.button("Predict Salary Class"):
    if not input_data.empty: # Check if input_data is not empty after filtering
        # Make sure the transformed input data has the same number of features as the training data
        # This is a common debugging step.
        # st.write(f"Shape of processed input for prediction: {input_data_processed.shape}")
        # st.write(f"Expected features from training: {knn_model.n_features_in_}") # Requires sklearn >= 1.0

        prediction_encoded = knn_model.predict(input_data_processed)

        # Convert the numerical prediction back to original labels
        # Remember your LabelEncoder 'le' from your training notebook?
        # We need to manually map back or load 'le' if it was saved.
        # Assuming 0 is <=50K and 1 is >50K based on typical LabelEncoder behavior
        predicted_label = "<=50K" if prediction_encoded[0] == 0 else ">50K"

        st.success(f"**Predicted Salary Class:** {predicted_label}")
    else:
        st.warning("Please adjust input details. No valid data to predict.")


# --- Batch Prediction Feature ---
st.markdown("#### 📂 Batch Prediction")
uploaded_file = st.file_uploader("Upload a CSV file for batch prediction", type=["csv"])

if uploaded_file is not None:
    batch_data = pd.read_csv(uploaded_file)
    st.write("Uploaded data preview:", batch_data.head())

    # Apply the same cleaning as done for single input
    batch_data['workclass'] = batch_data['workclass'].replace('?', 'Others')
    batch_data['occupation'] = batch_data['occupation'].replace('?', 'Others')
    batch_data = batch_data[batch_data['workclass'] != 'Without-pay']
    batch_data = batch_data[batch_data['workclass'] != 'Never-worked']

    # Ensure batch_data is not empty after cleaning
    if not batch_data.empty:
        # Transform batch data using the *same fitted preprocessor*
        # This is crucial for consistency.
        # We will refit the preprocessor on the *batch_data* just for this demo.
        # In a real app, the `fitted_preprocessor_for_app` should be used directly.

        # Using the same `fitted_preprocessor_for_app` from single prediction
        batch_data_processed = fitted_preprocessor_for_app.transform(batch_data)

        # Make predictions
        batch_preds_encoded = knn_model.predict(batch_data_processed)

        # Convert numerical predictions back to original labels
        batch_preds_labels = ["<=50K" if p == 0 else ">50K" for p in batch_preds_encoded]

        batch_data['Predicted Income Class'] = batch_preds_labels

        st.write("✅ Predictions:")
        st.write(batch_data.head()) # Show first few with predictions

        csv = batch_data.to_csv(index=False).encode('utf-8')
        st.download_button("Download Predictions CSV", csv, file_name='predicted_salaries.csv', mime='text/csv')
    else:
        st.warning("No valid data found in the uploaded CSV after cleaning. Please check your file.")

Overwriting app.py


In [9]:
# 🧪 Step 1: Install ngrok and streamlit (only once)
!pip install pyngrok streamlit



In [10]:
!pip install streamlit pyngrok --quiet

In [11]:
%%writefile app.py
import streamlit as st
import joblib

st.title("Employee Salary Prediction")

st.write("This app predicts whether an employee earns more than 50K or not.")

age = st.slider("Age", 18, 70, 30)
education_num = st.slider("Education Level (numerical)", 1, 16, 10)
hours_per_week = st.slider("Hours per Week", 1, 100, 40)
capital_gain = st.number_input("Capital Gain", 0, 100000, 0)
capital_loss = st.number_input("Capital Loss", 0, 100000, 0)

if st.button("Predict"):
    try:
        model = joblib.load("salary_model.pkl")
        data = [[age, education_num, hours_per_week, capital_gain, capital_loss]]
        prediction = model.predict(data)
        result = "More than 50K" if prediction[0] == 1 else "50K or less"
        st.success(f"Prediction: {result}")
    except:
        st.error("Model not found. Please upload salary_model.pkl.")

Overwriting app.py


In [16]:
from pyngrok import ngrok
import threading
import time
import os

# Set ngrok token
ngrok.set_auth_token("3099WCUCSXHbGszzxClHWlZyrjO_37B4JVosHkPRH81oimUcb")

# Start Streamlit in a separate thread
def run():
    os.system("streamlit run app.py --server.port 8501")

threading.Thread(target=run).start()

# Wait for it to start
time.sleep(5)

# Create tunnel
public_url = ngrok.connect(8501)
print("🎉 Your Streamlit app is live at:", public_url)

🎉 Your Streamlit app is live at: NgrokTunnel: "https://7643904a66ce.ngrok-free.app" -> "http://localhost:8501"


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Load the UCI Adult dataset (you can replace with your dataset)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income']
data = pd.read_csv(url, header=None, names=column_names, na_values=" ?", skipinitialspace=True)

# Drop missing values
data.dropna(inplace=True)

# Convert target to binary
data['income'] = data['income'].apply(lambda x: 1 if x == ' >50K' else 0)

# We'll use only numerical features for this basic model
X = data[['age', 'education-num', 'hours-per-week', 'capital-gain', 'capital-loss']]
y = data['income']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple model
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'salary_model.pkl')
print("✅ Model trained and saved as salary_model.pkl")

✅ Model trained and saved as salary_model.pkl


In [15]:
model = joblib.load("salary_model.pkl")