In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn streamlit



Collecting streamlit
  Downloading streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [None]:
# End-to-end pipeline for this cell: load the census income CSV, drop rows with missing
# values, label-encode the text columns, split and scale the features, then fit and
# evaluate a k-nearest-neighbours classifier on the `income` column.

# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

print("✅ Step 1: Libraries imported.")

# Step 2: Load Dataset
# NOTE: despite its name, `url` holds a local file path. Column names are supplied via
# `names=`, so every line of the file is read as data (no header row is consumed).
url = "/content/adult.csv"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                'hours_per_week', 'native_country', 'income']

# skipinitialspace=True removes the blank that follows each comma, so by the time values
# are compared against `na_values` the missing-data marker is '?' (not ' ?'). Using ' ?'
# here never matches: no value becomes NaN and the '?' placeholders would be encoded as
# an ordinary category in Step 4.
df = pd.read_csv(url, names=column_names, na_values='?', skipinitialspace=True)
print("✅ Step 2: Dataset loaded. Shape:", df.shape)

# Step 3: Data Cleaning
print("\nMissing values before dropping:\n", df.isnull().sum())
# Take an explicit copy so the column assignments in Step 4 modify an independent frame
# instead of a possible view of `df` (avoids SettingWithCopyWarning and keeps `df` raw).
df_clean = df.dropna().copy()
print("\n✅ Step 3: Dropped missing values. New shape:", df_clean.shape)

# Step 4: Encode categorical columns
# One LabelEncoder per text column; the fitted encoders are kept in `label_encoders`
# (keyed by column name) so the integer codes can be mapped back to the original labels.
# The target column `income` is text as well, so it is encoded here too.
label_encoders = {}
for col in df_clean.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

print("✅ Step 4: Encoded categorical features.")

# Step 5: Split Features and Target
X = df_clean.drop('income', axis=1)
y = df_clean['income']
print("✅ Step 5: Features and Target variable separated.")

# Step 6: Train-Test Split
# 80/20 split; random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"✅ Step 6: Train-Test Split done. Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Step 7: Feature Scaling
# The scaler is fitted on the training split only and then applied to the test split,
# so no statistics from the test data leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Step 7: Feature Scaling applied.")

# Step 8: Train KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
print("✅ Step 8: KNN model trained.")

# Step 9: Evaluation
# Class labels in the report are the integer codes produced by label_encoders['income'].
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"✅ Step 9: Evaluation complete.\nAccuracy: {accuracy:.4f}\n\nClassification Report:\n{report}")


✅ Step 1: Libraries imported.
✅ Step 2: Dataset loaded. Shape: (32561, 15)

Missing values before dropping:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

✅ Step 3: Dropped missing values. New shape: (32561, 15)
✅ Step 4: Encoded categorical features.
✅ Step 5: Features and Target variable separated.
✅ Step 6: Train-Test Split done. Train shape: (26048, 14), Test shape: (6513, 14)
✅ Step 7: Feature Scaling applied.
✅ Step 8: KNN model trained.
✅ Step 9: Evaluation complete.
Accuracy: 0.8353

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      4942
           1       0.68      0.60      0.64      1571

    accuracy                           0.84     