In [None]:
from sklearn.svm import SVC

import numpy as np

import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn import preprocessing

import warnings

# Install a blanket "ignore" filter: every warning raised after this point
# is suppressed for the rest of the session.
warnings.filterwarnings(action="ignore")

# Load the income dataset from a CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='income_evaluation.csv')

# Display the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Strip surrounding whitespace from the column names so the 'income' label
# can be addressed by its plain name below. Re-running this line is harmless.
df.columns = df.columns.str.strip()

# Hold out 20% of the rows for the final test score; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('income', axis=1),   # features: every column except the label
    df['income'],                # label
    test_size=0.2,
    random_state=101
)

# Route columns by dtype: object (string) columns are one-hot encoded, all
# remaining columns are standardized.
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category not seen during fit is encoded
        # as an all-zero row instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)

# Preprocessing lives inside the pipeline, so the encoder and scaler are
# re-fit on the training part of every cross-validation fold.
svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=101))
])

# Mean of the 5-fold cross-validation scores computed on the training split.
accuracies = cross_val_score(svc, X_train, y_train, cv=5)
print("Train Score:", np.mean(accuracies))

# Fit on the whole training split, then score once on the held-out rows.
svc.fit(X_train, y_train)

print("Test Score:", svc.score(X_test, y_test))


Train Score: 0.8558812980191896
Test Score: 0.8582834331337326


In [None]:
from sklearn.model_selection import GridSearchCV

# Values shared by the per-kernel sub-grids below.
C_VALUES = [0.01, 0.1, 1, 10]
GAMMA_VALUES = [0.01, 1]

# One sub-grid per kernel family, so that only hyper-parameters the kernel
# actually uses are crossed:
#   * linear       -> C only (gamma and degree have no effect on it)
#   * poly         -> C, degree and gamma
#   * rbf, sigmoid -> C and gamma (degree is ignored outside 'poly')
# A single flat grid would cross everything with everything and fit 128
# candidates, many of them identical models; this covers the same distinct
# models with 52 candidates.
grid = [
    {
        'classifier__kernel': ["linear"],
        'classifier__C': C_VALUES,
    },
    {
        'classifier__kernel': ["poly"],
        'classifier__C': C_VALUES,
        'classifier__degree': [1, 3, 5, 7],
        'classifier__gamma': GAMMA_VALUES,
    },
    {
        'classifier__kernel': ["rbf", "sigmoid"],
        'classifier__C': C_VALUES,
        'classifier__gamma': GAMMA_VALUES,
    },
]

# 5-fold cross-validated search over the whole pipeline, using all CPU cores.
svm_cv = GridSearchCV(svc, grid, cv=5, n_jobs=-1)

svm_cv.fit(X_train, y_train)

# best_params_ only lists the keys of the sub-grid the winner came from.
print("Best Parameters:", svm_cv.best_params_)
print("Train Score (CV mean):", svm_cv.best_score_)
print("Test Score:", svm_cv.score(X_test, y_test))
