In [25]:
# Task 1: import dataset
from ucimlrepo import fetch_ucirepo 

# imports
import numpy as np
import pandas as pd

# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features

In [24]:
# Task 2: quick look at dataset
# head
print(X.head()) 
print("-----------------------------------------------------------------")

# info
print(X.info()) 
print("-----------------------------------------------------------------") 

# describe
print(X.describe())
print("-----------------------------------------------------------------") 

# shape
print("Shape: ", X.shape)

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country  
0          2174             0              40  United-States  
1             0             0              13  United-St

In [129]:
# T3: missing values
# first replace the nan values with ???
# then replace the ? values with nan to get correct count
X = X.replace(np.nan,'???')
X = X.replace('?', np.nan)
X.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
dtype: int64

In [130]:
# T4: Replace missing values
X = X.replace('???', np.nan)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  47985 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


In [124]:
# T5: Create/apply preprocessing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


num_cols = X.select_dtypes(include='number').columns.to_list()
cat_cols = X.select_dtypes(exclude='number').columns.to_list()

#num_cols.remove("G3")

num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse_output=False))
preprocessing = ColumnTransformer([('num', num_pipeline, num_cols),
                                   ('cat', cat_pipeline, cat_cols)], remainder='passthrough')

X_prepared = preprocessing.fit_transform(X)
feature_names=preprocessing.get_feature_names_out()
X_prepared = pd.DataFrame(data=X_prepared, columns=feature_names)
X_prepared

preprocessing

In [128]:
# Task 5 contd: show the prepared shape
X_prepared.shape

(48842, 108)

In [106]:
# Task 6: check the target
#### y contains the target values for income, not X
y['income'].value_counts()

<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: income, dtype: int64

In [8]:
# Task 7: Clean up the target data

### remove the period from <=50K.
y['income'] = y['income'].replace('<=50K.', '<=50K')
### remoce the period from >50K.
y['income'] = y['income'].replace('>50K.', '>50K')
### output the new correct value counts
print("income")
y['income'].value_counts()

income
<=50K    37155
>50K     11687
Name: income, dtype: int64

In [134]:
# Task 8: split the data 80/20
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(39073, 14) (39073, 108) (9769, 14) (9769, 108)


In [7]:
# Task 9: Train svm model

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model_svm = SVC(kernel='poly', C=0.1, gamma=1)
model_svm.fit(X_train.iloc, y_train.iloc.values_ravel())