In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer, OneHotEncoder 
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
np.random.seed(0)


In [2]:
df=pd.read_csv('income_evaluation.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.isnull().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [5]:
df=df.replace(' ?', np.nan).dropna()
df.rename(columns={' workclass':'workclass',
 ' fnlwgt': 'fnlwgt',' education':'education',
  ' education-num':'education-num',
  ' marital-status':'marital-status',
  ' occupation':  'occupation', 
  ' relationship':'relationship', 
  ' race':'race', 
  ' sex':'sex',
  'capital-gain': 'capital gain', 
  ' capital-loss': 'capital loss', 
  ' native-country': 'country',
  ' hours-per-week': 'hours per week',
  ' marital-status': 'marital',
   " income": 'income'}, inplace=True)
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital',
       'occupation', 'relationship', 'race', 'sex', ' capital-gain',
       'capital loss', 'hours per week', 'country', 'income'],
      dtype='object')

In [6]:
df["target"] = np.where(df["income"] == ' <=50K', 0, 1)
df.drop("income",axis=1, inplace=True)

In [7]:
X = df.iloc[:,:-1]
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape

(21113, 14)

In [8]:
#df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [9]:
X.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
 capital-gain      int64
capital loss       int64
hours per week     int64
country           object
dtype: object

In [10]:
categorical = list(X.select_dtypes('object').columns)
print(f"Categorical columns are: {categorical}")
numerical = list(X.select_dtypes('number').columns)
print(f"Numerical columns are: {numerical}")

Categorical columns are: ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'country']
Numerical columns are: ['age', 'fnlwgt', 'education-num', ' capital-gain', 'capital loss', 'hours per week']


In [11]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categoric_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('numeric_preprocessor', numeric_transformer, numerical), 
                                              ('categorical_preprocessor', categoric_transformer, categorical)])

In [13]:
pipeline_log = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('LogRegressor', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000))])

In [14]:
log_model = pipeline_log.fit(X_train, y_train)
prediction_log = log_model.predict(X_test)
cl_report_log = classification_report(y_test, prediction_log)
print(cl_report_log)

              precision    recall  f1-score   support

           0       0.87      0.92      0.90      6764
           1       0.73      0.60      0.66      2285

    accuracy                           0.84      9049
   macro avg       0.80      0.76      0.78      9049
weighted avg       0.84      0.84      0.84      9049



In [17]:
pipeline_random = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('ForestClassifier', RandomForestClassifier())])
random_model = pipeline_random.fit(X_train, y_train)
prediction_random = random_model.predict(X_test)
cl_report_random = classification_report(y_test, prediction_random)
print(cl_report_random)

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6764
           1       0.73      0.62      0.67      2285

    accuracy                           0.84      9049
   macro avg       0.80      0.77      0.78      9049
weighted avg       0.84      0.84      0.84      9049



In [None]:
# numerical_transformer = Pipeline(
#     steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
# )
# categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", numerical_transformer, numerical),
#         ("cat", categorical_transformer, categorical),
#         ("dummy_col", OneHotEncoder(categories=[['Male', 'Female'],
#                                            ['Yes', 'No'],
#                                             ['0','1', '2','3+'],
#                                             ['Graduate', 'Not Graduate'],
#                                             ['No', 'Yes'],
#                                             ['Semiurban', 'Urban', 'Rural']]), [0,1,2,3,4,10]),
#       ("norm", Normalizer(norm='l1'), [5,6,7,8,9])
#     ]
# )



In [None]:
# clf = Pipeline(
#     steps=[('numerical_transformer', numerical_transformer),("preprocessor", preprocessor), ("classifier", LogisticRegression(random_state=42, max_iter=1000))])

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# X_train_trans = colTrans.fit_transform(X_train)

# clf.fit(X_train, y_train)
# clf.fit(X_train,y_train)
# clf.score(X_test,y_test)
# print("model score: %.3f" % clf.score(X_test, y_test))


ValueError: invalid literal for int() with base 10: 'Male'