# Table of content

[Step 1: Reading the data set](#Step-1:-Reading-the-data-set)
[Step 2: Data processing](#Step-2:-Data-preprocessing)
    - [2.1 Construction of the Dataframe](###2.1-Construction-of-the-Dataframe)
    - [2.2 Input preprocessing](###2.2-Input-preprocessing)
    - [2.3 Define categorical & numerical columns](###2.3-Define-categorical-&-numerical-columns)
    - [2.4 Preprocessing pipelines](###2.4-Preprocessing-pipelines)
    - [2.5 Model: Logistic Regression in a Pipeline](###2.5-Model:-Logistic-Regression-in-a-Pipeline)
    - [2.6 Train test split](###2.6-Train-test-split)
[Step 3: Model train](#Step-3:-Model-train)
[Step 4: Recommendation system](#Step-4:-Recommendation-system)
[Step 5: Saving the model](#Step-5:-Saving-the-model)

In [46]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings("ignore", category=FutureWarning)

# Step 1: Reading the data set

In [47]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

# Step 2: Data processing

### 2.1 Construction of the Dataframe

In [48]:
target = "y"

df = X.copy()
df[target] = y.iloc[:,0]
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [49]:
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.
48841,35,Self-emp-inc,182148,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,60,United-States,>50K.


### 2.2 Input preprocessing

In [50]:
y = (df[target].str.lower().eq(">50k.")).astype(int)
X = df.drop(columns = [target])
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [51]:
y

0        0
1        0
2        0
3        0
4        0
        ..
48837    0
48838    0
48839    0
48840    0
48841    1
Name: y, Length: 48842, dtype: int64

### 2.3 Define categorical & numerical columns

In [52]:
num_features = ["fnlwgt", "age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
cat_features = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

### 2.4 Preprocessing pipelines

In [53]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipe = Pipeline(
    [
      ("imputer", SimpleImputer(strategy = "median")),
      ("scaler", StandardScaler())
    ]
)

In [54]:
from sklearn.preprocessing import OneHotEncoder

cat_pipe = Pipeline(
    [
      ("imputer", SimpleImputer(strategy = "most_frequent")),
      ("onehot", OneHotEncoder(handle_unknown = "ignore"))
    ]
)

In [55]:
from sklearn.compose import ColumnTransformer

pre = ColumnTransformer(
    [
        ("num", num_pipe, num_features),
        ("cat", cat_pipe, cat_features),
    ]
)

### 2.5 Model: Logistic Regression in a Pipeline

In [None]:
model = Pipeline(
    [
        ("pre", pre), 
        ("clf", LogisticRegression(max_iter = 1000, class_weight = "balanced"))
    ]
)

### 2.6 Train test split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
9507,24,Private,113466,HS-grad,9,Never-married,Sales,Not-in-family,White,Male,0,0,40,United-States
22732,28,Private,192384,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
32906,27,Private,106562,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States
34036,49,Private,248145,HS-grad,9,Married-civ-spouse,Craft-repair,Other-relative,White,Male,0,0,40,Nicaragua
11568,32,Private,313729,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States


# Step 3: Model train

In [58]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


# Step 4: Recommendation system

In [78]:
probs = model.predict_proba(X_test)[:,1]
THRESHOLD = 0.80

pred = (probs >= THRESHOLD).astype(int)
pred.sum()

np.int64(846)

We set the threshold at 80%, this allows us to estimate how many people in the entire dataset are expected to surpass that income.

In [90]:
def income_more_fifty(profile, threshold_top = 0.85, threshold_mid = 0.60):
  df_profile = pd.DataFrame([profile])
  prob = model.predict_proba(df_profile)[0, 1]
  if prob >= threshold_top:
    print("High probability: Income higher than 50K.")
  elif prob >= threshold_mid:
    print("Moderate probability: Maybe income higher than 50k.")
  else:
    print("Low probability: Income no higher than 50K.")

In [91]:
sample = {
    "age": 34,
    "workclass": "Private",
    "fnlwgt": 157249,
    "education": "Bachelors",
    "education-num": 13,
    "marital-status": "Married-civ-spouse",
    "occupation": "Sales",
    "relationship": "Wife",
    "race": "Black",
    "sex": "Female",
    "capital-gain": 0,
    "capital-loss": 1977,
    "hours-per-week": 20,
    "native-country": "Nicaragua"
}

income_more_fifty(sample)

Moderate probability: Maybe income higher than 50k.


In our sample, we used the profile of a married Black female working in Sales from Nicaragua. Our tests showed that the education level is the key factor in determining whether the income exceeds $50K. At levels 13 to 16, there is a moderate probability of earning more than $50K, whereas once the education level reaches 17 or higher, the probability of surpassing $50K per year becomes high.

# Step 5: Saving the model

In [92]:
with open("income_model.pkl", "wb") as f:
    pickle.dump(model, f)