In [22]:
import pandas as pd
import numpy as np

In [23]:
# Load the raw census data.
# The separator is a regular expression (a comma followed by one whitespace
# character), so the pure-Python parser engine is requested explicitly.
# It is written as a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape, which emits a
# SyntaxWarning on recent Python versions.
df = pd.read_csv('data/census.csv', sep=r',\s', engine='python')

# Display the first few rows of the DataFrame to verify that it loaded correctly
df.head()


Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [24]:
# Replace '?' placeholders with NaN.
# The pattern is anchored to the whole cell and tolerates surrounding
# whitespace: string columns are only stripped in a later cell, so an exact
# match on '?' would let a padded marker such as '? ' slip through and survive
# the subsequent dropna(). Regex replacement only touches string values, so
# numeric columns are left as they are.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)

In [25]:
# Discard, in place, every row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [27]:
def _strip_text(column):
    """Return `column` with surrounding whitespace removed if it has object dtype, else unchanged."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Trim leading and trailing whitespace from every object-dtype column
df = df.apply(_strip_text)

In [30]:
# Persist the cleaned DataFrame as a new CSV file, leaving the row index out
cleaned_csv_path = 'data/cleaned_census_income.csv'
df.to_csv(cleaned_csv_path, index=False)

In [5]:
from sklearn.model_selection import train_test_split
import ml.clean_data
from ml.data import process_data
from ml.model import train_model, inference,compute_model_metrics,process_slices
import logging
import joblib

ModuleNotFoundError: No module named 'ml'

In [None]:
# Log configuration: write INFO-and-above records to logs/log, truncating the
# file on every run (filemode='w').
logging.basicConfig(filename='logs/log', level=logging.INFO, filemode='w')

# Load in the data.
# `import ml.clean_data` binds only the top-level name `ml`, so the submodule
# must be reached through its fully qualified path; a bare `clean_data` raises
# NameError.
data = ml.clean_data.cleaned_data()

# Fixed seed so the train/test partition is identical on every
# Restart Kernel -> Run All, keeping the reported metrics comparable across runs.
RANDOM_STATE = 42

# Optional enhancement, use K-fold cross validation instead of a train-test split.
train, test = train_test_split(data, test_size=0.20, random_state=RANDOM_STATE)
logging.info("Train Test data split done")

# Columns passed to process_data as categorical features.
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# training=True: this call also returns the `encoder` and `lb` objects that the
# later cells pass back in when processing the test split.
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

# Train and save a model.
model = train_model(X_train, y_train)
joblib.dump(model, 'model/trained_model.joblib')



In [6]:
# Process the test data with the process_data function, passing in the
# `encoder` and `lb` produced by the training call (training=False).
X_test, y_test, encoder_test, lb_test = process_data(
    test, categorical_features=cat_features, label="salary",
    training=False, encoder=encoder, lb=lb
)

# Persist the artifacts obtained from the training call rather than whatever
# the training=False call hands back: the saved files are then guaranteed to be
# the same `encoder`/`lb` objects used alongside the model in the slice
# evaluation, independent of what process_data returns in inference mode.
joblib.dump(encoder, 'model/encoder.joblib')
joblib.dump(lb, 'model/lb.joblib')

# Predict on the held-out split and score the predictions.
pred = inference(model, X_test)

precision, recall, fbeta = compute_model_metrics(y_test, pred)

logging.info(f"Precision: {precision}, Recall: {recall}, Fbeta: {fbeta}")


In [7]:
# Performance of the model on slices of the data
slice_metrics = process_slices(
    test,
    model,
    cat_features,
    "education",
    encoder,
    lb,
)

logging.info(f"slice_metrics: {slice_metrics}")