In [None]:
import joblib
import logging
import argparse
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from starter.ml import process_data, inference, compute_model_metrics, train_model

In [None]:
import json
import pandas as pd
from pydantic import BaseModel, Field

In [35]:
class Data(BaseModel):
    # Schema for one census record. Attributes use underscores; the
    # hyphenated names used as DataFrame columns are exposed as aliases.
    age: int
    workclass: str
    fnlgt: int
    education: str
    education_num: int = Field(..., alias="education-num")
    marital_status: str = Field(..., alias="marital-status")
    occupation: str
    relationship: str
    race: str
    sex: str
    capital_gain: int = Field(..., alias="capital-gain")
    capital_loss: int = Field(..., alias="capital-loss")
    hours_per_week: int = Field(..., alias="hours-per-week")
    native_country: str = Field(..., alias="native-country")

    class Config:
        # Allow construction with the underscore field names as well as
        # with the hyphenated aliases.
        allow_population_by_field_name = True

In [None]:
# Example record, populated through the underscore field names.
data = Data(
    age=33,
    workclass="Private",
    fnlgt=185908,
    education="Bachelors",
    education_num=13,
    marital_status="Married-civ-spouse",
    occupation="Exec-managerial",
    relationship="Husband",
    race="Black",
    sex="Male",
    capital_gain=0,
    capital_loss=0,
    hours_per_week=55,
    native_country="United-States",
)

In [None]:
# Second example record; running this cell rebinds `data`.
data = Data(
    age=45,
    workclass="State-gov",
    fnlgt=50567,
    education="HS-grad",
    education_num=9,
    marital_status="Married-civ-spouse",
    occupation="Exec-managerial",
    relationship="Wife",
    race="White",
    sex="Female",
    capital_gain=0,
    capital_loss=0,
    hours_per_week=40,
    native_country="United-States",
)

In [None]:
# Build a one-row DataFrame from the record. by_alias=True makes the keys the
# hyphenated aliases declared on Data (e.g. "education-num"), so the columns
# use the same names as the entries of `cat_features`.
df = pd.DataFrame(data.dict(by_alias=True), index=[0])

In [None]:
# Run process_data on the single-row frame with training=False, handing it the
# encoder and label binarizer stored in `pipe`.
# NOTE(review): `cat_features` and `pipe` are only assigned in cells further
# down the file, so this cell depends on those having been executed first.
X, y, encoder, lb = process_data(
    df,
    categorical_features=cat_features,
    encoder=pipe["encoder"],
    lb=pipe["lb"],
    training=False,
)

In [None]:
X

In [None]:
def inference(model, X, categorical_features=None):
    """
    Preprocess raw records and return the model's predictions.

    This notebook-local definition replaces the ``inference`` name imported
    from ``starter.ml`` in the first cell.

    Inputs
    ------
    model : dict
        Bundle with three entries: ``'encoder'`` and ``'lb'`` (both handed to
        ``process_data``) and ``'predictor'`` (the object whose ``predict``
        method is called).
    X : pd.DataFrame
        Raw, un-encoded records (the notebook calls this with a DataFrame).
        They are passed through ``process_data`` with ``training=False``
        before prediction.
    categorical_features : list of str, optional
        Names of the categorical columns. When omitted, the notebook-level
        ``cat_features`` list is used, which must then already be defined.
    Returns
    -------
    preds : np.array
        Whatever ``model['predictor'].predict`` returns for the processed
        records.
    """
    if categorical_features is None:
        # Fall back to the notebook global so existing two-argument calls
        # keep working unchanged.
        categorical_features = cat_features

    features, _labels, _encoder, _lb = process_data(
        X,
        categorical_features=categorical_features,
        encoder=model['encoder'],
        lb=model['lb'],
        training=False
    )

    return model['predictor'].predict(features)

In [None]:
# Compare the prediction for the record in `df` against 0.
inference(pipe, df) == 0

---

In [27]:
import joblib
import logging
import argparse
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from starter import process_data, inference, compute_model_metrics, train_model

# Load the census data and hold out 30% of the rows as a test split.
data = pd.read_csv("data/census.csv")
train, test = train_test_split(data, random_state=42, test_size=0.3)

# Columns handed to process_data as categorical features.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Process the training split in training mode; the encoder and label
# binarizer it returns are reused for the test split below.
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

# Process the test split with the encoder / label binarizer obtained above.
X_test, y_test, _encoder, _lb = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    encoder=encoder,
    lb=lb,
    training=False,
)

In [33]:
X.shape[1]

108

In [None]:
# Train a model on the processed training split. This cell contains no
# explicit save/dump call.
# NOTE(review): the third positional argument (42) is presumably a random
# seed -- confirm against train_model's signature.
model = train_model(X_train, y_train, 42)

In [None]:
model

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Bundle the encoder, the label binarizer and the trained model in one dict so
# they can be passed around together.
pipe = dict(encoder=encoder, lb=lb, predictor=model)

In [None]:
pipe['lb']

In [None]:
# `pipe` is a plain dict and has no `.predict` method, so preprocess `df` with
# the stored encoder / label binarizer and call the predictor explicitly.
pipe['predictor'].predict(
    process_data(
        df,
        categorical_features=cat_features,
        encoder=pipe['encoder'],
        lb=pipe['lb'],
        training=False,
    )[0]  # first element of the returned tuple is the processed feature matrix
)

In [None]:
# Apply the encoder directly to the categorical columns of `df`; `.values`
# passes them as a raw array instead of a DataFrame.
X_categorical = encoder.transform(df[cat_features].values)

In [None]:
X_categorical

In [1]:
import joblib
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel, Field

from starter import inference, process_data

In [2]:
# Load the persisted artifacts from the local model/ directory.
# joblib.load unpickles its input, so only point it at trusted files.
# NOTE(review): a later cell indexes `encoder` with the keys 'features' and
# 'encoder', so encoder.pkl presumably holds a dict rather than a bare
# encoder -- verify.
encoder = joblib.load("model/encoder.pkl")
model = joblib.load("model/model.pkl")
lb = joblib.load("model/lb.pkl")

In [6]:
class Data(BaseModel):
    # Schema for one census record. Attributes use underscores; the
    # hyphenated names used as DataFrame columns are exposed as aliases.
    age: int
    workclass: str
    fnlgt: int
    education: str
    education_num: int = Field(..., alias="education-num")
    marital_status: str = Field(..., alias="marital-status")
    occupation: str
    relationship: str
    race: str
    sex: str
    capital_gain: int = Field(..., alias="capital-gain")
    capital_loss: int = Field(..., alias="capital-loss")
    hours_per_week: int = Field(..., alias="hours-per-week")
    native_country: str = Field(..., alias="native-country")

    class Config:
        # Allow construction with the underscore field names as well as
        # with the hyphenated aliases.
        allow_population_by_field_name = True

In [36]:
# Two example records, populated through the underscore field names.
# NOTE(review): the names suggest the expected prediction for each record --
# confirm against the model output.
data_true = Data(
    age=33,
    workclass="Private",
    fnlgt=185908,
    education="Bachelors",
    education_num=13,
    marital_status="Married-civ-spouse",
    occupation="Exec-managerial",
    relationship="Husband",
    race="Black",
    sex="Male",
    capital_gain=0,
    capital_loss=0,
    hours_per_week=55,
    native_country="United-States",
)

data_false = Data(
    age=45,
    workclass="State-gov",
    fnlgt=50567,
    education="HS-grad",
    education_num=9,
    marital_status="Married-civ-spouse",
    occupation="Exec-managerial",
    relationship="Wife",
    race="White",
    sex="Female",
    capital_gain=0,
    capital_loss=0,
    hours_per_week=40,
    native_country="United-States",
)

In [37]:
# One-row DataFrame for `data_true`; by_alias=True makes the columns the
# hyphenated aliases declared on Data (e.g. "education-num").
df = pd.DataFrame(data_true.dict(by_alias=True), index=[0])

In [9]:
# Processing data: the loaded `encoder` object is indexed for both the list of
# categorical feature names and the encoder itself; training=False.
X, _y, _encoder, _lb = process_data(
    df,
    categorical_features=encoder["features"],
    encoder=encoder["encoder"],
    lb=lb,
    training=False,
)

In [10]:
# Predict for the processed row with the loaded model; in this section
# `inference` is the function imported from `starter`.
inference(model, X)

array([1])

In [42]:
class Data(BaseModel):
    # Schema for one census record including its `salary` label. Attributes
    # use underscores; the hyphenated column names are exposed as aliases.
    age: int
    workclass: str
    fnlgt: int
    education: str
    education_num: int = Field(..., alias="education-num")
    marital_status: str = Field(..., alias="marital-status")
    occupation: str
    relationship: str
    race: str
    sex: str
    capital_gain: int = Field(..., alias="capital-gain")
    capital_loss: int = Field(..., alias="capital-loss")
    hours_per_week: int = Field(..., alias="hours-per-week")
    native_country: str = Field(..., alias="native-country")
    salary: str

    class Config:
        # Allow construction with the underscore field names as well as
        # with the hyphenated aliases.
        allow_population_by_field_name = True


# Labelled example record (includes `salary`), then the one-row DataFrame
# built from it with the hyphenated aliases as column names.
data = Data(
    age=45,
    workclass="State-gov",
    fnlgt=50567,
    education="HS-grad",
    education_num=9,
    marital_status="Married-civ-spouse",
    occupation="Exec-managerial",
    relationship="Wife",
    race="White",
    sex="Female",
    capital_gain=0,
    capital_loss=0,
    hours_per_week=40,
    native_country="United-States",
    salary="<=50K",
)

df = pd.DataFrame(data.dict(by_alias=True), index=[0])

In [43]:
# Run process_data in training mode on the single labelled row; this rebinds
# `encoder` and `lb` to the objects it returns.
# NOTE(review): `cat_features` is not assigned in any cell of this section, so
# it must already exist in the kernel.
X_train, y_train, encoder, lb = process_data(
    df, categorical_features=cat_features, label="salary", training=True
)

In [49]:
X_train.shape

(1, 14)

In [53]:
y_train.shape == (1,)

True

In [55]:
# OneHotEncoder was never imported in this notebook, which made this cell
# raise NameError; import it next to its only use.
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# keep `sparse=False` only while the installed version still accepts it.
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [63]:
import sklearn
from sklearn.preprocessing import LabelBinarizer

# Check the type through the public import path with isinstance, rather than
# comparing type() against the private `sklearn.preprocessing._label` module.
isinstance(lb, LabelBinarizer)

True

In [65]:
encoder