In [24]:
# variables
FEATURES = [
    "MSSubClass",
    "MSZoning",
    "Neighborhood",
    "OverallQual",
    "OverallCond",
    "YearRemodAdd",
    "RoofStyle",
    "MasVnrType",
    "BsmtQual",
    "BsmtExposure",
    "HeatingQC",
    "CentralAir",
    "1stFlrSF",
    "GrLivArea",
    "BsmtFullBath",
    "KitchenQual",
    "Fireplaces",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "GarageCars",
    "PavedDrive",
    "LotFrontage",
    # this one is only to calculate temporal variable:
    "YrSold",
]

# this variable is to calculate the temporal variable,
# can be dropped afterwards
DROP_FEATURES = "YrSold"

# numerical variables with NA in train set
NUMERICAL_VARS_WITH_NA = ["LotFrontage"]

# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA = [
    "MasVnrType",
    "BsmtQual",
    "BsmtExposure",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
]

TEMPORAL_VARS = "YearRemodAdd"

# variables to log transform
NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# categorical variables to encode
CATEGORICAL_VARS = [
    "MSZoning",
    "Neighborhood",
    "RoofStyle",
    "MasVnrType",
    "BsmtQual",
    "BsmtExposure",
    "HeatingQC",
    "CentralAir",
    "KitchenQual",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "PavedDrive",
]

TARGET = "SalePrice"

NUMERICAL_NA_NOT_ALLOWED = [
    feature
    for feature in FEATURES
    if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA
]

CATEGORICAL_NA_NOT_ALLOWED = [
    feature for feature in CATEGORICAL_VARS if feature not in CATEGORICAL_VARS_WITH_NA
]

DATASET_DIR = '/home/eashan/Eashan/Data_Science/STG/NLP/nlp/project_multilingual/french_classifier/custom_pipeline/pipeline/leposte_6.4/le_poste/le_poste'

TRAINED_MODEL_DIR = '/home/eashan/Eashan/Data_Science/STG/NLP/nlp/project_multilingual/french_classifier/custom_pipeline/pipeline/leposte_6.4/le_poste/le_poste'

In [25]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

In [26]:
def load_dataset(*, file_name: str) -> pd.DataFrame:
    _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}")
    return _data


def save_pipeline(*, pipeline_to_persist) -> None:
    """Persist the pipeline."""

    save_file_name = "regression_model.pkl"
    save_path = config.TRAINED_MODEL_DIR / save_file_name
    joblib.dump(pipeline_to_persist, save_path)

    print("saved pipeline")


def load_pipeline(*, file_name: str) -> Pipeline:
    """Load a persisted pipeline."""

    file_path = config.TRAINED_MODEL_DIR / file_name
    saved_pipeline = joblib.load(filename=file_path)
    return saved_pipeline


In [27]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Categorical data missing value imputer."""

    def __init__(self, variables=None) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CategoricalImputer":
        """Fit statement to accomodate the sklearn pipeline."""

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()
        for feature in self.variables:
            X[feature] = X[feature].fillna("Missing")

        return X

In [28]:
 _data = pd.read_csv('/home/eashan/Eashan/Data_Science/STG/NLP/nlp/project_multilingual/french_classifier/custom_pipeline/pipeline/leposte_6.4/le_poste/le_poste/houseprice.csv')

In [29]:
_data.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [40]:
# _data.shape, len(_data.columns), _data.columns, len(FEATURES)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
        _data[FEATURES], _data[TARGET], test_size=0.1, random_state=0)

In [48]:
type(X_train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [49]:
X_train.head(3)

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,OverallCond,YearRemodAdd,RoofStyle,MasVnrType,BsmtQual,BsmtExposure,...,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,LotFrontage,YrSold
930,20,RL,Timber,8,5,2007,Gable,,Gd,Av,...,0,Gd,0,,Attchd,Fin,3,Y,73.0,2009
656,20,RL,NAmes,5,7,2006,Gable,BrkFace,TA,No,...,1,Gd,0,,Attchd,RFn,1,Y,72.0,2008
45,120,RL,NridgHt,9,5,2005,Hip,BrkFace,Ex,No,...,1,Ex,1,Gd,Attchd,RFn,2,Y,61.0,2010


In [64]:
obj = CategoricalImputer(FEATURES)

In [65]:
obj.fit(X_train.iloc[:2])

CategoricalImputer(variables=['MSSubClass', 'MSZoning', 'Neighborhood',
                              'OverallQual', 'OverallCond', 'YearRemodAdd',
                              'RoofStyle', 'MasVnrType', 'BsmtQual',
                              'BsmtExposure', 'HeatingQC', 'CentralAir',
                              '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
                              'KitchenQual', 'Fireplaces', 'FireplaceQu',
                              'GarageType', 'GarageFinish', 'GarageCars',
                              'PavedDrive', 'LotFrontage', 'YrSold'])

In [66]:
X = obj.transform(X_train.iloc[:2])

In [67]:
type(X)

pandas.core.frame.DataFrame

In [68]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,OverallCond,YearRemodAdd,RoofStyle,MasVnrType,BsmtQual,BsmtExposure,...,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,LotFrontage,YrSold
930,20,RL,Timber,8,5,2007,Gable,,Gd,Av,...,0,Gd,0,Missing,Attchd,Fin,3,Y,73.0,2009
656,20,RL,NAmes,5,7,2006,Gable,BrkFace,TA,No,...,1,Gd,0,Missing,Attchd,RFn,1,Y,72.0,2008


In [69]:
single_test_json = X_train[0:1].to_json(orient='records')

In [70]:
single_test_json

'[{"MSSubClass":20,"MSZoning":"RL","Neighborhood":"Timber","OverallQual":8,"OverallCond":5,"YearRemodAdd":2007,"RoofStyle":"Gable","MasVnrType":"None","BsmtQual":"Gd","BsmtExposure":"Av","HeatingQC":"Ex","CentralAir":"Y","1stFlrSF":1466,"GrLivArea":1466,"BsmtFullBath":0,"KitchenQual":"Gd","Fireplaces":0,"FireplaceQu":null,"GarageType":"Attchd","GarageFinish":"Fin","GarageCars":3,"PavedDrive":"Y","LotFrontage":73.0,"YrSold":2009}]'

In [74]:
dat = data = pd.read_json(single_test_json)

In [76]:
type(dat), dat.shape

(pandas.core.frame.DataFrame, (1, 24))

In [77]:
dat.head()

Unnamed: 0,1stFlrSF,BsmtExposure,BsmtFullBath,BsmtQual,CentralAir,FireplaceQu,Fireplaces,GarageCars,GarageFinish,GarageType,...,MSSubClass,MSZoning,MasVnrType,Neighborhood,OverallCond,OverallQual,PavedDrive,RoofStyle,YearRemodAdd,YrSold
0,1466,Av,0,Gd,Y,,0,3,Fin,Attchd,...,20,RL,,Timber,5,8,Y,Gable,2007,2009


In [71]:
obj.fit(single_test_json)

CategoricalImputer(variables=['MSSubClass', 'MSZoning', 'Neighborhood',
                              'OverallQual', 'OverallCond', 'YearRemodAdd',
                              'RoofStyle', 'MasVnrType', 'BsmtQual',
                              'BsmtExposure', 'HeatingQC', 'CentralAir',
                              '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
                              'KitchenQual', 'Fireplaces', 'FireplaceQu',
                              'GarageType', 'GarageFinish', 'GarageCars',
                              'PavedDrive', 'LotFrontage', 'YrSold'])

In [72]:
x = obj.transform(single_test_json)

AttributeError: 'str' object has no attribute 'copy'