### Config

In [1]:
from pathlib import Path

In [2]:
# Path specifications
PROJECT_NAME = 'costa_rica_poverty'
# Absolute path of the parent of the current working directory, so every path
# below depends on where the notebook kernel was started.
PACKAGE_ROOT = Path().resolve().parent
# The directories are built with f-strings and are therefore plain `str`
# values, not `Path` objects.
INPUT_DIR = f"{PACKAGE_ROOT}/{PROJECT_NAME}/packages/model/model/input/data"
ASSET_DIR = f"{PACKAGE_ROOT}/{PROJECT_NAME}/packages/model/model/assets"
OUTPUT_DIR = f"{PACKAGE_ROOT}/{PROJECT_NAME}/packages/model/model/output"

# Data file specifications
# File names only; they are joined onto ASSET_DIR / INPUT_DIR when the data is read.
CODEBOOK = 'codebook.csv'
DATA_FILE = 'train.csv'

### Data cleaning: test drafting

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the codebook from the asset directory and the data file from the input directory.
codebook = pd.read_csv(f"{ASSET_DIR}/{CODEBOOK}")
df = pd.read_csv(f"{INPUT_DIR}/{DATA_FILE}")
# Print (rows, columns) of the loaded data as a quick sanity check.
print(df.shape)

(9557, 143)


In [5]:
################## Head of households

#### TEST
def test_head_of_household_exists(raw_data):
    # Given
    hh_head = df.groupby('idhogar')['parentesco1'].sum().reset_index()
    
    # When
    hh_head_none = hh_head[hh_head['parentesco1']!=1]
    
    # Then
    assert hh_head_none.shape[0]==0
    
#### FIX
class HeadOfHouseholdExist(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that removes households without exactly one head of household.

    A household is the set of rows sharing an `idhogar` value; it is kept only
    when its `parentesco1` values sum to 1.

    NOTE(review): `BaseEstimator`, `TransformerMixin` and `List` must be imported
    before this cell runs - the recorded cell output is a NameError for `BaseEstimator`.
    '''

    def __init__(self, variables: List[str]) -> None:
        '''
        Specify variable input as list of strings representing column names of a pd.DataFrame.

        The list is stored unchanged under the same name so that scikit-learn's
        `get_params` / `clone` can read the constructor argument back.
        `transform` itself works on the fixed `idhogar` and `parentesco1` columns.
        '''
        self.variables = variables

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        '''
        Required by the scikit-learn transformer interface; learns nothing.

        Returns:
            self, unchanged.
        '''
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None):
        '''
        Return a copy of `X` without the households that lack a single head.

        Args:
            X: pd.DataFrame with `idhogar` and `parentesco1` columns.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            A new pd.DataFrame containing only the rows of `X` whose household
            has `parentesco1` values summing to exactly 1. `X` is not modified.
        '''
        # Count heads on the frame handed in by the pipeline, not on a notebook
        # global, so the step behaves the same on any data set.
        heads_per_household = X.groupby('idhogar')['parentesco1'].sum()
        invalid_households = heads_per_household.index[heads_per_household != 1]

        keep_row = ~X['idhogar'].isin(invalid_households)
        return X[keep_row].copy()

#### TEST
def test_household_target_match(raw_data):
    # Given
    householdTargetsMatching = (df.groupby('idhogar')['Target'].nunique()==1).reset_index()
    
    # When
    householdTargetMisMatch = householdTargetsMatching[householdTargetsMatching['Target']==False]
    
    # Then
    assert householdTargetMisMatch.shape[0]==0

    
#### FIX
class HouseholdTargetsMatch(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that makes `Target` consistent within each household.

    For every household (rows sharing `idhogar`) that does not have exactly one
    distinct `Target`, all members receive the `Target` of the row flagged with
    `parentesco1 == 1`.

    NOTE(review): `BaseEstimator`, `TransformerMixin` and `List` must be imported
    before this cell runs - the recorded cell output is a NameError for `BaseEstimator`.
    '''

    def __init__(self, variables: List[str]) -> None:
        '''
        Specify variable input as list of strings representing column names of a pd.DataFrame.

        The list is stored unchanged under the same name so that scikit-learn's
        `get_params` / `clone` can read the constructor argument back.
        `transform` itself works on the fixed `idhogar`, `parentesco1` and
        `Target` columns.
        '''
        self.variables = variables

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        '''
        Required by the scikit-learn transformer interface; learns nothing.

        Returns:
            self, unchanged.
        '''
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None):
        '''
        Return a copy of `X` in which mismatched households share the head's target.

        Args:
            X: pd.DataFrame with `idhogar`, `parentesco1` and `Target` columns.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            A modified copy of `X`. Households with no row flagged as head, or
            whose head has a missing `Target`, are left unchanged because there
            is no reference value to copy. If several rows are flagged as head,
            the first one is used.
        '''
        X = X.copy()

        # Identify household target mismatches
        distinct_targets = X.groupby('idhogar')['Target'].nunique()
        mismatched_households = distinct_targets.index[distinct_targets != 1]

        # Set all member targets to the head-of-household target
        for household in mismatched_households:
            # The mask is built from X itself so that it always aligns with the
            # frame being modified.
            in_household = X['idhogar'] == household
            head_targets = X.loc[in_household & (X['parentesco1'] == 1), 'Target']
            if head_targets.empty or pd.isna(head_targets.iloc[0]):
                continue
            X.loc[in_household, 'Target'] = int(head_targets.iloc[0])

        return X


NameError: name 'BaseEstimator' is not defined

In [None]:
#### FIX
class BinaryEncoder(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that turns "yes"/"no" entries into 1/0 and casts to float.

    The columns to encode are the ones passed as `variables` (for example
    `['edjefa', 'edjefe', 'dependency']`).

    NOTE(review): `BaseEstimator`, `TransformerMixin` and `List` must be imported
    before this cell runs; they are not imported in the visible cells.
    '''

    def __init__(self, variables: List[str]) -> None:
        '''
        Specify variable input as list of strings representing column names of a pd.DataFrame.

        The list is stored unchanged under the same name so that scikit-learn's
        `get_params` / `clone` can read the constructor argument back.
        '''
        self.variables = variables

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        '''
        Required by the scikit-learn transformer interface; learns nothing.

        Returns:
            self, unchanged.
        '''
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None):
        '''
        Return a copy of `X` with the configured columns encoded as floats.

        Args:
            X: pd.DataFrame containing every column named in `variables`.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            A modified copy of `X` in which, for each configured column, "yes"
            is replaced by 1, "no" by 0, and the column is cast to float.

        Raises:
            KeyError: if a configured column is missing from `X`.
            ValueError: if a column still holds a value that cannot be cast to
                float after the replacement.
        '''
        X = X.copy()

        string_to_binary = {"yes": 1, "no": 0}

        # Encode the columns the step was configured with instead of a
        # hard-coded list, so the constructor argument is actually honoured.
        for column in self.variables:
            X[column] = X[column].replace(string_to_binary).astype(float)

        return X

In [18]:
# Display the `male` column of the loaded data for a quick visual check.
df['male']

0       1
1       1
2       0
3       1
4       0
       ..
9552    1
9553    0
9554    0
9555    0
9556    1
Name: male, Length: 9557, dtype: int64

In [16]:
# Display only the columns that pandas stores with object dtype.
df.select_dtypes(include=[object])

Unnamed: 0,Id,idhogar
0,ID_279628684,21eb7fcc1
1,ID_f29eb3ddd,0e5d7a658
2,ID_68de51c94,2c7317ea8
3,ID_d671db89c,2b58d945f
4,ID_d56d6f5f5,2b58d945f
...,...,...
9552,ID_d45ae367d,d6c086aa3
9553,ID_c94744e07,d6c086aa3
9554,ID_85fc658f8,d6c086aa3
9555,ID_ced540c61,d6c086aa3


In [None]:
class MissingValuesEvaluator:
    '''
    Empty placeholder class: defines no attributes or methods yet.

    NOTE(review): the name suggests a missing-values check; implementation pending.
    '''

class VariableDistributionEvaluator:
    '''
    Empty placeholder class: defines no attributes or methods yet.

    NOTE(review): the name suggests a variable-distribution check; implementation pending.
    '''