In [1]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import yaml
import datetime
import os

In [2]:
# Project root and the YAML rule file (relative to the root).
# NOTE(review): base_path is a hard-coded absolute path on one user's machine;
# the notebook cannot run elsewhere without editing it.
base_path = r'C:/Users/aiman/Desktop/gh_konsistensi/'
yaml_path = 'ref/stb_config_all_2021.yml'

# File paths
dataset_path = base_path + 'data/stb/dsB092021STB.xlsx'
# Reference (lookup) tables, given relative to base_path.
reference_files = ['ref/oku.csv',
                   'ref/kewarganegaraan.csv',
                   'ref/kumpulan_etnik.csv', 
                   'ref/persekolahan.csv',
                   'ref/pendidikan_rasmi.csv',
                   'ref/pendidikan_rasmi_tertinggi_2022.csv', 
                   'ref/sijil_tertinggi.csv',
                   'ref/sijil_tertinggi_2022.csv',
                   'ref/status_code.csv', 
                   'ref/msic_code_detail_01.csv',
                   'ref/masco_code.csv',
                   'ref/negara_code.csv', 
                   'ref/institusi_pengajian.csv',
                   'ref/bidang_pengajian.csv']

# The dataset comes first, followed by every reference file as an absolute path.
file_paths = [dataset_path] + [base_path + file for file in reference_files]
yaml_file = base_path + yaml_path

In [3]:
def read_files(*file_paths):
    """Load each file into a DataFrame and bind it to a predefined global name.

    The file name (last path component) selects both the reader (by extension)
    and the global variable name the resulting DataFrame is stored under, e.g.
    'oku.csv' -> ``df_oku`` and 'dsB092021STB.xlsx' -> ``df``.

    Parameters:
        *file_paths (str): Paths of the files to load. Both '/' and '\\' are
            accepted as path separators.

    Returns:
        None. The DataFrames are written into this module's globals.

    Raises:
        KeyError: If a file name has no predefined DataFrame name.
    """
    # Local import: ntpath.basename splits on both '/' and '\\' on every OS,
    # whereas the previous split('/') broke on Windows-style paths.
    import ntpath

    def read_csv(file_path):
        return pd.read_csv(file_path, encoding='unicode_escape', low_memory=False)

    def read_excel(file_path):
        workbook = load_workbook(filename=file_path)
        sheet_name = workbook.sheetnames[0]  # Only the first sheet is read
        data = list(workbook[sheet_name].values)
        if not data:
            # Empty sheet: there is no header row to take column names from.
            return pd.DataFrame()
        # First row is the header, the remaining rows are the records.
        return pd.DataFrame(data[1:], columns=data[0])

    file_readers = {
        'csv': read_csv,
        'xlsx': read_excel,
        # NOTE(review): load_workbook is documented for .xlsx/.xlsm; a legacy
        # .xls file will most likely be rejected by it - confirm before relying on this.
        'xls': read_excel
    }

    predefined_names = {
        'dsB092021STB.xlsx': 'df',
        'oku.csv': 'df_oku',
        'kewarganegaraan.csv': 'df_kw',
        'kumpulan_etnik.csv': 'df_ket',
        'persekolahan.csv': 'df_persk',
        'pendidikan_rasmi.csv': 'df_pend',
        'pendidikan_rasmi_tertinggi_2022.csv': 'df_pend_22',
        'sijil_tertinggi.csv': 'df_sijil',
        'sijil_tertinggi_2022.csv': 'df_sijil_22',
        'status_code.csv': 'df_status',
        'msic_code_detail_01.csv': 'df_msic',
        'masco_code.csv': 'df_masco',
        'negara_code.csv': 'df_ngra',
        'institusi_pengajian.csv': 'df_ip',
        'bidang_pengajian.csv': 'df_fs'
    }

    for file_path in file_paths:
        filename = ntpath.basename(file_path)
        if filename not in predefined_names:
            raise KeyError(
                f"No DataFrame name is predefined for file '{filename}' (path: {file_path})"
            )
        df_name = predefined_names[filename]
        file_format = filename.split('.')[-1]

        # Publish the DataFrame under its predefined name as a module global
        globals()[df_name] = file_readers[file_format](file_path)

# Call the function
read_files(*file_paths)

In [4]:
# Read the YAML file
# NOTE(review): if the config holds only plain data, yaml.safe_load would be the
# more restrictive choice than FullLoader - confirm against the config contents.
with open(yaml_file, 'r') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)

# Extract conditions from the YAML data
persekolahan = config['persekolahan']

# Create an empty dictionary to store the merged values
merged_conditions = {}

# Copy every rule into merged_conditions; an all-digit string stored under the
# key 'U' is converted to int on the way, every other value is copied unchanged.
# NOTE(review): merged_conditions is not what validate_condition_06 receives
# later in this notebook (it is called with `persekolahan`) - confirm it is still needed.
for key, conditions in persekolahan.items():
    merged_conditions[key] = {}
    for condition_key, condition_value in conditions.items():
        if condition_key == 'U':
            if isinstance(condition_value, str) and condition_value.isdigit():
                condition_value = int(condition_value)
        merged_conditions[key][condition_key] = condition_value

# Replace each rule's 'U' entry in `persekolahan` (not in merged_conditions)
# with the list obtained by evaluating it as a Python expression.
# NOTE(review): eval() executes whatever expression the YAML file contains; only
# acceptable while the config file is fully trusted. It also requires every
# rule to have a string 'U' entry that evaluates to an iterable.
for key in persekolahan:
    persekolahan[key]['U'] = list(eval(persekolahan[key]['U']))

In [5]:
# Extract the dataset name from the file path
dataset_name = os.path.basename(dataset_path)

# First three characters of the file name ('dsB' for 'dsB092021STB.xlsx')
doc_type = dataset_name[:3]

# Characters at index 3 and 4 ('09' for 'dsB092021STB.xlsx')
quarter_ref = dataset_name[3:5]

# Characters at index 5 to 8, converted to int (2021 for 'dsB092021STB.xlsx')
year_ref = int(dataset_name[5:9])

In [6]:
# Any cell whose text form is "None" (ignoring surrounding whitespace) becomes NaN,
# column by column across the whole DataFrame; all other cells are kept as they are.
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: cell if str(cell).strip() != 'None' else np.nan
    )

In [None]:
# def format_columns(data, columns_to_format):
#     # Standardize columns with 6 digits and leading zeros
#     for column in columns_to_format['leading_zeros_6_digits']:
#         if column in data.columns:
#             data[column] = data[column].apply(lambda x: str(int(x)).zfill(6) if not pd.isnull(x) else x)
#             print(f"Formatted {column} to have 6 digits with leading zeros.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")

#     # Standardize columns with 5 digits and leading zeros
#     for column in columns_to_format['leading_zeros_5_digits']:
#         if column in data.columns:
#             data[column] = data[column].apply(lambda x: str(int(x)).zfill(5) if not pd.isnull(x) else x)
#             print(f"Formatted {column} to have 5 digits with leading zeros.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")
            
#     # Standardize columns with 4 digits and leading zeros
#     for column in columns_to_format['leading_zeros_4_digits']:
#         if column in data.columns:
#             data[column] = data[column].apply(lambda x: str(int(x)).zfill(4) if not pd.isnull(x) else x)
#             print(f"Formatted {column} to have 4 digits with leading zeros.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")
            
#     # Standardize columns with 3 digits and leading zeros
#     for column in columns_to_format['leading_zeros_3_digits']:
#         if column in data.columns:
#             data[column] = data[column].apply(lambda x: str(int(x)).zfill(3) if not pd.isnull(x) else x)
#             print(f"Formatted {column} to have 3 digits with leading zeros.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")
            
#     # Standardize columns with 2 digits and leading zeros
#     for column in columns_to_format['leading_zeros_2_digits']:
#         if column in data.columns:
#             data[column] = data[column].apply(lambda x: str(int(x)).zfill(2) if not pd.isnull(x) else x)
#             print(f"Formatted {column} to have 2 digits with leading zeros.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")

#     # Convert columns to specific formats
#     for column, data_type in columns_to_format['data_types'].items():
#         if column in data.columns:
#             if data_type == int:
#                 data[column] = data[column].apply(lambda x: int(x) if pd.notnull(x) and str(x).isdigit() else x)
#                 print(f"Converted {column} to {data_type}.")
#             else:
#                 data[column] = data[column].astype(data_type)
#                 print(f"Converted {column} to {data_type}.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")

#     # Standardize columns with 7 digits and two decimal places (string)
#     for column in columns_to_format['standardize_7_digits']:
#         if column in data.columns:
#             data[column] = data[column].apply(lambda x: f"{float(x):08.2f}" if pd.notnull(x) and (isinstance(x, float) or (isinstance(x, str) and x.replace('.', '', 1).isdigit())) else x)
#             print(f"Standardized {column} to have 7 digits and two decimal places.")
#         else:
#             print(f"Column '{column}' not found in the DataFrame.")

#     # Return the modified DataFrame
#     return data



# columns_to_format_01 = {
#     'leading_zeros_6_digits': [],
#     'leading_zeros_5_digits': ['S19', 'S18'],
#     'leading_zeros_4_digits': ['KW', 'NGRA', 'FS', 'MASCO_4D'],
#     'leading_zeros_3_digits': ['KW', 'NGRA', 'FS', 'MASCO_3D', 'NO KEL'],
#     'leading_zeros_2_digits': ['PKIS', 'BK', 'NGRI', 'PT', 'SJ', 'Grp_SJ', 'MASCO_2D'],
#     'standardize_7_digits': [],
#     'data_types': {
#         'P': int,
#         'U': int,
#         'PT': str,
#         'SJ': str,
#         'HMIS': int,
#         'HMWA': int,
#         'MSIC_1D': str,
#         'RIN': int,
#         'B': int,
#         'NG': int,
#         'DP': int,
#         'ST': int,
#     }
# }

# columns_to_format_02 = {
#     'leading_zeros_6_digits': ['KOD_MASCO'],
#     'leading_zeros_5_digits': [],
#     'leading_zeros_4_digits': [],
#     'leading_zeros_3_digits': [],
#     'leading_zeros_2_digits': [],
#     'standardize_7_digits': [],
#     'data_types': {
        
#     }
# }

# columns_to_format_03 = {
#     'leading_zeros_6_digits': [],
#     'leading_zeros_5_digits': ['KOD_MSIC'],
#     'leading_zeros_4_digits': [],
#     'leading_zeros_3_digits': [],
#     'leading_zeros_2_digits': [],
#     'standardize_7_digits': [],
#     'data_types': {
        
#     }
# }

# columns_to_format_04 = {
#     'leading_zeros_6_digits': ['Kod'],
#     'leading_zeros_5_digits': [],
#     'leading_zeros_4_digits': [],
#     'leading_zeros_3_digits': [],
#     'leading_zeros_2_digits': [],
#     'standardize_7_digits': [],
#     'data_types': {
        
#     }
# }

# formatted_data_01 = format_columns(df, columns_to_format_01)
# formatted_data_02 = format_columns(df_masco, columns_to_format_02)
# formatted_data_03 = format_columns(df_msic, columns_to_format_03)
# formatted_data_04 = format_columns(df_status, columns_to_format_04)

In [7]:
# Allowed-value lists built from the reference tables that read_files() loaded.
# NOTE(review): pkis_list and ngri_list are assigned again in the range-check
# cell further down (ngri_list with different contents); which definition is
# live depends on the order the cells were executed in.

# 'Kod' rendered as text and left-padded with zeros to 3 characters
kw_list = df_kw["Kod"].astype(str).apply(lambda x: x.zfill(3)).tolist()
# Only the first 97 rows of df_ket are used, converted to int.
# NOTE(review): the 97-row cut-off is not explained anywhere in this notebook - confirm.
ket_list = list(map(int, df_ket.iloc[:97]["Kod"].values))
# Raw column values, no padding or type conversion applied
status_list = df_status["Kod"].values.tolist()
masco_list = df_masco["KOD_MASCO"].values.tolist()
msic_list = df_msic["KOD_MSIC"].values.tolist()
# '01' .. '12'
pkis_list = [str(i).zfill(2) for i in range(1, 13)]
# '001' .. '999'
no_kel_list = [str(i).zfill(3) for i in range(1, 1000)]
# Raw 'KOD' values of the country reference table (df_ngra)
ngri_list = df_ngra["KOD"].values.tolist()

In [None]:
### Range checks (Semakan Julat) JR4 ###

# Allowed values per column, consumed by validate_all_julats below.
# Integer ranges are compared against numeric cells, zero-padded strings against text cells.
# NOTE(review): several of these names are also assigned in other cells
# (pkis_list and ngri_list in the reference-list cell above, oku_list as a list
# of ints in the Konsistensi 1(d) cell, tp_list before validate_condition_08),
# so the values seen by the range check depend on cell execution order.
kel_list = list(range(1000))
b_list = list(range(13))
ng_list = list(range(17))
dp_list = list(range(32))
db_list = list(range(170))
# '00001' .. '99999' (a set, unlike the other containers here)
db2_list = set(str(i).zfill(5) for i in range(1, 100000))
st_list = list(range(10))
notk_list = list(range(1000))
noir_list = list(range(100))
t_list = list(range(100))
# '01' .. '12'
pkis_list = [str(i).zfill(2) for i in range(1, 13)]
# '01' .. '99'
hmis_list = [str(i).zfill(2) for i in range(1, 100)]
# '01' .. '12'
bk_list = [str(i).zfill(2) for i in range(1, 13)]
# '1900' .. '2999'
tk_list = [str(i).zfill(4) for i in range(1900, 3000)]
u_list = list(range(201))
tp_list = list(range(6))
# '01' .. '16' plus '98'
ngri_list = [str(i).zfill(2) for i in range(1, 17)] + ['98']
# Reference-table codes rendered as text and zero-padded to a fixed width
ngra_list = df_ngra["KOD"].astype(str).apply(lambda x: x.zfill(3)).tolist()
oku_list = df_oku["Kod"].astype(str).apply(lambda x: x.zfill(2)).tolist()
pt_22_list = df_pend_22["KOD"].astype(str).apply(lambda x: x.zfill(3)).tolist()
sj_22_list = df_sijil_22["KOD"].astype(str).apply(lambda x: x.zfill(3)).tolist()
# Raw 'Kod' values, no padding
ip_list = df_ip["Kod"].tolist()
fs_list = df_fs["Kod"].astype(str).apply(lambda x: x.zfill(4)).tolist()
hmwa_list = list(range(100))

def validate_all_julats(row):
    """Add the JULAT_001 .. JULAT_031 range-check flags to one record.

    Each flag is 1 when the corresponding column holds an allowed value and 0
    otherwise. The allowed values come from the module-level ``*_list``
    variables, which must exist before this function is called.

    Parameters:
        row (pd.Series): One record containing every column checked below.

    Returns:
        pd.Series: The same ``row`` object, with the JULAT_* entries added.
    """
    row['JULAT_001'] = 1 if row['NOKEL'] in no_kel_list else 0
    row['JULAT_002'] = 1 if row['B'] in b_list else 0
    row['JULAT_003'] = 1 if row['NG'] in ng_list else 0
    row['JULAT_004'] = 1 if row['DP'] in dp_list else 0
    row['JULAT_005'] = 1 if row['DB'] in db_list else 0
    # NOTE(review): BP is checked against db_list (the DB range) - confirm this
    # is intended and not a copy-paste from the line above.
    row['JULAT_006'] = 1 if row['BP'] in db_list else 0
    row['JULAT_007'] = 1 if row['BP2'] in db2_list else 0
    row['JULAT_008'] = 1 if row['ST'] in st_list else 0
    row['JULAT_009'] = 1 if row['NOTK'] in notk_list else 0
    row['JULAT_010'] = 1 if row['NOIR'] in noir_list else 0
    row['JULAT_011'] = 1 if row['JR'] == 4 else 0
    row['JULAT_012'] = 1 if isinstance(row['NAMA'], str) and len(row['NAMA']) <= 50 else 0
    # A missing NOIC is accepted; otherwise it must be text of at most 12 characters.
    row['JULAT_013'] = 1 if (isinstance(row['NOIC'], str) and len(row['NOIC']) <= 12) or pd.isna(row['NOIC']) else 0
    row['JULAT_014'] = 1 if row['PKIS'] in pkis_list else 0
    row['JULAT_015'] = 1 if row['HMIS'] in hmis_list else 0
    row['JULAT_016'] = 1 if row['J'] in [1, 2] else 0
    row['JULAT_017'] = 1 if row['BK'] in bk_list else 0
    row['JULAT_018'] = 1 if row['TK'] in tk_list else 0
    row['JULAT_019'] = 1 if row['U'] in u_list else 0
    row['JULAT_020'] = 1 if row['KET'] in ket_list else 0
    row['JULAT_021'] = 1 if row['KW'] in kw_list else 0
    row['JULAT_022'] = 1 if row['TP'] in tp_list else 0
    # Fixed: NGRI was validated against pkis_list instead of its own ngri_list.
    row['JULAT_023'] = 1 if row['NGRI'] in ngri_list else 0
    row['JULAT_024'] = 1 if row['NGRA'] in ngra_list else 0
    row['JULAT_025'] = 1 if row['OKU'] in oku_list else 0
    row['JULAT_026'] = 1 if row['P'] in [1, 2, 3, 4] else 0
    row['JULAT_027'] = 1 if row['PT'] in pt_22_list else 0
    row['JULAT_028'] = 1 if row['SJ'] in sj_22_list else 0
    # A missing IP is accepted.
    row['JULAT_029'] = 1 if row['IP'] in ip_list or pd.isna(row['IP']) else 0
    row['JULAT_030'] = 1 if row['FS'] in fs_list else 0
    row['JULAT_031'] = 1 if row['HMWA'] in hmwa_list else 0

    return row

df = df.apply(validate_all_julats, axis=1)

In [21]:
### Konsistensi 1(a) ###

# Create today date variable
# today = datetime.date.today()
# tahun_semasa = today.year

def validate_condition_01(row):
    """
    Check that year of birth ('TK') and age ('U') agree with the reference year.

    The gap ``year_ref - TK - U`` is computed after converting both fields to
    int; ``year_ref`` is read from module scope. A gap larger than 3 marks the
    record as inconsistent.

    Parameters:
        row (pd.Series): Record providing 'TK' and 'U' as values that int()
                         can convert.

    Returns:
        int: 1 when the gap is at most 3 (consistent), 0 when it exceeds 3.
    """
    gap = year_ref - int(row['TK']) - int(row['U'])
    return 0 if gap > 3 else 1


# Rows whose TK or U is not made of digits are flagged 0 without calling the validator.
# str() keeps NaN and other non-string cells from raising AttributeError on
# .isdigit(): 'nan' is not a digit string, so such rows simply get 0.
df['KONSISTENSI_01a'] = df.apply(
    lambda row: validate_condition_01(row)
    if str(row['TK']).isdigit() and str(row['U']).isdigit()
    else 0,
    axis=1,
).astype(int)

  df['KONSISTENSI_01a'] = df.apply(lambda row: validate_condition_01(row) if row['TK'].isdigit() and row['U'].isdigit() else 0, axis=1).astype(int)


In [23]:
### Konsistensi 1(b) ###
def validate_condition_02(df):
    """
    Flag, in place, whether each record satisfies consistency rule 1(b).

    A record passes (1) when 'U' <= 15, or when 'U' > 15 and 'HMWA' is not
    null. Every other record is marked 0. 'U' is converted with astype(int).

    Parameters:
    - df: Input DataFrame; gains (or has overwritten) the 'KONSISTENSI_02' column.

    Returns nothing; the DataFrame is modified in place.
    """
    # Start from "fail" and promote the rows that satisfy either branch.
    df['KONSISTENSI_02'] = 0

    age = df['U'].astype(int)
    hours_present = df['HMWA'].notnull()
    passes = (age <= 15) | ((age > 15) & hours_present)
    df.loc[passes, 'KONSISTENSI_02'] = 1

validate_condition_02(df)

  df['KONSISTENSI_02'] = 0


In [25]:
### Konsistensi 1(c) ###

# Allowed values for the 'NGRI' column: '01' through '16'
ngri_lst = [f'{code:02d}' for code in range(1, 17)]

# Main function
def validate_condition_03(df, values):
    """Flag records whose 'NGRA' is '458' but whose 'NGRI' is not an allowed value.

    Adds 'KONSISTENSI_03' to ``df``: 0 for the records described above, 1 for
    every other record. Returns the same DataFrame.
    """
    # Everything passes until proven otherwise
    df['KONSISTENSI_03'] = 1

    # Only records with NGRA == '458' are subject to the NGRI check
    subject = df[df['NGRA'] == '458']
    offenders = subject[~subject['NGRI'].isin(values)]

    # Write the failures back through their index labels
    df.loc[offenders.index, 'KONSISTENSI_03'] = 0

    return df

df = validate_condition_03(df, ngri_lst)

  df['KONSISTENSI_03'] = 1


In [27]:
### Konsistensi 1(d) ###
def validate_condition_04(df, oku_list):
    """
    Flag, in place, whether each record's 'OKU' value is an allowed code.

    'OKU' is converted with astype(int) before the membership test, so
    ``oku_list`` is expected to hold integers.

    Parameters:
    - df: Input DataFrame; gains the 'KONSISTENSI_04' column (1 = allowed, 0 = not).
    - oku_list: Collection of valid OKU values

    Returns nothing; the DataFrame is modified in place.
    """
    oku_codes = df['OKU'].astype(int)
    is_allowed = oku_codes.isin(oku_list)
    df['KONSISTENSI_04'] = is_allowed.astype(int)

oku_list = list(map(int, df_oku["Kod"].values))

validate_condition_04(df, oku_list)

  df['KONSISTENSI_04'] = df['OKU'].astype(int).isin(oku_list).astype(int)


In [31]:
### Konsistensi 1(e) ###

# Main function
def validate_condition_05a(df, kw):
    """Set 'KONSISTENSI_05a' to 1 where 'KW' (compared as text) is in ``kw``, else 0. In place."""
    kw_as_text = df['KW'].astype(str)
    df['KONSISTENSI_05a'] = kw_as_text.isin(kw).astype(int)

validate_condition_05a(df, kw_list)

# Main function
def validate_condition_05b(df, ket):
    """Set 'KONSISTENSI_05b' to 1 where 'KET' (converted to int) is in ``ket``, else 0. In place."""
    ket_as_int = df['KET'].astype(int)
    df['KONSISTENSI_05b'] = ket_as_int.isin(ket).astype(int)

validate_condition_05b(df, ket_list)

  df['KONSISTENSI_05a'] = df['KW'].astype(str).isin(kw).astype(int)
  df['KONSISTENSI_05b'] = df['KET'].astype(int).isin(ket).astype(int)


In [34]:
### Konsistensi 1(f) ###

# Main function
def validate_condition_06(df, merged_conditions):
    """Flag records that fully satisfy at least one rule in ``merged_conditions``.

    ``merged_conditions`` maps a rule name to a dict of column -> expected
    value. A list as expected value means "cell must be one of these"; any
    other value means "cell must equal this". A record gets
    'KONSISTENSI_06' = 1 when every column of some rule matches, otherwise 0.

    Returns the same DataFrame with the 'KONSISTENSI_06' column added.
    """
    def _satisfies(record, rule):
        # True only when every column/expected pair of the rule holds for the record
        for column, expected in rule.items():
            if isinstance(expected, list):
                if record[column] not in expected:
                    return False
            elif record[column] != expected:
                return False
        return True

    # Default to "no rule matched"
    df['KONSISTENSI_06'] = 0

    for label, record in df.iterrows():
        # any() stops at the first rule that matches, like the original early break
        if any(_satisfies(record, rule) for rule in merged_conditions.values()):
            df.loc[label, 'KONSISTENSI_06'] = 1

    return df


df = validate_condition_06(df, persekolahan)

  df['KONSISTENSI_06'] = 0


In [None]:
### Konsistensi 1(g) ###

###
# Same as 1(f)
###




In [39]:
tp_list = list(range(6))

In [40]:
def validate_condition_08(df, tp_lst):
    """
    Flag, in place, whether each record satisfies the 'U' / 'TP' rule.

    A record passes (1) when 'U' >= 18, or when 'U' < 18 and 'TP' is one of
    ``tp_lst``. Every other record is marked 0. 'U' is converted with astype(int).

    Parameters:
    - df: Input DataFrame; gains the 'KONSISTENSI_08' column.
    - tp_lst: List of valid TP values

    Returns nothing; the DataFrame is modified in place.
    """
    # Start from "fail" and promote the rows that satisfy either branch.
    df['KONSISTENSI_08'] = 0

    age = df['U'].astype(int)
    tp_allowed = df['TP'].isin(tp_lst)
    passes = (age >= 18) | ((age < 18) & tp_allowed)
    df.loc[passes, 'KONSISTENSI_08'] = 1

validate_condition_08(df, tp_list)

  df['KONSISTENSI_08'] = 0


In [42]:
def validate_condition_09(df):
    """
    Flag, in place, records where 'HMWA' equals 'HMIS' for 'U' > 15.

    A record gets 1 only when all of these hold: 'U' (converted with
    astype(int)) is greater than 15, 'HMWA' is not null, and 'HMWA' == 'HMIS'.
    Every other record - including those with 'U' <= 15 - stays 0.

    Parameters:
    - df: Input DataFrame; gains the 'KONSISTENSI_09' column.

    Returns nothing; the DataFrame is modified in place.
    """
    # Default every record to 0
    df['KONSISTENSI_09'] = 0

    age = df['U'].astype(int)
    in_scope = (age > 15) & df['HMWA'].notnull()
    hours_agree = df['HMWA'] == df['HMIS']

    df.loc[in_scope & hours_agree, 'KONSISTENSI_09'] = 1

validate_condition_09(df)

  df['KONSISTENSI_09'] = 0


In [43]:
df['KONSISTENSI_09'].value_counts()

KONSISTENSI_09
1    20343
0     6822
Name: count, dtype: int64

In [None]:
### T-C1a ###

# Main function
def validate_condition_TC1a(data):
    """Flag records with 'MSIC_1D' == 'O' whose 'S19' is outside '84111'..'84300'.

    'S19' is compared as text. Adds 'KONSISTENSI_TC1a' (0 = fails, 1 = passes)
    and returns the same DataFrame.
    """
    s19_in_range = (data['S19'] >= '84111') & (data['S19'] <= '84300')
    offenders = data[(data['MSIC_1D'] == 'O') & ~s19_in_range]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC1a'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC1a'] = 0

    return data

result = validate_condition_TC1a(df)

In [None]:
### T-C1b ###

def validate_condition_TC1b(data):
    """Flag records with 'MSIC_1D' == 'O' and 'S19' in '84111'..'84300' whose 'KW' is not 458.

    'KW' is accepted both as the number 458 and as the text '458'. The rest of
    this notebook treats KW as zero-padded text, and comparing such text with
    the integer 458 never matches, which made every in-range record fail.

    Adds 'KONSISTENSI_TC1b' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    s19_in_range = (data['S19'] >= '84111') & (data['S19'] <= '84300')
    # Numeric match covers int/float cells, text match covers string cells.
    kw_is_458 = (data['KW'] == 458) | (data['KW'].astype(str).str.strip() == '458')

    filtered_data_TC1b = data[(data['MSIC_1D'] == 'O') & s19_in_range & ~kw_is_458]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC1b'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC1b.index, 'KONSISTENSI_TC1b'] = 0

    return data

result = validate_condition_TC1b(df)

In [None]:
# Records of the main dataset whose 'MSIC_1D' is 'T'.
# NOTE(review): this rebinds df_msic, the name read_files() uses for the
# msic_code_detail_01.csv reference table; re-running the cell that builds
# msic_list after this one would read the wrong frame. Consider another name.
df_msic = df[df['MSIC_1D'] == 'T']
df_msic.shape

In [None]:
def validate_condition_TC2(df):
    """
    Flag, in place, records with 'MSIC_1D' == 'T' whose 'S20' is not 3.

    Adds a 'KONSISTENSI_TC2' column: 0 for the records described above,
    1 for every other record.

    Parameters:
    - df: Input DataFrame

    Returns nothing; the DataFrame is modified in place.
    """
    # Every record passes until shown otherwise
    df['KONSISTENSI_TC2'] = 1

    # Records the rule applies to
    condition_1 = (df['MSIC_1D'] == 'T')

    # Records that violate the rule
    condition_2 = df['S20'] != 3

    # Fixed: violating records were previously set to 1, the same as the
    # default, so the check could never report a failure.
    df.loc[condition_1 & condition_2, 'KONSISTENSI_TC2'] = 0
    
validate_condition_TC2(df)

In [None]:
### T-C3 ###

def validate_condition_TC3(data):
    """Flag records with 'MSIC_1D' == 'P' whose 'S20' is not 2.

    Adds 'KONSISTENSI_TC3' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    is_p = data['MSIC_1D'] == 'P'
    s20_not_2 = data['S20'] != 2
    offenders = data[is_p & s20_not_2]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC3'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC3'] = 0

    return data

result = validate_condition_TC3(df)

In [None]:
### T-C4 ###

def validate_condition_TC4(data):
    """Flag records with 'MSIC_1D' == 'A' and 'MASCO_1D' == 9 whose 'S20' is not 3, 4 or 5.

    Adds 'KONSISTENSI_TC4' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    in_scope = (data['MSIC_1D'] == 'A') & (data['MASCO_1D'] == 9)
    s20_allowed = data['S20'].isin([3, 4, 5])
    offenders = data[in_scope & ~s20_allowed]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC4'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC4'] = 0

    return data

result = validate_condition_TC4(df)

In [None]:
### T-C5a ###

def validate_condition_TC5a(data):
    """Flag records with 'S18' in '111101'..'291918' whose 'SJ' is not within 20..242.

    'S18' is compared as text; 'SJ' is compared numerically through a temporary
    copy (values that cannot be parsed count as outside the range).

    Two defects of the earlier version are fixed:
    - the negation applied only to ``SJ >= 20``, so only SJ < 20 was ever
      flagged and SJ > 242 slipped through;
    - the 'SJ' column itself was overwritten with numbers, destroying the
      zero-padded codes other checks in this notebook compare against.

    Adds 'KONSISTENSI_TC5a' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    # Numeric view of SJ for the range test only; data['SJ'] is left untouched.
    sj_numeric = pd.to_numeric(data['SJ'], errors='coerce')

    s18_in_range = (data['S18'] >= '111101') & (data['S18'] <= '291918')
    sj_in_range = (sj_numeric >= 20) & (sj_numeric <= 242)

    filtered_data_TC5a = data[s18_in_range & ~sj_in_range]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC5a'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC5a.index, 'KONSISTENSI_TC5a'] = 0

    return data

result = validate_condition_TC5a(df)

In [None]:
### T-C5b ###

def validate_condition_TC5b(data):
    """Flag records with 'S18' in '111101'..'291918' whose 'S20' is not 4.

    'S18' is compared as text. Adds 'KONSISTENSI_TC5b' (0 = fails, 1 = passes)
    and returns the same DataFrame.
    """
    s20_is_4 = data['S20'] == 4
    s18_in_range = (data['S18'] >= '111101') & (data['S18'] <= '291918')
    offenders = data[~s20_is_4 & s18_in_range]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC5b'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC5b'] = 0

    return data

result = validate_condition_TC5b(df)

In [None]:
### T-C6 ###

def validate_condition_TC6(data):
    """Flag records with 'STATUS' == 'GOV' whose 'S20' is not 2.

    Adds 'KONSISTENSI_TC6' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    is_gov = data['STATUS'] == 'GOV'
    s20_is_2 = data['S20'] == 2
    offenders = data[is_gov & ~s20_is_2]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC6'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC6'] = 0

    return data

result = validate_condition_TC6(df)

In [None]:
### T-C7 ###

def validate_condition_TC7(data, masco_list):
    """Flag records whose 'S18' is in ``masco_list`` while 'S20' != 2 and 'STATUS' != 'GOV'.

    Adds 'KONSISTENSI_TC7' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    s18_listed = data['S18'].isin(masco_list)
    s20_not_2 = data['S20'] != 2
    not_gov = data['STATUS'] != 'GOV'
    offenders = data[s18_listed & s20_not_2 & not_gov]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC7'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC7'] = 0

    return data

result = validate_condition_TC7(df, masco_list)

In [None]:
### T-C8a ###

def validate_condition_TC8a(data):
    """Flag records with 'S20' == 2 whose 'S19' is outside '84111'..'84300'.

    'S19' is compared as text. Adds 'KONSISTENSI_TC8a' (0 = fails, 1 = passes)
    and returns the same DataFrame.
    """
    s19_in_range = (data['S19'] >= '84111') & (data['S19'] <= '84300')
    offenders = data[(data['S20'] == 2) & ~s19_in_range]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC8a'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC8a'] = 0

    return data

result = validate_condition_TC8a(df)

In [None]:
### T-C8b ###

def validate_condition_TC8b(data):
    """Flag records with 'S20' == 4 that have a non-null 'S19'.

    Adds 'KONSISTENSI_TC8b' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    s20_is_4 = data['S20'] == 4
    s19_present = data['S19'].notna()
    offenders = data[s20_is_4 & s19_present]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC8b'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC8b'] = 0

    return data

result = validate_condition_TC8b(df)

In [None]:
### T-C9 ###

def validate_condition_TC9(data):
    """Flag records with 'MSIC_1D' == 'T' whose 'PKIS' is not 11.

    'PKIS' is accepted both as the number 11 and as the text '11'. Other
    checks in this notebook compare PKIS with zero-padded text such as '01';
    against such text the earlier integer comparison never matched, so every
    'T' record failed.

    Adds 'KONSISTENSI_TC9' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    # Numeric match covers int/float cells, text match covers string cells.
    pkis_is_11 = (data['PKIS'] == 11) | (data['PKIS'].astype(str).str.strip() == '11')

    filtered_data_TC9 = data[(data['MSIC_1D'] == 'T') & ~pkis_is_11]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC9'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC9.index, 'KONSISTENSI_TC9'] = 0

    return data

result = validate_condition_TC9(df)

In [None]:
### T-C10 ###

def validate_condition_TC10(data):
    """Flag records with 'S19' in '98100'..'98200' whose 'RIN' is missing.

    'S19' is compared as text. The earlier test ``~(RIN != np.NaN)`` could
    never be true, because nothing compares equal to NaN, so no record was ever
    flagged; ``np.NaN`` is also no longer available in NumPy 2.
    Missing values are now detected with ``isna()``.

    Adds 'KONSISTENSI_TC10' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    s19_in_range = (data['S19'] >= '98100') & (data['S19'] <= '98200')
    rin_missing = data['RIN'].isna()

    filtered_data_TC10 = data[s19_in_range & rin_missing]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC10'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC10.index, 'KONSISTENSI_TC10'] = 0

    return data

result = validate_condition_TC10(df)

In [None]:
### T-C11 ###

def validate_condition_TC11(data):
    """Flag records with 'RIN' between '1' and '5' whose 'S19' equals '99000'.

    Both columns are compared as text. Adds 'KONSISTENSI_TC11'
    (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    rin_in_range = (data['RIN'] >= '1') & (data['RIN'] <= '5')
    # Same rows as the double negative "not (S19 != '99000')"
    s19_is_99000 = data['S19'] == '99000'
    offenders = data[rin_in_range & s19_is_99000]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC11'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC11'] = 0

    return data

result = validate_condition_TC11(df)

In [None]:
### T-C12 ###

def validate_condition_TC12(data):
    """Flag records with non-null 'MASCO_1D' and 'MSIC_1D' whose 'S19' is outside '98100'..'98200'.

    'S19' is compared as text. Two defects of the earlier version are fixed:
    - ``!= np.NaN`` is true for every value (and ``np.NaN`` is gone in
      NumPy 2), so the null checks did nothing; ``notna()`` is used instead;
    - the negation applied only to ``S19 >= '98100'`` rather than to the whole
      range test, so codes above '98200' were not treated as outside the range.

    NOTE(review): as written, the rule flags every record that has both codes
    and falls outside the range - confirm this is the intended rule.

    Adds 'KONSISTENSI_TC12' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    both_codes_present = data['MASCO_1D'].notna() & data['MSIC_1D'].notna()
    s19_in_range = (data['S19'] >= '98100') & (data['S19'] <= '98200')

    filtered_data_TC12 = data[both_codes_present & ~s19_in_range]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC12'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC12.index, 'KONSISTENSI_TC12'] = 0

    return data

result = validate_condition_TC12(df)

In [None]:
### T-C13 ###

def validate_condition_TC13(data):
    """Flag records with 'KW' == '458' whose 'KET' is not in the module-level ``ket_list``.

    Adds 'KONSISTENSI_TC13' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    kw_is_458 = data['KW'] == '458'
    ket_listed = data['KET'].isin(ket_list)
    offenders = data[kw_is_458 & ~ket_listed]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC13'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC13'] = 0

    return data

result = validate_condition_TC13(df)

In [None]:
### T-C14 ###

# TC-14: If sex (J) is male (code 1),
#     check PKIS = 01 (head of household) and S10 = 02 (housework / family responsibilities).
#     If age (U) >= 50 the record is acceptable, but if U < 50 it must be re-checked.


def validate_condition_TC14(data):
    """Flag records with J == '1', PKIS == '01' and S10 == '2' whose age 'U' is below 50.

    'U' is compared numerically. The earlier text comparison ``U >= '50'`` was
    lexicographic, so e.g. '9' counted as at least '50' while '100' did not.
    Values of 'U' that cannot be parsed as a number are treated as "not >= 50".

    NOTE(review): the rule description says S10 = 02 while the code compares
    with '2' - confirm how S10 is encoded.

    Adds 'KONSISTENSI_TC14' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    # Numeric view of the age; works for text and numeric columns alike.
    age = pd.to_numeric(data['U'], errors='coerce')

    filtered_data_TC14 = data[(data['J'] == '1') &
                              (data['PKIS'] == '01') &
                              (data['S10'] == '2') &
                              ~(age >= 50)]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC14'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC14.index, 'KONSISTENSI_TC14'] = 0

    return data

result = validate_condition_TC14(df)

In [None]:
def validate_condition_TC15a(data):
    """Flag records with 'TP' == 1 whose 'PKIS' is '02', '04', '05', '07' or '08'.

    Adds 'KONSISTENSI_TC15a' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    tp_is_1 = data['TP'] == 1
    pkis_flagged = data['PKIS'].isin(['02', '04', '05', '07', '08'])
    offenders = data[tp_is_1 & pkis_flagged]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC15a'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC15a'] = 0

    return data

result = validate_condition_TC15a(df)

################################################################################

def validate_condition_TC15b(data):
    """Flag records with 'TP' in 2..5 whose age 'U' is 17 or below.

    'U' is compared numerically. The earlier text comparison ``U <= '17'`` was
    lexicographic, so '9' was not treated as <= '17' while '100' was.
    Values of 'U' that cannot be parsed as a number are never flagged.

    Adds 'KONSISTENSI_TC15b' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    # Numeric view of the age; works for text and numeric columns alike.
    age = pd.to_numeric(data['U'], errors='coerce')

    filtered_data_TC15b = data[(data['TP'].isin([2, 3, 4, 5])) &
                               (age <= 17)]

    # Create the result column and initialize it with 1 (pass)
    data['KONSISTENSI_TC15b'] = 1

    # Mark the rows that fail the condition as 0 (fail)
    data.loc[filtered_data_TC15b.index, 'KONSISTENSI_TC15b'] = 0

    return data

result = validate_condition_TC15b(df)

In [None]:
def validate_condition_TC16a(data):
    """Flag records with 'P' == 1, 'S10' in (1, 7) and 'S15' in (1, 2, 3).

    Adds 'KONSISTENSI_TC16a' (0 = fails, 1 = passes) and returns the same DataFrame.
    """
    p_is_1 = data['P'] == 1
    s10_flagged = data['S10'].isin([1, 7])
    s15_flagged = data['S15'].isin([1, 2, 3])
    offenders = data[p_is_1 & s10_flagged & s15_flagged]

    # Pass by default, then knock out the offending index labels
    data['KONSISTENSI_TC16a'] = 1
    data.loc[offenders.index, 'KONSISTENSI_TC16a'] = 0

    return data

result = validate_condition_TC16a(df)

################################################################################

def validate_condition_TC16b(data):
    """Flag rows that violate consistency check TC16b.

    A row fails when ``P == 2`` and ``S10`` is one of 3, 8, 9, 12 or 13.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``P`` and ``S10``. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the column ``KONSISTENSI_TC16b``
        added (or overwritten): 1 = pass, 0 = fail.
    """
    # Boolean mask of the rows that break the rule.
    failed_TC16b = (data['P'] == 2) & data['S10'].isin([3, 8, 9, 12, 13])

    # Start with every row passing ...
    data['KONSISTENSI_TC16b'] = 1

    # ... then mark the failures. Assigning through the mask (instead of
    # through index labels) keeps a non-unique index from flagging passing
    # rows that merely share a label with a failing row.
    data.loc[failed_TC16b, 'KONSISTENSI_TC16b'] = 0

    return data

# Run check TC16b. The function writes KONSISTENSI_TC16b into `df` itself and
# returns that same frame, so `result` and `df` refer to one object.
result = validate_condition_TC16b(df)
                               
################################################################################

def validate_condition_TC16c(data):
    """Flag rows that violate consistency check TC16c.

    A row fails when ``P == 4`` and ``S10`` is 1 or 7.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``P`` and ``S10``. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the column ``KONSISTENSI_TC16c``
        added (or overwritten): 1 = pass, 0 = fail.
    """
    # Boolean mask of the rows that break the rule.
    failed_TC16c = (data['P'] == 4) & data['S10'].isin([1, 7])

    # Start with every row passing ...
    data['KONSISTENSI_TC16c'] = 1

    # ... then mark the failures. Assigning through the mask (instead of
    # through index labels) keeps a non-unique index from flagging passing
    # rows that merely share a label with a failing row.
    data.loc[failed_TC16c, 'KONSISTENSI_TC16c'] = 0

    return data

# Run check TC16c. The function writes KONSISTENSI_TC16c into `df` itself and
# returns that same frame, so `result` and `df` refer to one object.
result = validate_condition_TC16c(df)

In [None]:
# Frequency of each distinct value in column S8 (missing values are left out
# by value_counts' default settings).
df['S8'].value_counts()

In [None]:
# How many rows passed (1) and failed (0) check TC16c.
df['KONSISTENSI_TC16c'].value_counts()

In [None]:
def print_column_value_counts(df):
    """Print a value-count summary for every column whose name starts with 'KONSISTENSI'.

    For each such column a header line is printed, followed by one line per
    distinct value giving its count and its share of the column's counted
    (non-missing) rows, and finally a blank separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``KONSISTENSI*`` columns are summarised.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    # Lazily pick out the columns to report on.
    flag_columns = (name for name in df.columns if name.startswith('KONSISTENSI'))

    for name in flag_columns:
        counts = df[name].value_counts()
        n_counted = counts.sum()

        print(f"Column: {name}")
        for result, count in counts.items():
            percentage = (count / n_counted) * 100
            print(f"Result: {result}, Count: {count}, Percentage: {percentage:.2f}%")

        # Blank line between columns.
        print()

# Print the pass/fail breakdown of every KONSISTENSI* column currently in df.
print_column_value_counts(df)

In [None]:
# Preview the first rows of df, including the KONSISTENSI* columns added above.
df.head()

In [None]:
# Inspect the distinct RIN values and the column dtype; the filters in the
# following cells compare RIN against the string '1'.
print(df['RIN'].unique())
print(df['RIN'].dtypes)

In [None]:
# Keep only the rows whose RIN equals the string '1'.
df_01 = df[df['RIN'] == '1']

In [None]:
# Sanity check: after the filter, '1' should be the only RIN value left.
df_01['RIN'].value_counts()

In [None]:
# Frequency of each distinct NOIC value.
# NOTE(review): if NOIC holds personal identifiers, clear this cell's output
# before sharing the notebook — confirm with the data owner.
df['NOIC'].value_counts()

In [None]:
# Row-by-row boolean Series: True where NOIC is missing.
df['NOIC'].isnull()

In [None]:
def _is_member(value, allowed):
    """Return True when ``value`` is not missing (None/NaN) and is contained in ``allowed``."""
    return (not pd.isna(value)) and value in allowed


def _s3_is_valid(value):
    """Return True when ``int(value)`` prints as two characters and ``value >= 30``."""
    if pd.isna(value):
        # int() cannot convert NaN/None; a missing value simply fails the check.
        return False
    return len(str(int(value))) == 2 and value >= 30


def _row_is_consistent(row):
    """Return True when one row satisfies every rule applied by ``check_data``."""
    return bool(
        row['S1'] == 1 and
        _s3_is_valid(row['S3']) and
        row['S7'] in [1, 2] and
        _is_member(row['S8'], [1000, 200, 30, 4, 1004, 204, 34]) and
        # NOTE(review): str() of a float such as 123456.0 is '123456.0', so a
        # float-typed S18/S19 can never match these lengths — confirm these
        # columns are stored as text or integers.
        len(str(row['S18'])) == 6 and
        len(str(row['S19'])) == 5 and
        _is_member(row['S20'], range(1, 7)) and
        row['S21'] in range(1, 4) and
        row['S22'] in range(1, 4) and
        row['S23'] in range(1, 12) and
        row['S24'] in [1, 2] and
        row['S34'] in [1, 2]
    )


def check_data(df):
    """Evaluate the combined rule set on the rows whose ``RIN`` equals '1'.

    The rules read the columns S1, S3, S7, S8, S18, S19, S20, S21, S22, S23,
    S24 and S34. Missing values (None/NaN) in S3, S8 or S20 make the row fail
    instead of raising an exception.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows with ``RIN == '1'`` plus a ``combined_result``
        column: 1 when every rule holds for the row, otherwise 0.
    """
    # Work on a copy of the RIN == '1' rows so the caller's frame is untouched.
    subset = df[df['RIN'] == '1'].copy()

    # Evaluate the rules row by row and store the outcome as 0/1.
    subset['combined_result'] = [
        int(_row_is_consistent(row)) for _, row in subset.iterrows()
    ]

    return subset

# NOTE(review): check_data returns only the RIN == '1' rows, so this rebinds
# `df` to that subset; every later cell, including the Excel/CSV exports,
# then works on the subset only — confirm this is intended.
df = check_data(df)

In [None]:
# How many rows satisfied all combined rules (1) versus at least one failure (0).
df['combined_result'].value_counts()

In [None]:
# Rows that failed the combined check (combined_result == 0).
df_rin_1 = df[df['combined_result'] == 0]

In [None]:
# Show up to 10 randomly chosen failing rows, limited to the columns that
# check_data inspects plus the combined outcome.
inspected_columns = ['S1', 'S3', 'S7', 'S8', 'S18', 'S19', 'S20', 'S21', 'S22', 'S23', 'S24', 'S34', 'combined_result']

# sample() raises ValueError when asked for more rows than exist, so cap the
# sample size at the number of failing rows available.
df_rin_1[inspected_columns].sample(n=min(10, len(df_rin_1)))

In [None]:
# Extract the original file name
original_file_name = os.path.basename(dataset_path)

suffix = '_konsistensi'

# Create the new file name: drop the trailing extension (whatever it is),
# then add the suffix and the .xlsx extension. splitext only touches the
# final extension, unlike str.replace, which would also remove '.xlsx'
# appearing elsewhere in the name and would leave other extensions in place.
new_file_name = os.path.splitext(original_file_name)[0] + suffix + '.xlsx'

# Save the DataFrame to Excel using the new file name.
# NOTE(review): output_file_path is presumably defined in an earlier cell — verify.
df.to_excel(os.path.join(output_file_path, new_file_name), index=False)

In [None]:
# Extract the original file name
original_file_name = os.path.basename(dataset_path)

suffix = '_konsistensi'

# Create the new file name: drop the trailing extension, then add the suffix
# and the .csv extension. The dataset is an .xlsx file, so stripping '.csv'
# with str.replace left the old extension in place and produced names such as
# 'dsB092021STB.xlsx_konsistensi.csv'.
new_file_name = os.path.splitext(original_file_name)[0] + suffix + '.csv'

# Save the DataFrame as CSV using the new file name.
# NOTE(review): output_file_path is presumably defined in an earlier cell — verify.
df.to_csv(os.path.join(output_file_path, new_file_name), index=False)

In [None]:
# Preview the first five rows of the frame that was just exported.
df.head(5)