In [1]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import yaml
import math
import datetime
import os

In [2]:
base_path = r'C:/Users/amnar/Desktop/gh_konsistensi/'

# File paths
dataset_path = base_path + 'data/sgu/JR5M10Y2021_DATA RAW.xlsx'
reference_files = ['ref/oku.csv',
                   'ref/kewarganegaraan.csv',
                   'ref/kumpulan_etnik.csv', 
                   'ref/persekolahan.csv',
                   'ref/pendidikan_rasmi.csv',
                   'ref/pendidikan_rasmi_tertinggi_2022.csv', 
                   'ref/sijil_tertinggi.csv',
                   'ref/sijil_tertinggi_2022.csv',
                   'ref/status_code.csv', 
                   'ref/msic_code_detail_01.csv',
                   'ref/masco_code.csv',
                   'ref/negara_code.csv', 
                   'ref/institusi_pengajian.csv',
                   'ref/bidang_pengajian.csv']

file_paths = [dataset_path] + [base_path + file for file in reference_files]

In [3]:
def read_files(*file_paths):
    def read_csv(file_path):
        return pd.read_csv(file_path, encoding='unicode_escape', low_memory=False)

    def read_excel(file_path):
        workbook = load_workbook(filename=file_path)
        sheet_name = workbook.sheetnames[0]  # Get the name of the first sheet
        worksheet = workbook[sheet_name]
        data = list(worksheet.values)
        return pd.DataFrame(data[1:], columns=data[0])

    file_readers = {
        'csv': read_csv,
        'xlsx': read_excel,
        'xls': read_excel
    }
    
    predefined_names = {
        'JR5M10Y2021_DATA RAW.xlsx': 'df',
        'oku.csv': 'df_oku',
        'kewarganegaraan.csv': 'df_kw',
        'kumpulan_etnik.csv': 'df_ket',
        'persekolahan.csv': 'df_persk',
        'pendidikan_rasmi.csv': 'df_pend',
        'pendidikan_rasmi_tertinggi_2022.csv': 'df_pend_22',
        'sijil_tertinggi.csv': 'df_sijil',
        'sijil_tertinggi_2022.csv': 'df_sijil_22',
        'status_code.csv': 'df_status',
        'msic_code_detail_01.csv': 'df_msic',
        'masco_code.csv': 'df_masco',
        'negara_code.csv': 'df_ngra',
        'institusi_pengajian.csv': 'df_ip',
        'bidang_pengajian.csv': 'df_fs'
    }
    
    for file_path in file_paths:
        filename = file_path.split('/')[-1]
        df_name = predefined_names[filename]
        file_format = filename.split('.')[-1]
        
        # Declare the dataframe name as global
        globals()[df_name] = file_readers[file_format](file_path)

# Call the function
read_files(*file_paths)

In [10]:
df['G3'].value_counts(dropna=False)

G3
01261    341
85211    319
85103    316
41001    305
86101    233
        ... 
96091      1
01492      1
16221      1
51102      1
08999      1
Name: count, Length: 619, dtype: int64

In [14]:
g3_list = df_msic['KOD_MSIC'].tolist()
g3_list

[1111,
 1112,
 1113,
 1119,
 1120,
 1131,
 1132,
 1133,
 1134,
 1135,
 1136,
 1137,
 1138,
 1138,
 1140,
 1150,
 1160,
 1191,
 1192,
 1193,
 1193,
 1199,
 1210,
 1221,
 1222,
 1223,
 1224,
 1225,
 1226,
 1227,
 1228,
 1229,
 1231,
 1232,
 1233,
 1239,
 1241,
 1249,
 1251,
 1252,
 1253,
 1259,
 1259,
 1261,
 1262,
 1263,
 1269,
 1271,
 1272,
 1273,
 1279,
 1281,
 1282,
 1283,
 1284,
 1285,
 1289,
 1291,
 1292,
 1293,
 1294,
 1295,
 1296,
 1299,
 1301,
 1301,
 1302,
 1303,
 1304,
 1304,
 1411,
 1411,
 1412,
 1413,
 1420,
 1430,
 1441,
 1442,
 1443,
 1450,
 1461,
 1462,
 1463,
 1464,
 1465,
 1466,
 1467,
 1468,
 1469,
 1491,
 1492,
 1493,
 1494,
 1495,
 1496,
 1497,
 1499,
 1500,
 1610,
 1620,
 1620,
 1631,
 1632,
 1632,
 1633,
 1634,
 1640,
 1701,
 1702,
 2101,
 2102,
 2103,
 2104,
 2105,
 2201,
 2202,
 2203,
 2204,
 2301,
 2302,
 2303,
 2309,
 2309,
 2309,
 2401,
 2401,
 2402,
 3111,
 3112,
 3113,
 3114,
 3115,
 3119,
 3121,
 3122,
 3123,
 3124,
 3129,
 3211,
 3212,
 3213,
 3214,
 3215,

In [None]:
import time

start_time = time.time()

In [None]:
### Semakan Julat JR2 ###

dr2_list = list(range(1,3))
g1_list = list(range(1,3))
g2_list = df_masco['KOD_MASCO'].tolist()
g3_list = df_msic['KOD_MSIC'].tolist()
g4a_list = list(range(1,7))
g5a_range = set(str(i).zfill(5) for i in range(1, 100000))
g5b_range = [str(i).zfill(2) for i in range(1, 13)]
g6_range = [str(i).zfill(2) for i in range(1, 25)]
g7_range = [str(i).zfill(2) for i in range(1, 33)]
g8_range = [str(i).zfill(5) for i in range(1, 100000)]
g9_range = [str(i).zfill(5) for i in range(1, 100000)]
g10_range = [str(i).zfill(5) for i in range(1, 100000)]
g10_range = [str(i).zfill(5) for i in range(1, 100000)]
g10_range = [str(i).zfill(5) for i in range(1, 100000)]
g10a_range = [str(i).zfill(5) for i in range(1, 100000)]
g10b_range = [str(i).zfill(5) for i in range(1, 100000)]
g10c_range = [str(i).zfill(5) for i in range(1, 100000)]
g10d_range = [str(i).zfill(5) for i in range(1, 100000)]
g10e_range = [str(i).zfill(5) for i in range(1, 100000)]
g10f_range = [str(i).zfill(5) for i in range(1, 100000)]
g10g_range = [str(i).zfill(5) for i in range(1, 100000)]
g11_range = [str(i).zfill(5) for i in range(1, 100000)]
g11a_range = [str(i).zfill(5) for i in range(1, 100000)]
g11b_range = [str(i).zfill(5) for i in range(1, 100000)]
g12_range = [str(i).zfill(5) for i in range(1, 100000)]
g12a_range = [str(i).zfill(5) for i in range(1, 100000)]
g12b_range = [str(i).zfill(5) for i in range(1, 100000)]
g12c_range = [str(i).zfill(5) for i in range(1, 100000)]
g13_range = [str(i).zfill(5) for i in range(1, 100000)]
g14_range = [str(i).zfill(5) for i in range(1, 100000)]
g15_list = list(range(1,3))

def validate_all_julats(row):
    row['JULAT_DR2'] = 1 if row['DR2'] in dr2_list else 0
    row['JULAT_G1'] = 1 if row['G1'] in g1_list else 0
    row['JULAT_G2'] = 1 if row['G2'] in g2_list else 0
    row['JULAT_G3'] = 1 if row['G3'] in g3_list else 0
    row['JULAT_G4A'] = 1 if row['G4a'] in g4a_list else 0
    row['JULAT_G4B'] = 1 if pd.isnull(row['G4b']) or 0.00 <= float(row['G4b']) <= 99999.00 else 0
    row['JULAT_G5A'] = 1 if pd.isnull(row['G5a']) or str(row['G5a']).zfill(5) == '00000' or str(row['G5a']).zfill(5) in g5a_range else 0     
    row['JULAT_G5B'] = 1 if pd.isnull(row['G5b']) or str(row['G5b']).zfill(2) == '00' or str(row['G5b']).zfill(2) in g5b_range else 0        
    row['JULAT_G6'] = 1 if pd.isnull(row['G6']) or str(row['G6']).zfill(2) == '00' or str(row['G6']).zfill(2) in g6_range else 0
    row['JULAT_G7'] = 1 if pd.isnull(row['G7']) or str(row['G7']).zfill(2) == '00' or str(row['G7']).zfill(2) in g7_range else 0     
    row['JULAT_G8'] = 1 if pd.isnull(row['G8']) or str(row['G8']).zfill(5) == '00000' or str(row['G8']).zfill(5) in g8_range else 0     
    row['JULAT_G9'] = 1 if pd.isnull(row['G9']) or str(row['G9']).zfill(5) == '00000' or str(row['G9']).zfill(5) in g9_range else 0
    row['JULAT_G10'] = 1 if pd.isnull(row['G10']) or str(row['G10']).zfill(5) == '00000' or str(row['G10']).zfill(5) in g10_range else 0       
    row['JULAT_G10a'] = 1 if pd.isnull(row['G10a']) or str(row['G10a']).zfill(5) == '00000' or str(row['G10a']).zfill(5) in g10a_range else 0    
    row['JULAT_G10b'] = 1 if pd.isnull(row['G10b']) or str(row['G10b']).zfill(5) == '00000' or str(row['G10b']).zfill(5) in g10b_range else 0
    row['JULAT_G10c'] = 1 if pd.isnull(row['G10c']) or str(row['G10c']).zfill(5) == '00000' or str(row['G10c']).zfill(5) in g10c_range else 0
    row['JULAT_G10d'] = 1 if pd.isnull(row['G10d']) or str(row['G10d']).zfill(5) == '00000' or str(row['G10d']).zfill(5) in g10d_range else 0
    row['JULAT_G10e'] = 1 if pd.isnull(row['G10e']) or str(row['G10e']).zfill(5) == '00000' or str(row['G10e']).zfill(5) in g10e_range else 0
    row['JULAT_G10f'] = 1 if pd.isnull(row['G10f']) or str(row['G10f']).zfill(5) == '00000' or str(row['G10f']).zfill(5) in g10f_range else 0
    row['JULAT_G10g'] = 1 if pd.isnull(row['G10g']) or str(row['G10g']).zfill(5) == '00000' or str(row['G10g']).zfill(5) in g10g_range else 0
    row['JULAT_G11'] = 1 if pd.isnull(row['G11']) or str(row['G11']).zfill(5) == '00000' or str(row['G11']).zfill(5) in g11_range else 0
    row['JULAT_G11A'] = 1 if pd.isnull(row['G11a']) or str(row['G11a']).zfill(5) == '00000' or str(row['G11a']).zfill(5) in g11a_range else 0
    row['JULAT_G11B'] = 1 if pd.isnull(row['G11b']) or str(row['G11b']).zfill(5) == '00000' or str(row['G11b']).zfill(5) in g11b_range else 0
    row['JULAT_G12'] = 1 if pd.isnull(row['G12']) or str(row['G12']).zfill(5) == '00000' or str(row['G12']).zfill(5) in g12_range else 0
    row['JULAT_G12A'] = 1 if pd.isnull(row['G12a']) or str(row['G12a']).zfill(5) == '00000' or str(row['G12a']).zfill(5) in g12a_range else 0
    row['JULAT_G12B'] = 1 if pd.isnull(row['G12b']) or str(row['G12b']).zfill(5) == '00000' or str(row['G12b']).zfill(5) in g12b_range else 0
    row['JULAT_G12C'] = 1 if pd.isnull(row['G12c']) or str(row['G12c']).zfill(5) == '00000' or str(row['G12c']).zfill(5) in g12c_range else 0
    row['JULAT_G13'] = 1 if pd.isnull(row['G13']) or str(row['G13']).zfill(5) == '00000' or str(row['G13']).zfill(5) in g13_range else 0
    row['JULAT_G14'] = 1 if pd.isnull(row['G14']) or str(row['G14']).zfill(5) == '00000' or str(row['G14']).zfill(5) in g14_range else 0
    row['JULAT_G15'] = 1 if row['G15'] in g15_list else 0
    
    return row

df = df.apply(validate_all_julats, axis=1)

In [4]:
# Define the given data
S20 = [1, 4, 5, 6]
S19 = "97000"

# Create a new DataFrame excluding the specified conditions
filtered_df = df[~df['S'].isin(S20) & (df['S19'] != S19)]

In [5]:
msic_excpt_T = df_msic[df_msic['SEKSYEN'] == 'T']['ITEM'].tolist()

msic_excpt_T

KeyError: 'ITEM'

In [8]:
masco_03b = df_masco.loc[(df_masco['KOD_MASCO'] >= '111101') & (df_masco['KOD_MASCO'] <= '962903'), 'KOD_MASCO'].tolist()

masco_03b

TypeError: Invalid comparison between dtype=int64 and str

In [7]:
df_masco.head(5)

Unnamed: 0,KOD_MASCO,DESCRIPTION_ENGLISH,DESCRIPTION_MALAY
0,11101,Pegawai Perubatan,Medical Officer
1,11102,Pegawai Pergigian,Dental officer
2,11103,Pegawai Farmasi,Pharmacy Officer
3,12101,Pegawai Rejimen Askar Melayu DiRaja (RAMD),Royal Malay Regiment Officer
4,12102,Pegawai Rejimen Renjer DiRaja (RRD),Royal Ranger Regiment Officer


In [5]:
def validate_S20(df):
    """
    This function performs the initial validation on the given dataframe based on the 'S20' column values.
    
    The 'S20' column in the dataframe represents some specific criterion in the dataset. 
    For the purpose of this project, only rows where the 'S20' column has values "2", "3", or "4" are considered valid.
    Rows with 'S20' values of "1", "5", or "6" are excluded as they are deemed irrelevant for further processing.
    
    Parameters:
    - df (pd.DataFrame): The input dataframe containing raw data. 
                         Expected to have a column named 'S20' with string values.
    
    Returns:
    - df : pd.DataFrame: A new dataframe containing only the rows where 'S20' has values "2", "3", or "4". 
                    This dataframe is a copy, ensuring the original data remains unaltered.
    
    Usage:
    Given a dataframe 'df', the function can be invoked as:
    >>> filtered_df = validate_S20(df)
    """
    
    # List of valid string values for the 'S20' column for this project
    valid_values = ["2", "3", "4"]
    
    # Filter the dataframe to include only rows with valid 'S20' string values
    # The .copy() ensures that a new dataframe is returned, keeping the original dataframe unaltered
    filtered_df = df[df['S20'].isin(valid_values)].copy()
    
    return filtered_df

filtered_df = validate_S20(df)

In [None]:
### Semakan Logik (1a) ###

msic_excpt_T = df_msic[df_msic['SEKSYEN'] == 'T']['ITEM'].tolist()

# Main function
def validate_condition_01a(row):
    if (row['S20'] in [2, 3, 4]) and (any(row['S19'] for row['S19'] in msic_excpt_T)):    
        return 1
    else:
        return 0

filtered_df['LOGIK_01a'] = filtered_df.apply(validate_condition_01a, axis=1)
filtered_df['LOGIK_01a'].value_counts()

In [None]:
### Semakan Logik (1b) ###

# Main function
def validate_condition_01b(row):
    if row['S20'] in [2, 3, 4] and row['S19'] in msic_excpt_T:
        return 1
    else:
        return 0
    
filtered_df['LOGIK_01b'] = filtered_df.apply(validate_condition_01b, axis=1)
filtered_df['LOGIK_01b'].value_counts()

In [20]:
filtered_df[['G1', 'G2', 'G4a']].sample(15)

Unnamed: 0,G1,G2,G4a
381,1,233102,4.0
5893,1,233102,4.0
1043,1,233102,4.0
1449,1,441906,4.0
51,1,234102,4.0
1078,1,541411,4.0
2710,1,524203,4.0
6220,1,921106,4.0
2717,1,411101,4.0
1308,1,921115,4.0


In [21]:
import pandas as pd

def apply_LOGIK_01_validation(df):
    """
    Applies the LOGIK_01 validation on the given dataframe.
    
    Validation rules:
    - If 'G1' != 1: Considered as True.
    - If 'G1' == 1: 
        - 'G2' and 'G3' must be empty (NaN).
        - 'G4' must have a value (not NaN).
        
    The result of the validation is stored in a new column named 'LOGIK_01'.
    
    Parameters:
    - df (pd.DataFrame): The input dataframe, which has passed the initial 'S20' validation.
    
    Returns:
    - pd.DataFrame: The dataframe with an added 'LOGIK_01' column containing the validation results.
    
    Usage:
    Given a dataframe 'df', the function can be invoked as:
    >>> df_with_LOGIK_01 = apply_LOGIK_01_validation(df)
    """
    
    # Define the validation logic for rows where G1 == 1
    condition_for_G1_equals_1 = (
        (df['G1'] == 1) & 
        (df['G2'].isna() | (df['G2'] == '')) & 
        (df['G3'].isna() | (df['G3'] == '')) & 
        (~df['G4a'].isna() & (df['G4a'] != ''))
    )
    
    # Combine the two conditions: rows with G1 values other than 1 are true, 
    # and rows with G1 == 1 are validated based on the above condition
    final_conditions = (df['G1'] != 1) | condition_for_G1_equals_1
    
    # Assign 1 for True and 0 for False based on the final conditions
    df['LOGIK_01'] = final_conditions.astype(int)
    
    return df

filtered_df = apply_LOGIK_01_validation(df)

In [22]:
df['LOGIK_01'].value_counts()

LOGIK_01
1    7357
Name: count, dtype: int64

In [None]:
### Semakan Logik (3a) ###

# Main function
def validate_condition_03a(row):
    if (any(row['G2'] for row['G2'] in masco_03b) and (any(row['G3'] for row['G3'] in msic_excpt_T))):    
        return 1
    else:
        return 0

filtered_df['LOGIK_03a'] = filtered_df.apply(validate_condition_03a, axis=1)
filtered_df['LOGIK_03a'].value_counts()

In [None]:
### Semakan Logik (3b) ###

# Main function
def validate_condition_03b(row):
    if row['G2'] in masco_03b and row['G3'] in msic_excpt_T:
        return 1
    else:
        return 0
    
filtered_df['LOGIK_03b'] = filtered_df.apply(validate_condition_03b, axis=1)
filtered_df['LOGIK_03b'].value_counts()

In [None]:
### Semakan Logik (4a) ###

# Main function
def validate_condition_04a(row):
    if not pd.isna(row['G4a']) and not pd.isna(row['G14']):
        return 1
    else:
        return 0

filtered_df['LOGIK_04a'] = filtered_df.apply(validate_condition_04a, axis=1)
filtered_df['LOGIK_04a'].value_counts()

In [None]:
# Main function
def validate_condition_04b_01(row):
    if row['G4a'] in [1, 2, 5] and pd.notna(row['G4b']):
        return 1
    else:
        return 0

filtered_df['LOGIK_04b_01'] = filtered_df.apply(validate_condition_04b_01, axis=1)
filtered_df['LOGIK_04b_01'].value_counts()

In [None]:
# Main function
def validate_condition_04b_02(row):
    if row['G4a'] in [3, 4, 6] and pd.isna(row['G4b']):
        return 1
    else:
        return 0

filtered_df['LOGIK_04b_02'] = filtered_df.apply(validate_condition_04b_02, axis=1)
filtered_df['LOGIK_04b_02'].value_counts()

In [None]:
### Semakan Logik (4b) ###

def validate_condition_04b(row):
    if row['G4a'] in [1, 2, 5] and pd.notna(row['G4b']) and pd.notna(row['G4b']):
        return 1  # Pass - Condition 1
    elif row['G4a'] in [3, 4, 6] and pd.notna(row['G4b']) and pd.isna(row['G4b']):
        return 2  # Fail - Condition 2
    else:
        return 3  # Fail - Condition 3

filtered_df['LOGIK_04b'] = filtered_df.apply(validate_condition_04b, axis=1)
filtered_df['LOGIK_04b'].value_counts()

In [None]:
df_04b_1 = filtered_df[filtered_df['LOGIK_04b'] == 1]
df_04b_2 = filtered_df[filtered_df['LOGIK_04b'] == 2]
df_04b_3 = filtered_df[filtered_df['LOGIK_04b'] == 3]

In [None]:
df_04b_3[['G4a', 'G4b']].sample(10)

In [None]:
### Semakan Logik (5) ###

# Main function
def validate_condition_05(row):
    if row['G4a'] in [1, 2, 3, 4, 6] and pd.isna(row['G5a']) and pd.isna(row['G5b']) \
    and not pd.isna(row['G6']) and not pd.isna(row['G7']):
        return 1
    else:
        return 0

filtered_df['LOGIK_05'] = filtered_df.apply(validate_condition_05, axis=1)
filtered_df['LOGIK_05'].value_counts()

In [None]:
df_05_1 = filtered_df[filtered_df['LOGIK_05'] == 1]
df_05_0 = filtered_df[filtered_df['LOGIK_05'] == 0]

In [None]:
df_05_0[['G4a', 'G5a', 'G5b', 'G6', 'G7']].sample(10)

In [None]:
### Semakan Logik (6) ###

# Main function
def validate_condition_06(row):
    if row['G4a'] == 5:
        required_columns = ['G5a', 'G5b', 'G6', 'G7']
        if all(pd.notna(row[col]) for col in required_columns):
            return 1  # Pass - Condition met
    return 0  # Fail - Condition not met

filtered_df['LOGIK_06'] = filtered_df.apply(validate_condition_06, axis=1)
filtered_df['LOGIK_06'].value_counts()

In [None]:
filtered_df['G12'].value_counts()

In [None]:
filtered_df[['G8', 'G9', 'G10', 'G11', 'G12']].sample(10)

In [None]:
### Semakan Logik (8) ###

# Main function
def validate_condition_08a(row):
    if row['G8'] == row['G9'] + row['G10'] + row['G11'] + row['G12']:
        return 1  # Pass - Condition met
    return 0  # Fail - Condition not met

filtered_df['LOGIK_08a'] = filtered_df.apply(validate_condition_08a, axis=1)
filtered_df['LOGIK_08a'].value_counts()

In [None]:
### Semakan Logik (8) ###

# Main function
def validate_condition_08b(row):
    g8 = row['G8']
    g9 = row['G9']
    g10 = row['G10']
    g11 = row['G11']
    g12 = row['G12']

    if isinstance(g8, float) and math.isnan(g8):
        return 0  # Fail - NaN value in column 'G8'
    
    if int(g8) == (int(g9) if pd.notna(g9) else 0) + \
                 (int(g10) if pd.notna(g10) else 0) + \
                 (int(g11) if pd.notna(g11) else 0) + \
                 (int(g12) if pd.notna(g12) else 0):
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_08b'] = filtered_df.apply(validate_condition_08b, axis=1)
filtered_df['LOGIK_08b'].value_counts()

In [None]:
### Semakan Logik (8) ###

# Main function
def validate_condition_08c(row):
    g8 = row['G8']
    
    if pd.isna(g8):
        return 1  # Pass - NaN value in column 'G8'

    if int(g8) > 2000:
        if row['PT'] in [3, 4]:
            return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_08c'] = filtered_df.apply(validate_condition_08c, axis=1)
filtered_df['LOGIK_08c'].value_counts()

In [None]:
### Semakan Logik (8) ###

# Main function
def validate_condition_08d(row):
    g8 = row['G8']
    
    if pd.isna(g8):
        return 1  # Pass - NaN value in column 'G8'

    if int(g8) > 20000:
        if row['G2'] in [1, 2, 3]:
            return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_08d'] = filtered_df.apply(validate_condition_08d, axis=1)
filtered_df['LOGIK_08d'].value_counts()

In [None]:
### Semakan Logik (8) ###

# Main function
def validate_condition_08e(row):
    g8 = row['G8']

    if pd.isna(g8):
        return 1  # Pass - NaN value in column 'G8'
    
    if int(g8) < 100:
        return 1  # Pass - Condition met

    return 0  # Fail - Condition not met

filtered_df['LOGIK_08e'] = filtered_df.apply(validate_condition_08e, axis=1)
filtered_df['LOGIK_08e'].value_counts()

In [None]:
### Semakan Logik (9) ###

# Main function
def validate_condition_09(row):
    if row['S20'] == 1 and row['G1'] == 1:
        if pd.notna(row['G9']):
            return 0  # Fail - Condition met, but G9 has a value
        else:
            return 1  # Pass - Condition met, G9 does not have a value
    else:
        return 1  # Pass - Conditions not met

filtered_df['LOGIK_09'] = filtered_df.apply(validate_condition_09, axis=1)
filtered_df['LOGIK_09'].value_counts()

In [None]:
### Semakan Logik (10) ###

# Main function
def validate_condition_10(row):
    g10 = row['G10']
    sum_g10 = sum(row[['G10a', 'G10b', 'G10c', 'G10d', 'G10e', 'G10f', 'G10g']].dropna())

    if pd.notna(g10) and g10 == sum_g10:
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_10'] = filtered_df.apply(validate_condition_10, axis=1)
filtered_df['LOGIK_10'].value_counts()

In [None]:
### Semakan Logik (11) ###

# Main function
def validate_condition_11(row):
    g11 = row['G11']
    sum_g11 = sum(row[['G11a', 'G11b']].dropna())

    if pd.notna(g11) and g11 == sum_g11:
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_11'] = filtered_df.apply(validate_condition_11, axis=1)
filtered_df['LOGIK_11'].value_counts()

In [None]:
### Semakan Logik (12) ###

# Main function
def validate_condition_12(row):
    g12 = row['G12']
    sum_g12 = sum(row[['G12a', 'G12b', 'G12c']].dropna())

    if pd.notna(g12) and g12 == sum_g12:
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_12'] = filtered_df.apply(validate_condition_12, axis=1)
filtered_df['LOGIK_12'].value_counts()

In [None]:
### Semakan Logik (13) ###

# Main function
def validate_condition_13(row):
    g14 = row['G14']
    sum_g14 = sum(row[['G8', 'G13']].dropna())

    if pd.notna(g14) and g14 == sum_g14:
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_13'] = filtered_df.apply(validate_condition_13, axis=1)
filtered_df['LOGIK_13'].value_counts()

In [None]:
### Semakan Logik (14) ###

# Main function
def validate_condition_14(row):
    g9 = row['G9']
    product_g9 = np.product(row[['G4b', 'G6', 'G7']].dropna().astype(float))

    if pd.notna(g9) and g9 == str(product_g9):
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_14'] = filtered_df.apply(validate_condition_14, axis=1)
filtered_df['LOGIK_14'].value_counts()

In [None]:
### Semakan Logik (15) ###

# Main function
def validate_condition_15(row):
    g9 = row['G9']
    product_g9 = np.product(row[['G4b', 'G7']].dropna().astype(int))

    if pd.notna(g9) and g9 == str(product_g9):
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_15'] = filtered_df.apply(validate_condition_15, axis=1)
filtered_df['LOGIK_15'].value_counts()

In [None]:
### Semakan Logik (16) ###

# Main function
def validate_condition_16(row):
    g9 = row['G9']
    product_g9 = np.product(row[['G4a', 'G5a']].dropna().astype(int))

    if pd.notna(g9) and g9 == str(product_g9):
        return 1  # Pass - Condition met
    
    return 0  # Fail - Condition not met

filtered_df['LOGIK_16'] = filtered_df.apply(validate_condition_16, axis=1)
filtered_df['LOGIK_16'].value_counts()

In [None]:
### Semakan Logik (17) ###

# Main function
def validate_condition_17(row):
    if pd.notna(row['G6']) and pd.notna(row['G7']):
        if pd.isna(row['G8']) or pd.isna(row['G14']):
            return 0  # Fail - Condition not met
        else:
            return 1  # Pass - Condition met
    else:
        return 1  # Pass - Condition met if 'G6' and 'G7' are NaN

filtered_df['LOGIK_17'] = filtered_df.apply(validate_condition_17, axis=1)
filtered_df['LOGIK_17'].value_counts()

In [None]:
### Semakan Logik (18a) ###

# Main function
def validate_condition_18a(row):
    if pd.notna(row['G6']) and int(row['G6']) < 6:
        return 0  # Fail - Condition not met
    else:
        return 1  # Pass - Condition met


filtered_df['LOGIK_18a'] = filtered_df.apply(validate_condition_18a, axis=1)
filtered_df['LOGIK_18a'].value_counts()

In [None]:
### Semakan Logik (18b) ###

# Main function
def validate_condition_18b(row):
    if pd.notna(row['G7']) and int(row['G7']) < 20:
        return 0  # Fail - Condition not met
    else:
        return 1  # Pass - Condition met


filtered_df['LOGIK_18b'] = filtered_df.apply(validate_condition_18b, axis=1)
filtered_df['LOGIK_18b'].value_counts()

In [None]:
### Semakan Logik (19) ###

# Main function
def validate_condition_19(row):
    if pd.notna(row['G7']) and int(row['G7']) < 20:
        return 0  # Fail - Condition not met
    else:
        return 1  # Pass - Condition met


filtered_df['LOGIK_19'] = filtered_df.apply(validate_condition_19, axis=1)
filtered_df['LOGIK_19'].value_counts()

In [None]:
### Semakan Logik (20) ###

# Main function
def validate_condition_20(row):
    if pd.notna(row['G7']) and int(row['G7']) < 20:
        return 0  # Fail - Condition not met
    else:
        return 1  # Pass - Condition met


filtered_df['LOGIK_20'] = filtered_df.apply(validate_condition_20, axis=1)
filtered_df['LOGIK_20'].value_counts()

In [None]:
end_time = time.time()
execution_time = end_time - start_time

minutes = int(execution_time // 60)
seconds = int(execution_time % 60)

print("Total execution time: {} minutes {} seconds".format(minutes, seconds))

In [None]:
def print_column_value_counts(df):
    for column in df.columns:
        if column.startswith('JULAT'):
            value_counts = df[column].value_counts()
            total_count = value_counts.sum()
            print(f"Column: {column}")
            for value, count in value_counts.items():
                percentage = (count / total_count) * 100
                result = value  # Display the value as is
                print(f"Result: {result}, Count: {count}, Percentage: {percentage:.2f}%")
            print()  # Add an empty line between columns

print_column_value_counts(df)

In [None]:
# Save the dataframe as a CSV file
# df.to_csv(f'{csv_file_path}_new.csv', index=False)