## Fix data

In [None]:
import pandas as pd
import numpy as np

# Source workbook; a relative path, so it is resolved against the current working directory.
INPUT_FILE    = r"output_split_contract_period_raw.xlsx"
# Destination workbook that receives the repaired sheets.
OUTPUT_FILE   = r"D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx"
# Names of the two sheets that are read from INPUT_FILE.
SHEET_MONTHLY = "Monthly"
SHEET_DAILY   = "Daily"

def parse_number(val):
    """Parse a number written in EU or US notation into a float.

    Rules applied to text input (after removing spaces):

    * both ``,`` and ``.`` present: whichever occurs last is the decimal
      mark, the other one is a thousands separator
      (``"1.234,56"`` and ``"1,234.56"`` both give ``1234.56``);
    * a single ``,``: it is the decimal mark (``"12,5"`` -> ``12.5``);
    * a single ``.``: it is the decimal mark (``"1.234"`` -> ``1.234``);
    * the same separator repeated: it can only be a thousands separator
      (``"1,234,567"`` and ``"1.234.567"`` -> ``1234567.0``).

    Args:
        val: A scalar cell value (number, string, or missing value).

    Returns:
        The parsed float, or NaN when the value is missing or cannot be
        interpreted as a number.
    """
    if pd.isna(val):
        return np.nan
    if isinstance(val, (int, float, np.number)):
        return float(val)

    s = str(val).strip().replace(' ', '')
    n_commas = s.count(',')
    n_dots = s.count('.')

    if n_commas and n_dots:
        if s.rfind(',') > s.rfind('.'):  # 1.234,56 -> 1234.56
            candidate = s.replace('.', '').replace(',', '.')
        else:                            # 1,234.56 -> 1234.56
            candidate = s.replace(',', '')
    elif n_commas > 1:                   # 1,234,567 -> 1234567
        candidate = s.replace(',', '')
    elif n_commas == 1:                  # 12,5 -> 12.5
        candidate = s.replace(',', '.')
    elif n_dots > 1:                     # 1.234.567 -> 1234567
        candidate = s.replace('.', '')
    else:
        candidate = s

    try:
        return float(candidate)
    except ValueError:
        # float() rejects the text; let pandas have a last try and fall back to NaN.
        return pd.to_numeric(s, errors='coerce')

def _safe_parse_column(series):
    """Return *series* as floats.

    Object-dtype columns are parsed cell by cell with ``parse_number``;
    every other dtype goes through ``pd.to_numeric`` with unparseable
    values coerced to NaN.
    """
    holds_objects = series.dtype == 'O'
    if not holds_objects:
        numeric = pd.to_numeric(series, errors='coerce')
        return numeric.astype(float)
    return series.apply(parse_number)

def count_digits(x):
    """Count the digits of the integer part of ``abs(x)``.

    Args:
        x: A scalar that can be converted with ``float()``.

    Returns:
        The number of digits as an int (``0.5`` counts as one digit because
        its integer part is ``0``), or NaN when *x* is missing, not
        convertible to a number, or infinite.
    """
    try:
        if pd.isna(x):
            return np.nan
        return len(str(int(abs(float(x)))))
    except (TypeError, ValueError, OverflowError):
        # TypeError/ValueError: not a number; OverflowError: int(inf).
        return np.nan

def _append_reason(s_old, new_reason):
    """Join the previous reason and the new one with ``' | '``.

    A previous value that is not a string, or that is empty or only
    whitespace, is ignored and *new_reason* is returned on its own.
    """
    previous = s_old if isinstance(s_old, str) else ""
    if not previous.strip():
        return new_reason
    return " | ".join((previous, new_reason))

# (digit count, factor) ladder shared by the daily step2b and step3 passes.
_DAILY_RESCALE_FACTORS = ((9, 10000), (8, 1000), (7, 100), (6, 10))


def _price_digits(df):
    """Return the integer-digit count of every ``CuryUnitPrice_fix`` value."""
    return df["CuryUnitPrice_fix"].apply(count_digits)


def _append_price_reason(df, mask, reason):
    """Append *reason* to ``price_fix_reason`` on the rows selected by *mask*."""
    df.loc[mask, "price_fix_reason"] = [
        _append_reason(old, reason) for old in df.loc[mask, "price_fix_reason"]
    ]


def _shift_scale_to_area(df, mask, factor):
    """Divide the price by *factor* and multiply the area by it on *mask* rows."""
    df.loc[mask, "CuryUnitPrice_fix"] /= factor
    df.loc[mask, "BuildingArea_fix"] *= factor


def _multiply_price(df, mask, factor, reason):
    """Multiply the price by *factor* on *mask* rows, set the reason, return *mask*."""
    if mask.any():
        df.loc[mask, "CuryUnitPrice_fix"] *= factor
        df.loc[mask, "price_fix_reason"] = reason
    return mask


def _fix_building_area(df):
    """Create ``BuildingArea_fix`` / ``area_fix_reason`` on *df* (in place).

    Rows whose area equals 1 receive the largest area (> 1) recorded for the
    same ``UnitID``, when such a value exists.
    """
    if "BuildingArea" in df.columns:
        df["BuildingArea_fix"] = df["BuildingArea"]
    else:
        df["BuildingArea_fix"] = np.nan
    df["area_fix_reason"] = ""

    if "UnitID" not in df.columns or "BuildingArea" not in df.columns:
        return

    area_ref = (
        df[df["BuildingArea"] > 1]
        .groupby("UnitID", dropna=False)["BuildingArea"]
        .max()
        .to_dict()
    )
    mask_area_one = (df["BuildingArea"] == 1) & df["UnitID"].isin(area_ref.keys())
    if mask_area_one.any():
        units = df.loc[mask_area_one, "UnitID"]
        df.loc[mask_area_one, "BuildingArea_fix"] = units.map(area_ref)
        df.loc[mask_area_one, "area_fix_reason"] = units.map(
            lambda u: f"copy_area_from_same_unit({u})"
        )


def _fix_monthly_prices(df):
    """Apply the monthly price-scale rules to *df* (in place).

    Rules are selected by the digit count of the price as it was *before*
    any rule ran; ``done`` keeps a row from being rescaled twice by the
    one-shot rules. The final post-check loop re-counts digits each pass.
    """
    dp = _price_digits(df)
    done = pd.Series(False, index=df.index)
    # Both masks are frozen here, before any rule rewrites the area.
    area_eq1 = df["BuildingArea_fix"] == 1
    area_ne1 = ~area_eq1

    # 9- and 8-digit prices.
    for digits, factor in ((9, 1000), (8, 100)):
        wide = (dp == digits) & ~done

        unit_area = wide & area_eq1
        if unit_area.any():
            _shift_scale_to_area(df, unit_area, factor)
            df.loc[unit_area, "price_fix_reason"] = (
                f"{digits}-digit ‚Üí price √∑{factor} & area √ó{factor} (area=1)"
            )
            done |= unit_area

        real_area = wide & area_ne1
        if real_area.any():
            # A zero area becomes NaN so the division cannot produce inf.
            denom = df.loc[real_area, "BuildingArea_fix"].replace(0, np.nan)
            df.loc[real_area, "CuryUnitPrice_fix"] = (
                df.loc[real_area, "CuryUnitPrice_fix"] / denom
            )
            df.loc[real_area, "price_fix_reason"] = (
                f"{digits}-digit ‚Üí price √∑ area (area‚â†1)"
            )
            done |= real_area

    # 7-digit prices.
    seven = dp == 7
    seven_real = seven & area_ne1 & ~done
    if seven_real.any():
        df.loc[seven_real, "CuryUnitPrice_fix"] /= 10
        df.loc[seven_real, "price_fix_reason"] = "√∑10 price (area‚â†1 & 7-digit)"
        done |= seven_real

    seven_unit = seven & area_eq1 & ~done
    if seven_unit.any():
        _shift_scale_to_area(df, seven_unit, 100)
        df.loc[seven_unit, "price_fix_reason"] = "√∑100 price & √ó100 area (7-digit, area=1)"
        done |= seven_unit

    # Area digits are counted only now, after the rules above may have rescaled the area.
    da = df["BuildingArea_fix"].apply(count_digits)

    big_unit_area = (da > 4) & area_eq1 & ~done
    for digits, factor in ((1, 1000), (2, 100), (3, 10)):
        done |= _multiply_price(
            df,
            (dp == digits) & big_unit_area,
            factor,
            f"√ó{factor} (area>4 & price {digits}-digit, area=1)",
        )

    small_area = (da < 3) & ~done
    for digits, factor in ((4, 100), (5, 10)):
        done |= _multiply_price(
            df,
            (dp == digits) & small_area,
            factor,
            f"√ó{factor} (area<3 & price {digits}-digit)",
        )

    # Baseline for everything still untouched. The label is built from the
    # factor actually applied, so the 2-digit case reports 10000.
    remaining = ~done
    for digits, factor in ((1, 100000), (2, 10000), (3, 1000)):
        _multiply_price(
            df,
            (dp == digits) & remaining,
            factor,
            f"√ó{factor} (baseline {digits}-digit)",
        )

    # Monthly post-check: keep dividing by the lease length until <= 6 digits.
    if "LeaseDurationMonths" in df.columns:
        months = df["LeaseDurationMonths"]
        # Requiring > 1 guarantees every pass shrinks the price, so the loop ends.
        valid_ldm = months.replace(0, np.nan).notna() & (months > 1)
    else:
        valid_ldm = pd.Series(False, index=df.index)

    while True:
        mask_post = (_price_digits(df) > 6) & valid_ldm
        if not mask_post.any():
            break
        df.loc[mask_post, "CuryUnitPrice_fix"] = (
            df.loc[mask_post, "CuryUnitPrice_fix"] / df.loc[mask_post, "LeaseDurationMonths"]
        )
        _append_price_reason(
            df, mask_post, "monthly post-check loop: >6-digit ‚Üí price √∑ LeaseDurationMonths"
        )


def _rescale_daily_by_digits(df, mask, dp, step):
    """Move a power of ten from price to area on *mask* rows, chosen by digit count.

    *dp* is the digit count the decision is based on; *step* is the label
    written into the reason text ("step2b" or "step3").
    """
    if not mask.any():
        return
    for digits, factor in _DAILY_RESCALE_FACTORS:
        rows = mask & (dp == digits)
        if rows.any():
            _shift_scale_to_area(df, rows, factor)
            _append_price_reason(
                df,
                rows,
                f"daily {step}: {digits}-digit ‚Üí price √∑{factor} & area √ó{factor}",
            )


def _fix_daily_prices(df):
    """Apply the daily price-scale rules to *df* (in place).

    The digit count is recomputed before every step because each step may
    already have shrunk the price.
    """
    # The area is not modified before step2b, so this mask holds for steps 1-2b.
    area_gt1 = df["BuildingArea_fix"] > 1
    valid_area = df["BuildingArea_fix"].replace(0, np.nan).notna()

    # Step 1: price / area.
    dp = _price_digits(df)
    mask1 = (dp > 6) & area_gt1 & valid_area
    if mask1.any():
        df.loc[mask1, "CuryUnitPrice_fix"] = (
            df.loc[mask1, "CuryUnitPrice_fix"] / df.loc[mask1, "BuildingArea_fix"]
        )
        _append_price_reason(df, mask1, "daily step1: >6-digit & area>1 ‚Üí price √∑ area")

    # Step 2a: price / lease days. Skipped entirely when the column is absent
    # (previously this raised KeyError).
    dp = _price_digits(df)
    if "LeaseDurationDays" in df.columns:
        days = df["LeaseDurationDays"]
        mask2a = (dp > 6) & area_gt1 & days.replace(0, np.nan).notna() & (days > 2)
    else:
        mask2a = pd.Series(False, index=df.index)
    if mask2a.any():
        df.loc[mask2a, "CuryUnitPrice_fix"] = (
            df.loc[mask2a, "CuryUnitPrice_fix"] / df.loc[mask2a, "LeaseDurationDays"]
        )
        _append_price_reason(
            df, mask2a, "daily step2a: >6-digit & area>1 & LDD>2 ‚Üí price √∑ LDD"
        )

    # Step 2b: rows step 2a did not handle. The mask requires more than six
    # digits, so the 6-digit rung of the ladder never matches here.
    dp = _price_digits(df)
    mask2b = (dp > 6) & area_gt1 & ~mask2a
    _rescale_daily_by_digits(df, mask2b, dp, "step2b")

    # Step 3: final check on every row with six or more digits.
    dp = _price_digits(df)
    _rescale_daily_by_digits(df, dp >= 6, dp, "step3")


def fix_one_sheet(df, period_label):
    """Repair ``BuildingArea`` and ``CuryUnitPrice`` on one sheet.

    The input is not modified. The returned copy carries four extra columns:
    ``BuildingArea_fix``, ``area_fix_reason``, ``CuryUnitPrice_fix`` and
    ``price_fix_reason``. Reason columns are empty strings on untouched rows.

    Args:
        df: Sheet contents.
        period_label: ``"Monthly"`` or ``"Daily"`` (case-insensitive); selects
            the price rule set. Any other label leaves prices unchanged.

    Returns:
        A new DataFrame with the repaired columns added.
    """
    df = df.copy()

    for col in ["BuildingArea", "CuryUnitPrice", "LeaseDurationDays", "LeaseDurationMonths"]:
        if col in df.columns:
            df[col] = _safe_parse_column(df[col])

    _fix_building_area(df)
    n_area_fixed = (df["area_fix_reason"] != "").sum()
    print(f" [{period_label}] BuildingArea diperbaiki: {n_area_fixed} baris.")

    if "CuryUnitPrice" in df.columns:
        df["CuryUnitPrice_fix"] = df["CuryUnitPrice"]
    else:
        df["CuryUnitPrice_fix"] = np.nan
    df["price_fix_reason"] = ""

    period = period_label.lower()
    if period == "monthly":
        _fix_monthly_prices(df)
        n_price_fixed = (df["price_fix_reason"] != "").sum()
        print(f" [Monthly] Skala harga diterapkan: {n_price_fixed} baris.")
    elif period == "daily":
        _fix_daily_prices(df)
        n_price_fixed = (df["price_fix_reason"] != "").sum()
        print(f" [Daily] Skala harga diterapkan: {n_price_fixed} baris.")

    n_area_fixed = (df["area_fix_reason"] != "").sum()
    n_price_fixed = (df["price_fix_reason"] != "").sum()
    print(f"[{period_label}] Total baris: {len(df)} | AreaFix: {n_area_fixed} | PriceFix: {n_price_fixed}\n")
    return df

In [None]:
# EXECUTION
print("Memulai perbaikan BuildingArea dan CuryUnitPrice...\n")

# Load both sheets with a single call; a list of names yields a dict keyed by sheet name.
raw_sheets = pd.read_excel(INPUT_FILE, sheet_name=[SHEET_MONTHLY, SHEET_DAILY])
monthly_df = raw_sheets[SHEET_MONTHLY]
daily_df = raw_sheets[SHEET_DAILY]

monthly_fixed = fix_one_sheet(monthly_df, "Monthly")
daily_fixed = fix_one_sheet(daily_df, "Daily")

Memulai perbaikan BuildingArea dan CuryUnitPrice...

 [Monthly] BuildingArea diperbaiki: 517 baris.
 [Monthly] Skala harga diterapkan: 2209 baris.
[Monthly] Total baris: 3914 | AreaFix: 517 | PriceFix: 2209

 [Daily] BuildingArea diperbaiki: 37 baris.
 [Daily] Skala harga diterapkan: 2768 baris.
[Daily] Total baris: 2773 | AreaFix: 37 | PriceFix: 2768



In [None]:

# Write both repaired sheets into a fresh workbook (mode "w" replaces any existing file).
fixed_sheets = {"Monthly_Fixed": monthly_fixed, "Daily_Fixed": daily_fixed}
with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl", mode="w") as writer:
    for fixed_name, fixed_frame in fixed_sheets.items():
        fixed_frame.to_excel(writer, index=False, sheet_name=fixed_name)
print(f"Selesai! File bersih disimpan sebagai:\n {OUTPUT_FILE}")

Selesai! File bersih disimpan sebagai:
 D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx


In [None]:


def _apply_fixes_inplace(df: pd.DataFrame) -> pd.DataFrame:
    """Fold the ``*_fix`` columns back into their base columns.

    Despite the name, the input frame is left untouched; a modified copy is
    returned.

    - ``CuryUnitPrice`` is overwritten with ``CuryUnitPrice_fix`` (when
      present) and ``CuryUnitPrice_fix`` is dropped.
    - ``BuildingArea`` is overwritten with ``BuildingArea_fix`` (when
      present) and ``BuildingArea_fix`` is dropped.
    - Every column whose name ends in ``_fix_reason`` is dropped, which
      covers ``area_fix_reason`` and ``price_fix_reason``.
    """
    result = df.copy()

    # Price first, then area. Assigning keeps an existing base column in its
    # position and appends it at the end when it did not exist yet.
    for base in ("CuryUnitPrice", "BuildingArea"):
        fixed = base + "_fix"
        if fixed in result.columns:
            result[base] = result[fixed]
            result = result.drop(columns=[fixed])

    reason_cols = [c for c in result.columns if str(c).endswith("_fix_reason")]
    if reason_cols:
        result = result.drop(columns=reason_cols)

    return result

# Collapse the *_fix columns, then overwrite the workbook with the final sheets.
monthly_fixed_final = _apply_fixes_inplace(monthly_fixed)
daily_fixed_final = _apply_fixes_inplace(daily_fixed)

final_sheets = {
    "Monthly_Fixed": monthly_fixed_final,
    "Daily_Fixed": daily_fixed_final,
}
with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl", mode="w") as writer:
    for final_name, final_frame in final_sheets.items():
        final_frame.to_excel(writer, index=False, sheet_name=final_name)

print("\nKolom *_fix telah diterapkan, kolom *_fix dihapus, dan kolom reason dihapus.")
print(f"File final tersimpan ulang di:\n {OUTPUT_FILE}")



Kolom *_fix telah diterapkan, kolom *_fix dihapus, dan kolom reason dihapus.
File final tersimpan ulang di:
 D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx


In [None]:
import pandas as pd
import os


# Split the Monthly_Fixed sheet into a feature sheet (without the target
# column) and a target sheet (ID columns + target), stored in the same workbook.
OUTPUT_FILE     = r"D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx"
SHEET_MONTHLYFX = "Monthly_Fixed"
SHEET_INPUT     = "Monthly_Input"
SHEET_TARGET    = "Monthly_Target"
TARGET_COL      = "CuryUnitPrice_fix"

if not os.path.exists(OUTPUT_FILE):
    raise FileNotFoundError(f"File tidak ditemukan: {OUTPUT_FILE}\nPastikan proses cleaning sebelumnya sudah menghasilkan file ini.")

try:
    monthly_fixed = pd.read_excel(OUTPUT_FILE, sheet_name=SHEET_MONTHLYFX)
except Exception as e:
    # Chain the original error so the real cause stays visible in the traceback.
    raise RuntimeError(f"Gagal membaca sheet '{SHEET_MONTHLYFX}' dari {OUTPUT_FILE}. Detail: {e}") from e

if TARGET_COL not in monthly_fixed.columns:
    # The *_fix column may already have been folded into the base column.
    alt_col = "CuryUnitPrice"
    if alt_col in monthly_fixed.columns:
        print(f"‚ö†Ô∏è Kolom '{TARGET_COL}' tidak ditemukan. Memakai '{alt_col}' sebagai target sementara.")
        target_series = monthly_fixed[alt_col].copy()
        TARGET_COL = alt_col
    else:
        raise KeyError(f"Kolom target '{TARGET_COL}' dan fallback 'CuryUnitPrice' tidak ditemukan di '{SHEET_MONTHLYFX}'.")

else:
    target_series = monthly_fixed[TARGET_COL].copy()

# Identifier columns that let a target row be matched back to its input row.
id_cols = [c for c in ["UnitID", "ContractID", "MCNbr", "CustomerID"] if c in monthly_fixed.columns]
if not id_cols:
    # No natural key available: fall back to the row position.
    monthly_fixed = monthly_fixed.reset_index().rename(columns={"index": "RowID"})
    id_cols = ["RowID"]

monthly_input = monthly_fixed.drop(columns=[TARGET_COL], errors="ignore").copy()
monthly_target = monthly_fixed[id_cols].copy()
monthly_target[TARGET_COL] = target_series.values

# Append mode with sheet replacement: only the three Monthly sheets are
# rewritten, so other sheets already in the workbook (e.g. "Daily_Fixed",
# which a later cell reads from this file) are preserved. Mode "w" would
# recreate the workbook and drop them.
with pd.ExcelWriter(
    OUTPUT_FILE,
    engine="openpyxl",
    mode="a",
    if_sheet_exists="replace",
) as writer:
    monthly_fixed.to_excel(writer, index=False, sheet_name=SHEET_MONTHLYFX)
    monthly_input.to_excel(writer, index=False, sheet_name=SHEET_INPUT)
    monthly_target.to_excel(writer, index=False, sheet_name=SHEET_TARGET)

print("‚úÖ Berhasil! File ditimpa dengan sheet:")
print(f"  - {SHEET_MONTHLYFX} (tetap)")
print(f"  - {SHEET_INPUT} (input Monthly tanpa target)")
print(f"  - {SHEET_TARGET} (ID + target Monthly)")
print(f"üìÑ Lokasi: {OUTPUT_FILE}")


⚠️ Kolom 'CuryUnitPrice_fix' tidak ditemukan. Memakai 'CuryUnitPrice' sebagai target sementara.
✅ Berhasil! File ditimpa dengan sheet:
  - Monthly_Fixed (tetap)
  - Monthly_Input (input Monthly tanpa target)
  - Monthly_Target (ID + target Monthly)
📄 Lokasi: D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx


In [None]:

import pandas as pd

file_path = "D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx"

sheets_to_process = ["Monthly_Fixed", "Daily_Fixed"]

# Character ranges of UnitID (as text) that feed each derived column.
unit_id_parts = {
    "Building": slice(0, 3),
    "UnitArea": slice(3, 6),
    "UnitFloor": slice(6, 8),
    "UnitNum": slice(8, None),
}

for sheet in sheets_to_process:
    print(f"Memproses sheet: {sheet}")

    df = pd.read_excel(file_path, sheet_name=sheet)

    if "UnitID" not in df.columns:
        print(f"‚ö†Ô∏è Kolom 'UnitID' tidak ditemukan di sheet {sheet}, dilewati.")
        continue

    # Convert once, slice four times. UnitID itself stays in the sheet.
    unit_id_text = df["UnitID"].astype(str)
    for part_name, part_range in unit_id_parts.items():
        df[part_name] = unit_id_text.str[part_range]

    # Append mode + replace rewrites just this sheet and keeps the others.
    writer_options = dict(engine="openpyxl", mode="a", if_sheet_exists="replace")
    with pd.ExcelWriter(file_path, **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet, index=False)

    print(f"Sheet {sheet} berhasil diperbarui.\n")

print("Semua sheet selesai diproses.")

Memproses sheet: Monthly_Fixed
Sheet Monthly_Fixed berhasil diperbarui.

Memproses sheet: Daily_Fixed
Sheet Daily_Fixed berhasil diperbarui.

Semua sheet selesai diproses.


In [None]:
import pandas as pd
file_path = "D:/DATA SKRIPSI/kontrak_sewa_bersih_clustered.xlsx"

sheets_to_process = ["Monthly_Fixed_clustered", "Daily_Fixed_clustered"]

for sheet in sheets_to_process:
    df = pd.read_excel(file_path, sheet_name=sheet)

    # Rebuild UnitID by concatenating its four parts as text, in order.
    # NOTE(review): this assumes the parts come back from Excel as text; a
    # part that was stored as a number would lose leading zeros - confirm.
    unit_id = df['Building'].astype(str)
    for part_name in ('UnitArea', 'UnitFloor', 'UnitNum'):
        unit_id = unit_id + df[part_name].astype(str)
    df['UnitID'] = unit_id

    # Append mode + replace rewrites just this sheet and keeps the others.
    writer_options = dict(engine="openpyxl", mode="a", if_sheet_exists="replace")
    with pd.ExcelWriter(file_path, **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet, index=False)

    print(f"Sheet {sheet} berhasil diperbarui.\n")

print("Semua sheet selesai diproses.")

## nanti

In [None]:
import pandas as pd
import numpy as np

# Workbook inspected by check_sheet().
PATH = r"D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx"
# Sheets of PATH that get a sanity report.
SHEETS = ["Monthly_Fixed", "Daily_Fixed"]

def pick_col(df, name):
    """Return ``name + "_fix"`` when *df* has that column, otherwise *name*."""
    fixed_name = name + "_fix"
    if fixed_name in df.columns:
        return fixed_name
    return name

def price_per_m2_per_period(df, period_label, area_col, price_col):
    """Return price divided by (area x lease duration) for every row.

    For labels starting with "monthly" (case-insensitive) the duration is
    ``LeaseDurationMonths``; for any other label it is
    ``LeaseDurationMonths * 30.4375``. A zero denominator yields NaN.
    """
    area = df[area_col]
    duration = df["LeaseDurationMonths"]
    is_monthly = period_label.lower().startswith("monthly")
    if not is_monthly:  # daily
        duration = duration * 30.4375
    safe_denominator = (area * duration).replace({0: np.nan})
    return df[price_col] / safe_denominator

def iqr_flags(s):
    """Flag values lying more than 1.5 x IQR below Q1 or above Q3."""
    first_quartile = s.quantile(0.25)
    third_quartile = s.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = s < first_quartile - 1.5 * spread
    too_high = s > third_quartile + 1.5 * spread
    return too_low | too_high

def check_sheet(name):
    """Print a sanity report for one sheet of the workbook at ``PATH``.

    The report covers area validity, area variation per ``UnitID``, price
    statistics, price digit counts outside the expected range, and the price
    per m2 per period with global IQR outliers and anomalies relative to
    the per-unit median.

    Args:
        name: Sheet name. Names starting with "monthly" (case-insensitive)
            use the monthly digit range and duration; all others use the
            daily ones.

    Returns:
        None. Everything is printed.
    """
    print(f"\n================= {name} =================")

    df = pd.read_excel(PATH, sheet_name=name)
    # Prefer the *_fix columns when the sheet still carries them.
    area_col  = pick_col(df, "BuildingArea")
    price_col = pick_col(df, "CuryUnitPrice")

    df[area_col]  = pd.to_numeric(df[area_col], errors="coerce")
    df[price_col] = pd.to_numeric(df[price_col], errors="coerce")

    n = len(df)
    print(f"Rows: {n}")

    area = df[area_col]
    area_issues = {
        "NaN_area":           int(area.isna().sum()),
        "area<=0":            int((area<=0).sum()),
        "area==1":            int((area==1).sum()),
        "area<1.0 & >0":      int(((area<1) & (area>0)).sum()),
    }
    print("\n[BuildingArea sanity]")
    print(f"NaN: {area_issues['NaN_area']}, <=0: {area_issues['area<=0']}, ==1: {area_issues['area==1']}, (0..1): {area_issues['area<1.0 & >0']}")
    print("Area stats (valid):")
    print(area.replace([np.inf,-np.inf], np.nan).dropna().describe(percentiles=[.01,.05,.5,.95,.99]).to_string())

    if "UnitID" in df.columns:
        # Units whose rows disagree on the area, most distinct values first.
        area_var = (df
            .groupby("UnitID")[area_col]
            .agg(n="count", uniq="nunique", std="std", min="min", max="max")
            .sort_values("uniq", ascending=False))
        multi_area_units = area_var[area_var["uniq"]>1]
        print(f"\nUnitID dengan variasi area >1 nilai: {len(multi_area_units)}")
        if len(multi_area_units) > 0:
            print(multi_area_units.head(10).to_string())

    price = df[price_col]
    print("\n[CuryUnitPrice sanity]")
    print(price.replace([np.inf,-np.inf], np.nan).dropna().describe(percentiles=[.01,.05,.5,.95,.99]).to_string())

    def count_digits(x):
        # Length of the integer part as text. No abs() is taken here, so a
        # minus sign counts as a character; NaN and inf yield NaN.
        try:
            return len(str(int(float(x))))
        except (TypeError, ValueError, OverflowError):
            return np.nan
    digits = price.apply(count_digits)
    # Expected width: 5-7 digits for monthly sheets, 4-6 for the others.
    if name.lower().startswith("monthly"):
        digit_bad = (digits<5) | (digits>7)
    else:
        digit_bad = (digits<4) | (digits>6)
    print(f"Digit out-of-range: {int(digit_bad.sum())} dari {n} baris")

    ppm = price_per_m2_per_period(df, name, area_col, price_col)
    df["Price_per_m2_per_period_norm"] = ppm

    valid_ppm = ppm.replace([np.inf,-np.inf], np.nan).dropna()
    print("\n[Price per m2 per period (normalized) stats]")
    if len(valid_ppm) == 0:
        print("Semua NaN ‚Äî cek kolom durasi atau area.")
    else:
        print(valid_ppm.describe(percentiles=[.01,.05,.5,.95,.99]).to_string())

        # Share of global IQR outliers.
        flags = iqr_flags(valid_ppm)
        print(f"IQR outliers (global): {int(flags.sum())}/{len(valid_ppm)} = {(100*flags.mean()):.2f}%")

        # Relative threshold per UnitID (0.5x .. 2x the unit median).
        if "UnitID" in df.columns:
            unit_median = df.groupby("UnitID")["Price_per_m2_per_period_norm"].transform("median")
            anom_low  = df["Price_per_m2_per_period_norm"] < 0.5*unit_median
            anom_high = df["Price_per_m2_per_period_norm"] > 2.0*unit_median
            rel_anom  = (anom_low | anom_high) & df["Price_per_m2_per_period_norm"].notna()
            print(f"Relative anomalies vs UnitID median: {int(rel_anom.sum())}/{n} = {(100*rel_anom.mean()):.2f}%")

            if rel_anom.any():
                cols = ["UnitID", area_col, price_col, "LeaseDurationMonths", "Price_per_m2_per_period_norm"]
                cols = [c for c in cols if c in df.columns]
                print("\nContoh 10 anomali relatif:")
                print(df.loc[rel_anom, cols].head(10).to_string(index=False))

# Print the sanity report for every sheet listed in SHEETS.
for sh in SHEETS:
    check_sheet(sh)

print("\n‚úÖ Cek normalisasi selesai.")



Rows: 3914

[BuildingArea sanity]
NaN: 0, <=0: 0, ==1: 0, (0..1): 0
Area stats (valid):
count    3.914000e+03
mean     2.041844e+04
std      1.495034e+05
min      1.000000e+01
1%       1.000000e+01
5%       1.000000e+01
50%      6.640000e+02
95%      2.853515e+04
99%      2.984528e+05
max      1.946925e+06

UnitID dengan variasi area >1 nilai: 195
                 n  uniq            std      min      max
UnitID                                                   
0PW000LG000001  27    15  148344.886716  1407864  1773698
0PC000LG000031   9     4    3451.181746       10     6903
0TG00003000001   7     4   24463.182137     8545    69331
0PC00002000053   5     3    2967.956418       10     7144
0PE000UG000019   6     3  698011.610439    36297  1946925
0PW000UG000020   4     3    7847.316442       10    15849
0PW000UG000030   4     3    7903.933557     7917    24452
0TG00005000003   3     3    6632.025508    17439    29062
0PC00004000022   2     2    7443.713086      685    11212
0LC00K02000

## File bersih

In [None]:
import pandas as pd
from pathlib import Path

INPUT_FILE = r"D:/DATA SKRIPSI/anomali_diperbaiki_periode_unit.xlsx"
OUTPUT_FILE = r"D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx"

# Helper/diagnostic columns to remove, plus the two base columns that the
# *_fix columns are renamed to below.
cols_to_drop = [
    "anom_low", "anom_high", "is_anomaly_unit", "unit_median",
    "anomaly_reason", "Price_per_m2_per_month", "unit_median_fix",
    "is_anomaly_after_fix", "Price_per_m2_per_month_fix",
    "Price_per_m2_per_period", "Price_per_m2_per_period_fix",
    "fix_reason",
    "BuildingArea", "CuryUnitPrice"
]

rename_map = {
    "BuildingArea_fix": "BuildingArea",
    "CuryUnitPrice_fix": "CuryUnitPrice"
}

# Columns listed here come first (when present); all others keep their order after them.
preferred_order = [
    "UnitID", "Building", "UnitArea", "UnitFloor", "UnitNum", "Period",
    "ContractType", "ContractPeriod", "BusinessType", "TranCode",
    "BuildingArea", "CuryUnitPrice",
    "LeaseStartDate", "LeaseEndDate", "LeaseDurationDays", "LeaseDurationMonths",
    "n_subunit"
]

df = pd.read_excel(INPUT_FILE, sheet_name="All_Repaired_Per_Period")

# errors="ignore" skips any listed column that the sheet does not have.
df_clean = df.drop(columns=cols_to_drop, errors="ignore")
df_clean = df_clean.rename(columns=rename_map)

leading_cols = [c for c in preferred_order if c in df_clean.columns]
trailing_cols = [c for c in df_clean.columns if c not in preferred_order]
ordered_cols = leading_cols + trailing_cols
df_clean = df_clean[ordered_cols]

df_clean.to_excel(OUTPUT_FILE, index=False)
print("‚úÖ File bersih siap digunakan!")
print(f"üìÇ Disimpan di: {OUTPUT_FILE}")
print(f"Jumlah kolom akhir: {len(df_clean.columns)}")
print(f"Jumlah baris: {len(df_clean)}")


✅ File bersih siap digunakan!
📂 Disimpan di: D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx
Jumlah kolom akhir: 24
Jumlah baris: 6687
