In [43]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from rapidfuzz import process, fuzz

# =======================
# Fungsi Cleaning
# =======================
# Hapus kata-kata noise (tanggal, bulan, status)
def clean_noise_words(text):
    noise = [
        "MOHOH", "NASABAH", "DAPAT","MELAMPIRKAN","DOKUMEN","PROSES", "SEDANG", "KLAIM","TUNDA",
        "CLAIM","PENDING", "REVIEW", "DOR", "DITINDAKLANJUTI", "DENGAN", "PIHAK", "DX", "AKHIR",
        "JANUARI", "FEBRUARI", "MARET", "APRIL", "MEI", "MOHON", "KEKURANGAN","BERIKUT",
        "JUNI", "JULI", "AGUSTUS", "SEPTEMBER", "OKTOBER", "NOVEMBER", "DESEMBER", "TOLAK",
        "KONSUL", "TERKAIT", "PEMERIKSAAN", "APA","SAJA","TIDAK","KOLERASI","DIAGNOSA", "ESKALASI",
        "TAX","INVOICE","ASLI","TOTAL","TAGIHAN", "YANG","SESUAI","EMAIL","AIA","TANGGAL","DILAKUKAN",
        "BERDIRI","INFORMASI","MEDIS","LANJUTAN","SELAMA","TERPISAH","OLEH","LAMA","PENGISIAN",
        "UNIT", "PEMERINTAH",
        "2023","2024","2025","2026",
        "21","22","23","24","25","26","27","28","29","30","31"
    ]
    for word in noise:
        text = text.replace(word, "")
    return re.sub(r"\s+", " ", text).strip()

# Format nama (hapus simbol dan kapitalisasi)
def format_clean_name(name):
    cleaned = re.sub(r"[^\w\s&]", "", name)
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.upper().strip()

# =======================
# Load Data
# =======================
train_df = pd.read_csv("data/Hospital_Train_new.csv")
test_df = pd.read_csv("data/Hospital_Test_new.csv")

# Bersihkan data train
train_df_clean = train_df.dropna(subset=["Hospital Name rev 2"])
X_train = train_df_clean["Hospital_Name (clean)"]
y_train = train_df_clean["Hospital Name rev 2"]

# Bersihkan data test
X_test = test_df["Hospital_Name (clean)"].apply(lambda x: format_clean_name(clean_noise_words(x)))
y_test = test_df["Hospital Name rev 2"]

# =======================
# Fit Vectorizers
# =======================
short_vec = TfidfVectorizer(ngram_range=(1,1), sublinear_tf=True)
mid_vec = TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True)
long_vec = TfidfVectorizer(ngram_range=(1,3), sublinear_tf=True)
short_vec.fit(X_train)
mid_vec.fit(X_train)
long_vec.fit(X_train)

# =======================
# Init Lists
# =======================
final_preds = []
fuzzy_preds = []
fuzzy_scores = []
tfidf_zero_flags = []
tfidf_term_details = []
cosine_preds = []
cosine_top_scores = []

# =======================
# Threshold
# =======================
cosine_threshold = 0.5
fuzzy_threshold = 50
long_words_threshold = 0.3

# =======================
# Main Loop
# =======================
for i in range(len(X_test)):
    cleaned_input = X_test.iloc[i]
    actual = y_test.iloc[i]
    word_count = len(cleaned_input.split())

    # Pilih vectorizer
    vectorizer = (
        short_vec if word_count < 4
        else mid_vec if word_count < 8
        else long_vec
    )

    X_train_tfidf = vectorizer.transform(X_train)
    tfidf_row = vectorizer.transform([cleaned_input])[0]
    feature_names = vectorizer.get_feature_names_out()
    tokens = vectorizer.build_analyzer()(cleaned_input)

    # TF-IDF detail
    indices = tfidf_row.nonzero()[1]
    values = tfidf_row.data
    tfidf_tokens = {feature_names[idx] for idx in indices}
    tfidf_score_map = {feature_names[idx]: round(val, 4) for idx, val in zip(indices, values)}
    tfidf_term_details.append(tfidf_score_map)

    # OOV
    missing_tokens = set(tokens) - tfidf_tokens
    has_zero = len(missing_tokens) > 0
    tfidf_zero_flags.append(has_zero)

    # Cosine similarity
    cos_sim_vector = cosine_similarity(tfidf_row, X_train_tfidf)[0]
    cos_top_idx = cos_sim_vector.argmax()
    cosine_score = cos_sim_vector[cos_top_idx]
    cosine_pred = y_train.iloc[cos_top_idx]
    cosine_preds.append(cosine_pred)
    cosine_top_scores.append(cosine_score)

    # Fuzzy matching
    fuzzy_match, fuzzy_score, _ = process.extractOne(
        cleaned_input,
        y_train.tolist(),
        scorer=fuzz.token_sort_ratio
    )
    fuzzy_preds.append(fuzzy_match)
    fuzzy_scores.append(fuzzy_score)

    # Final decision
    if word_count > 20:
        if "MYR" in cleaned_input:
            final_preds.append("NPK - MALAYSIA")
        elif "SGD" in cleaned_input:
            final_preds.append("NPK - SINGAPORE")
        else:
            final_preds.append("NPK - INDONESIA")
    elif cosine_score > cosine_threshold:
        final_preds.append(cosine_pred)
    
    elif word_count > 9 and cosine_score > long_words_threshold:
        final_preds.append(cosine_pred)

    elif has_zero and fuzzy_score > fuzzy_threshold:
        new_input_vector = vectorizer.transform([fuzzy_match])
        new_cos_sim = cosine_similarity(new_input_vector, X_train_tfidf)[0]
        new_top_idx = new_cos_sim.argmax()
        new_cosine_pred = y_train.iloc[new_top_idx]
        new_cosine_score = new_cos_sim[new_top_idx]

        if new_cosine_score > cosine_threshold:
            final_preds.append(new_cosine_pred)
        else:
            final_preds.append(cleaned_input)

    elif fuzzy_score > fuzzy_threshold:
        final_preds.append(fuzzy_match)

    else:
        final_preds.append(cleaned_input)

# =======================
# Hasil DataFrame
# =======================
result_df = pd.DataFrame({
    "Hospital_Name_clean": X_test,
    "Actual": y_test,
    "Cosine_Pred": cosine_preds,
    "Cosine_Score": cosine_top_scores,
    "Fuzzy_Pred": fuzzy_preds,
    "Fuzzy_Score": fuzzy_scores,
    "Clean_Pred": X_test,
    "Final_Pred": final_preds,
    "TF-IDF_Zero_Found": tfidf_zero_flags,
    "TF-IDF_Terms_Score": tfidf_term_details
})

# Bersihkan spasi hasil prediksi
for col in ["Final_Pred", "Fuzzy_Pred"]:
    result_df[col] = result_df[col].str.replace(r"\s+", " ", regex=True).str.strip()

# Evaluasi
mismatch_df = result_df[result_df["Actual"] != result_df["Final_Pred"]]
accuracy = accuracy_score(y_test, result_df["Final_Pred"])

print("\n🎯 Accuracy:", accuracy)
print("🔥 Total mismatch:", len(mismatch_df))
print("🧠 Total TF-IDF OOV:", sum(tfidf_zero_flags))
mismatch_df.head(10)


🎯 Accuracy: 0.6064516129032258
🔥 Total mismatch: 61
🧠 Total TF-IDF OOV: 76


Unnamed: 0,Hospital_Name_clean,Actual,Cosine_Pred,Cosine_Score,Fuzzy_Pred,Fuzzy_Score,Clean_Pred,Final_Pred,TF-IDF_Zero_Found,TF-IDF_Terms_Score
7,KLINIK HERU P KUNTONO,KLINIK HERU P KUNTONO,RS GADING PLUIT,0.589531,KLINIK KUNCUP CERIA,70.0,KLINIK HERU P KUNTONO,RS GADING PLUIT,True,"{'heru': 0.8924, 'klinik': 0.4512}"
14,JAWABAN CL KELUHAN DEMAM SAMPAI HARI KE 7 SUHU...,RS ST BORROMEUS,RS ST BORROMEUS,0.254895,CENTRE FOR EAR NOSE THROAT ALLERGY N SLEEP MOU...,28.011204,JAWABAN CL KELUHAN DEMAM SAMPAI HARI KE 7 SUHU...,NPK - INDONESIA,True,"{'10': 0.1131, '100': 0.1309, '98': 0.1374, 'b..."
15,DR DJ CROSS MB BS,DR DJ CROSS MB BS,MANDAYA ROYAL HOSPITAL,0.226739,DRG DJODY ASMORO,54.545455,DR DJ CROSS MB BS,DRG DJODY ASMORO,True,"{'bs': 0.4294, 'cross': 0.5632, 'dj': 0.528, '..."
21,KITANO HOSPITAL UMEDA OSAKA,KITANO HOSPITAL UMEDA OSAKA,OSAKA CITY GENERAL HOSPITAL,0.37073,TAKEDA HOSPITAL,71.428571,KITANO HOSPITAL UMEDA OSAKA,TAKEDA HOSPITAL,True,"{'hospital': 0.3069, 'osaka': 0.9517}"
29,CAH SPECIALIST FARRER,FARRER PARK HOSPITAL SINGAPORE,RS PONDOK INDAH BINTARO,0.492503,DR ALICIA EYE SPECIALIST,71.111111,CAH SPECIALIST FARRER,DR ALICIA EYE SPECIALIST,False,"{'cah': 0.7337, 'farrer': 0.5409, 'specialist'..."
33,GYNAE ONCO PARTNERS,GYNAE ONCOLOGY CENTRE PTE LTD SINGAPORE,ONCOCARE CANCER CENTRE SINGAPORE,0.450748,CARE COLLAB PARTNERS,71.794872,GYNAE ONCO PARTNERS,CARE COLLAB PARTNERS,False,"{'gynae': 0.5903, 'onco': 0.5903, 'partners': ..."
34,GYNAE ONCO PARTNERS,GYNAE ONCOLOGY CENTRE PTE LTD SINGAPORE,ONCOCARE CANCER CENTRE SINGAPORE,0.450748,CARE COLLAB PARTNERS,71.794872,GYNAE ONCO PARTNERS,CARE COLLAB PARTNERS,False,"{'gynae': 0.5903, 'onco': 0.5903, 'partners': ..."
37,CARRINGTON CARDIOLOGY PTE LTD,CARRINGTON CARDIOLOGY PTE LTD,TT LIM CARDIOLOGY CLINIC SINGAPORE,0.67202,ACE CARDIOLOGY CLINIC PTE LTD,75.862069,CARRINGTON CARDIOLOGY PTE LTD,TT LIM CARDIOLOGY CLINIC SINGAPORE,True,"{'cardiology': 0.5061, 'cardiology pte': 0.613..."
38,1 RINCIAN BIAYA 11665 AUD DAN HASIL LABORATORIUM,BRISBANE PRIVATE HOSPITAL,LABORATORIUM,0.193806,INSTALASI LABORATORIUM RSIA AMANNA,60.97561,1 RINCIAN BIAYA 11665 AUD DAN HASIL LABORATORIUM,INSTALASI LABORATORIUM RSIA AMANNA,True,"{'11665': 0.3689, '11665 aud': 0.3689, 'aud': ..."
39,FUNCTIONAL MOVEMENT TRAINING CENTRE,FUNCTIONAL MOVEMENT TRAINING CENTRE,MOVEMENT 101,0.521286,NATIONAL SKIN CENTRE SINGAPORE,61.538462,FUNCTIONAL MOVEMENT TRAINING CENTRE,MOVEMENT 101,True,"{'centre': 0.4047, 'movement': 0.9144}"


In [44]:
# Optional: save to Excel
result_df.to_excel("After_Tuning_Pred.xlsx", index=False)
print("✅ Predictions saved to 'After_Tuning_Pred.xlsx'")

✅ Predictions saved to 'After_Tuning_Pred.xlsx'


### N-gram (1,2) and (1,3), Logistic Regression Classifier, Threshold Fallback

In [1]:
# ============================================
# 1. IMPORT LIBRARY
# ============================================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from rapidfuzz import fuzz, process

# ============================================
# 2. LOAD DATA
# ============================================
train_df = pd.read_csv("data/Hospital_Train_new.csv")
test_df = pd.read_csv("data/Hospital_Test_new.csv")

# ============================================
# 3. CLEAN & PREPROCESS
# ============================================
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'[^\w\s&]', '', text)
    text = text.upper().strip()
    return text

train_df["clean_name"] = train_df["Hospital_Name (clean)"].apply(clean_text)
test_df["clean_name"] = test_df["Hospital_Name (clean)"].apply(clean_text)

X_train_text = train_df["clean_name"]
y_train = train_df["Hospital Name rev 2"]
X_test_text = test_df["clean_name"]
y_test = test_df["Hospital Name rev 2"]

# ============================================
# 4. VECTORIZER (n-gram 1,2 + 1,3)
# ============================================
vectorizer = FeatureUnion([
    ('tfidf_1_2', TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True)),
    ('tfidf_1_3', TfidfVectorizer(ngram_range=(1,3), sublinear_tf=True))
])

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# ============================================
# 5. COSINE SIMILARITY
# ============================================
cos_sim_matrix = cosine_similarity(X_test_tfidf, X_train_tfidf)
cosine_scores = cos_sim_matrix.max(axis=1)
cosine_indices = cos_sim_matrix.argmax(axis=1)
cosine_preds = y_train.iloc[cosine_indices].values

# ============================================
# 6. FUZZY MATCHING
# ============================================
fuzzy_preds = []
fuzzy_scores = []

for text in X_test_text:
    match, score, _ = process.extractOne(
        text,
        y_train,
        scorer=fuzz.token_sort_ratio
    )
    fuzzy_preds.append(match)
    fuzzy_scores.append(score)

# ============================================
# 7. META FEATURES FOR CLASSIFIER
# ============================================
meta_features = pd.DataFrame({
    "cosine_score": cosine_scores,
    "fuzzy_score": fuzzy_scores,
    "word_count": X_test_text.str.split().apply(len),
    "contains_SGD": X_test_text.str.contains("SGD").astype(int),
    "contains_MYR": X_test_text.str.contains("MYR").astype(int),
})

# Label: apakah cosine pred == actual (untuk training)
meta_features_train = pd.DataFrame({
    "cosine_score": cosine_similarity(X_train_tfidf, X_train_tfidf).max(axis=1),
    "fuzzy_score": [fuzz.token_sort_ratio(a,b) for a,b in zip(X_train_text, y_train)],
    "word_count": X_train_text.str.split().apply(len),
    "contains_SGD": X_train_text.str.contains("SGD").astype(int),
    "contains_MYR": X_train_text.str.contains("MYR").astype(int),
})

# Label training = apakah cosine pred benar
train_cosine_preds = y_train.iloc[np.argmax(cosine_similarity(X_train_tfidf, X_train_tfidf), axis=1)].values
label_train = (train_cosine_preds == y_train).astype(int)

# ============================================
# 8. TRAIN LOGISTIC REGRESSION
# ============================================
clf = LogisticRegression(max_iter=500)
clf.fit(meta_features_train, label_train)

# ============================================
# 9. PREDICT CLASSIFIER PROBABILITY
# ============================================
prob_cosine_correct = clf.predict_proba(meta_features)[:,1]

# ============================================
# 10. THRESHOLD FALLBACK DECISION LOGIC
# ============================================
final_preds = []
cosine_thresh = 0.75
fuzzy_thresh = 85
classifier_prob_thresh = 0.6

for i in range(len(X_test_text)):
    if cosine_scores[i] >= cosine_thresh:
        pred = cosine_preds[i]
    elif fuzzy_scores[i] >= fuzzy_thresh:
        pred = fuzzy_preds[i]
    elif prob_cosine_correct[i] >= classifier_prob_thresh:
        pred = cosine_preds[i]
    else:
        if word_count > 20:
            if "MYR" in cleaned_input:
                final_preds.append("NPK - MALAYSIA")
            elif "SGD" in cleaned_input:
                final_preds.append("NPK - SINGAPORE")
            else:
                final_preds.append("NPK - INDONESIA")
    final_preds.append(pred)

# ============================================
# 11. EVALUATE
# ============================================
acc = accuracy_score(y_test, final_preds)
print(f"🎯 Accuracy: {acc:.4f}")
print(f"🔥 Total mismatch: {(y_test != final_preds).sum()}")

# Optionally: save result
result_df = pd.DataFrame({
    "Input": X_test_text,
    "Actual": y_test,
    "Cosine_Pred": cosine_preds,
    "Fuzzy_Pred": fuzzy_preds,
    "Final_Pred": final_preds,
    "Cosine_Score": cosine_scores,
    "Fuzzy_Score": fuzzy_scores,
    "Classifier_Prob": prob_cosine_correct
})

# Evaluasi
mismatch_df = result_df[result_df["Actual"] != result_df["Final_Pred"]]
accuracy = accuracy_score(y_test, result_df["Final_Pred"])

print("\n🎯 Accuracy:", accuracy)
print("🔥 Total mismatch:", len(mismatch_df))
mismatch_df.head(10)

MemoryError: Unable to allocate 18.0 GiB for an array with shape (49125, 49125) and data type float64