In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [5]:
# ============================================
# 1. IMPORT LIBRARY
# ============================================
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from rapidfuzz import fuzz, process

# ============================================
# 2. LOAD DATA
# ============================================
train_df = pd.read_csv('/content/drive/MyDrive/Hospital_Train_new.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Hospital_Test_new.csv')

# ============================================
# 3. CLEAN NOISE WORDS FUNCTION
# ============================================
def clean_noise_words(text):
    noise = [
        "MOHOH", "NASABAH", "DAPAT","MELAMPIRKAN","DOKUMEN","PROSES", "SEDANG", "KLAIM","TUNDA",
        "CLAIM","PENDING", "REVIEW", "DOR", "DITINDAKLANJUTI", "DENGAN", "PIHAK", "DX", "AKHIR",
        "JANUARI", "FEBRUARI", "MARET", "APRIL", "MEI", "MOHON", "KEKURANGAN","BERIKUT",
        "JUNI", "JULI", "AGUSTUS", "SEPTEMBER", "OKTOBER", "NOVEMBER", "DESEMBER", "TOLAK",
        "KONSUL", "TERKAIT", "PEMERIKSAAN", "APA","SAJA","TIDAK","KOLERASI","DIAGNOSA", "ESKALASI",
        "TAX","INVOICE","ASLI","TOTAL","TAGIHAN", "YANG","SESUAI","EMAIL","AIA","TANGGAL","DILAKUKAN",
        "BERDIRI","INFORMASI","MEDIS","LANJUTAN","SELAMA","TERPISAH","OLEH","LAMA","PENGISIAN",
        "UNIT", "PEMERINTAH",
        "2023","2024","2025","2026",
        "21","22","23","24","25","26","27","28","29","30","31"
    ]
    for word in noise:
        text = text.replace(word, "")
    return re.sub(r"\s+", " ", text).strip()

# ============================================
# 4. CLEAN & PREPROCESS
# ============================================
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'[^\w\s&]', '', text)
    text = text.upper().strip()
    text = clean_noise_words(text)
    return text

train_df["clean_name"] = train_df["Hospital_Name (clean)"].apply(clean_text)
test_df["clean_name"] = test_df["Hospital_Name (clean)"].apply(clean_text)

X_train_text = train_df["clean_name"]
y_train = train_df["Hospital Name rev 2"]
X_test_text = test_df["clean_name"]
y_test = test_df["Hospital Name rev 2"]

# ============================================
# 5. VECTORIZER (n-gram 1,1 + 1,2 + 1,3)
# ============================================
vectorizer = FeatureUnion([
    ('tfidf_1_1', TfidfVectorizer(ngram_range=(1,1), sublinear_tf=True)),
    ('tfidf_1_2', TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True)),
    ('tfidf_1_3', TfidfVectorizer(ngram_range=(1,3), sublinear_tf=True))
])

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# ============================================
# 6. COSINE SIMILARITY
# ============================================
cos_sim_matrix = cosine_similarity(X_test_tfidf, X_train_tfidf)
cosine_scores = cos_sim_matrix.max(axis=1)
cosine_indices = cos_sim_matrix.argmax(axis=1)
cosine_preds = y_train.iloc[cosine_indices].values

# ============================================
# 7. FUZZY MATCHING
# ============================================
fuzzy_preds = []
fuzzy_scores = []

for text in X_test_text:
    match, score, _ = process.extractOne(
        text,
        y_train,
        scorer=fuzz.token_sort_ratio
    )
    fuzzy_preds.append(match)
    fuzzy_scores.append(score)

# ============================================
# 8. META FEATURES FOR CLASSIFIER
# ============================================
meta_features = pd.DataFrame({
    "cosine_score": cosine_scores,
    "fuzzy_score": fuzzy_scores,
    "word_count": X_test_text.str.split().apply(len),
    "contains_SGD": X_test_text.str.contains("SGD").astype(int),
    "contains_MYR": X_test_text.str.contains("MYR").astype(int),
})

# Label: apakah cosine pred == actual (untuk training)
meta_features_train = pd.DataFrame({
    "cosine_score": cosine_similarity(X_train_tfidf, X_train_tfidf).max(axis=1),
    "fuzzy_score": [fuzz.token_sort_ratio(a,b) for a,b in zip(X_train_text, y_train)],
    "word_count": X_train_text.str.split().apply(len),
    "contains_SGD": X_train_text.str.contains("SGD").astype(int),
    "contains_MYR": X_train_text.str.contains("MYR").astype(int),
})

train_cosine_preds = y_train.iloc[np.argmax(cosine_similarity(X_train_tfidf, X_train_tfidf), axis=1)].values
label_train = (train_cosine_preds == y_train).astype(int)

# ============================================
# 9. TRAIN LOGISTIC REGRESSION
# ============================================
clf = LogisticRegression(penalty="l2", solver="saga", C=0.1, max_iter=500)
clf.fit(meta_features_train, label_train)

# ============================================
# 10. PREDICT CLASSIFIER PROBABILITY
# ============================================
prob_cosine_correct = clf.predict_proba(meta_features)[:,1]

# ============================================
# 11. THRESHOLD FALLBACK DECISION LOGIC
# ============================================
final_preds = []
cosine_thresh = 0.8
fuzzy_thresh = 90
classifier_prob_thresh = 0.6

for i in range(len(X_test_text)):
    text = X_test_text.iloc[i]
    wc = len(text.split())

    if cosine_scores[i] >= cosine_thresh:
        pred = cosine_preds[i]
    elif fuzzy_scores[i] >= fuzzy_thresh:
        pred = fuzzy_preds[i]
    elif prob_cosine_correct[i] >= classifier_prob_thresh:
        pred = cosine_preds[i]
    else:
        if wc > 20:
            if "MYR" in text:
                pred = "NPK - MALAYSIA"
            elif "SGD" in text:
                pred = "NPK - SINGAPORE"
            else:
                pred = "NPK - INDONESIA"
        else:
            pred = "NPK - INDONESIA"
    final_preds.append(pred)

# ============================================
# 12. EVALUATE
# ============================================
acc = accuracy_score(y_test, final_preds)
print(f"🎯 Accuracy: {acc:.4f}")
print(f"🔥 Total mismatch: {(y_test != final_preds).sum()}")

# Optionally: save result
result_df = pd.DataFrame({
    "Input": X_test_text,
    "Actual": y_test,
    "Cosine_Pred": cosine_preds,
    "Fuzzy_Pred": fuzzy_preds,
    "Final_Pred": final_preds,
    "Cosine_Score": cosine_scores,
    "Fuzzy_Score": fuzzy_scores,
    "Classifier_Prob": prob_cosine_correct
})

mismatch_df = result_df[result_df["Actual"] != result_df["Final_Pred"]]
mismatch_df.head(10)


🎯 Accuracy: 0.6129
🔥 Total mismatch: 60




Unnamed: 0,Input,Actual,Cosine_Pred,Fuzzy_Pred,Final_Pred,Cosine_Score,Fuzzy_Score,Classifier_Prob
7,KLINIK HERU P KUNTONO,KLINIK HERU P KUNTONO,RS GADING PLUIT,KLINIK KUNCUP CERIA,RS GADING PLUIT,0.642468,70.0,0.98437
14,JAWABAN CL KELUHAN DEMAM SAMPAI HARI KE 7 SUHU...,RS ST BORROMEUS,RSU MITRA MEDIKA PREMIER,CENTRE FOR EAR NOSE THROAT ALLERGY N SLEEP MOU...,RSU MITRA MEDIKA PREMIER,0.248344,28.011204,0.999894
15,DR DJ CROSS MB BS,DR DJ CROSS MB BS,MANDAYA ROYAL HOSPITAL,DRG DJODY ASMORO,MANDAYA ROYAL HOSPITAL,0.250096,54.545455,0.972099
21,KITANO HOSPITAL UMEDA OSAKA,KITANO HOSPITAL UMEDA OSAKA,OSAKA CITY GENERAL HOSPITAL,TAKEDA HOSPITAL,OSAKA CITY GENERAL HOSPITAL,0.403463,71.428571,0.983977
29,CAH SPECIALIST FARRER,FARRER PARK HOSPITAL SINGAPORE,RS PONDOK INDAH BINTARO,DR ALICIA EYE SPECIALIST,RS PONDOK INDAH BINTARO,0.348474,71.111111,0.980764
33,GYNAE ONCO PARTNERS,GYNAE ONCOLOGY CENTRE PTE LTD SINGAPORE,ONCOCARE CANCER CENTRE SINGAPORE,CARE COLLAB PARTNERS,ONCOCARE CANCER CENTRE SINGAPORE,0.338363,71.794872,0.981204
34,GYNAE ONCO PARTNERS,GYNAE ONCOLOGY CENTRE PTE LTD SINGAPORE,ONCOCARE CANCER CENTRE SINGAPORE,CARE COLLAB PARTNERS,ONCOCARE CANCER CENTRE SINGAPORE,0.338363,71.794872,0.981204
37,CARRINGTON CARDIOLOGY PTE LTD,CARRINGTON CARDIOLOGY PTE LTD,TT LIM CARDIOLOGY CLINIC SINGAPORE,ACE CARDIOLOGY CLINIC PTE LTD,TT LIM CARDIOLOGY CLINIC SINGAPORE,0.674764,75.862069,0.987704
39,FUNCTIONAL MOVEMENT TRAINING CENTRE,FUNCTIONAL MOVEMENT TRAINING CENTRE,MOVEMENT 101,NATIONAL SKIN CENTRE SINGAPORE,MOVEMENT 101,0.568686,61.538462,0.977738
43,RUMAH SAKIT UNTUK 1 KONFIRMASI RINCIAN BIAYA M...,NPK - INDONESIA,HOSPITAL OVERSEAS ADMEDIKA,IHC RUMAH SAKIT OTAK DAN JANTUNG PERTAMINA ROY...,HOSPITAL OVERSEAS ADMEDIKA,0.278514,43.010753,0.993029
