In [13]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("data/Hospital_Train.csv")
test_df = pd.read_csv("data/Hospital_Test.csv")

# Clean data
train_df_clean = train_df.dropna(subset=["Hospital Name rev 2"])
X_train = train_df_clean["Hospital_Name (clean)"]
y_train = train_df_clean["Hospital Name rev 2"]
X_test = test_df["Hospital_Name (clean)"]
y_test = test_df["Hospital Name rev 2"]

# TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Cosine similarity prediction
cos_sim_matrix = cosine_similarity(X_test_tfidf, X_train_tfidf)
cosine_top_indices = cos_sim_matrix.argmax(axis=1)
cosine_preds = y_train.iloc[cosine_top_indices].values
cosine_top_scores = cos_sim_matrix.max(axis=1)

# Format clean name
def format_clean_name(name):
    return re.sub(r'[^\w\s&]', '', name).upper().strip()

clean_names = X_test.apply(format_clean_name)

# Combine predictions + cek token yang tidak ada di vocab TF-IDF
final_preds = []
tfidf_zero_flags = []

for i in range(len(X_test)):
    tfidf_row = X_test_tfidf[i]
    nonzero_indices = tfidf_row.nonzero()[1]
    total_vocab_size = X_test_tfidf.shape[1]
    has_zero = len(nonzero_indices) < total_vocab_size

    tfidf_zero_flags.append(has_zero)

    if cosine_preds[i] == y_test.iloc[i]:
        final_preds.append(cosine_preds[i])
    else:
        final_preds.append(clean_names.iloc[i])

# Buat dataframe hasil akhir
result_df = pd.DataFrame({
    "Hospital_Name_clean": X_test,
    "Actual": y_test,
    "Cosine_Pred": cosine_preds,
    "Clean_Pred": clean_names,
    "Final_Pred": final_preds,
    "Cosine_Score": cosine_top_scores,
    "TF-IDF_Zero_Found": tfidf_zero_flags
})

# Bersihkan spasi ganda
result_df["Final_Pred"] = result_df["Final_Pred"].str.replace(r"\s+", " ", regex=True).str.strip()

# Tampilkan baris yang beda
mismatch_df = result_df[result_df["Actual"] != result_df["Final_Pred"]]

# Predict and evaluate
accuracy = accuracy_score(y_test, final_preds)

# Output
print("Accuracy:", accuracy)
print("🔥 Total mismatch:", len(mismatch_df))
print("🧠 Total input yang punya frasa OOV (TF-IDF = 0):", sum(tfidf_zero_flags))
mismatch_df.head(10)

Accuracy: 0.8009708737864077
🔥 Total mismatch: 41
🧠 Total input yang punya frasa OOV (TF-IDF = 0): 206


Unnamed: 0,Hospital_Name_clean,Actual,Cosine_Pred,Clean_Pred,Final_Pred,Cosine_Score,TF-IDF_Zero_Found
4,PUSAT PAKAR PERUBATAN DR ANWAR BONE JOINT AND ...,PUSAT PAKAR PERUBATAN DR ANWAR,PUSAT PAKAR PERUBATAN DR ANWAR,PUSAT PAKAR PERUBATAN DR ANWAR BONE JOINT AND ...,PUSAT PAKAR PERUBATAN DR ANWAR,0.739388,True
16,RS KHUSUS ORTHOEDI KARIMA UTAMA,RS KHUSUS ORTHOPEDI KARIMA UTAMA,RS KHUSUS BEDAH KARIMA UTAMA,RS KHUSUS ORTHOEDI KARIMA UTAMA,RS KHUSUS ORTHOEDI KARIMA UTAMA,0.771471,True
32,KLINIK UTAMA THT-MATA DR BOESOIRIE,KLINIK UTAMA DR BOESORIE,KLINIK UTAMA,KLINIK UTAMA THTMATA DR BOESOIRIE,KLINIK UTAMA THTMATA DR BOESOIRIE,0.601101,True
34,RS THT PROKLAMASI,RS KHUSUS THT BEDAH PROKLAMASI,RS THT BEDAH PROKLAMASI,RS THT PROKLAMASI,RS THT PROKLAMASI,0.614295,True
40,HERMINA HOSPITALS MADIUN,RS HERMINA MADIUN,RS SANTA CLARA MADIUN,HERMINA HOSPITALS MADIUN,HERMINA HOSPITALS MADIUN,0.398326,True
55,SLAND HOSPITAL,ISLAND HOSPITAL MALAYSIA,EKA HOSPITAL,SLAND HOSPITAL,SLAND HOSPITAL,0.417263,True
57,PRAKTEK DR PAULUS SUGIANTO,RS DR PAULUS SUGIANTO,DR PAULUS SUGIANTO SPS,PRAKTEK DR PAULUS SUGIANTO,PRAKTEK DR PAULUS SUGIANTO,0.877913,True
65,EKA HOSPITALS BEKASI,EKA HOSPITAL BEKASI,EKA HOSPITAL,EKA HOSPITALS BEKASI,EKA HOSPITALS BEKASI,0.740522,True
66,RSIA BUN,RS BUNDA JAKARTA,RS BUN,RSIA BUN,RSIA BUN,0.587421,True
67,REVIEW DOR 22 -DOKUMEN LAPORAN OP SUDAH DIKIRI...,RS BETHESDA YOGYAKARTA,RS BETHESDA,REVIEW DOR 22 DOKUMEN LAPORAN OP SUDAH DIKIRIM...,REVIEW DOR 22 DOKUMEN LAPORAN OP SUDAH DIKIRIM...,0.388503,True


In [7]:
# Optional: save to Excel
result_df.to_excel("NLP_Cosine_Pred.xlsx", index=False)
print("✅ Predictions saved to 'NLP_Cosine_Pred.xlsx'")

✅ Predictions saved to 'NLP_Cosine_Pred.xlsx'


In [8]:
def predict_single_name(hospital_name_clean, actual_label=None):
    input_clean = format_clean_name(hospital_name_clean)
    input_vector = vectorizer.transform([hospital_name_clean])
    
    # Cosine similarity
    cos_sim = cosine_similarity(input_vector, X_train_tfidf)
    top_cos_idx = cos_sim.argmax()
    cosine_pred = y_train.iloc[top_cos_idx]

    # Optional: kalau actual_label dikasih, kita bisa bandingkan
    if actual_label is not None and cosine_pred == actual_label:
        return cosine_pred
    else:
        return input_clean  # fallback

In [9]:
predict_single_name("SLAND HOSPITAL")

'SLAND HOSPITAL'