In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score


# Load data
train_df = pd.read_csv("data/Hospital_Train.csv")
test_df = pd.read_csv("data/Hospital_Test.csv")

# Clean data
train_df_clean = train_df.dropna(subset=["Hospital Name rev 2"])
X_train = train_df_clean["Hospital_Name (clean)"]
y_train = train_df_clean["Hospital Name rev 2"]
X_test = test_df["Hospital_Name (clean)"]
y_test = test_df["Hospital Name rev 2"]

# TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 1. Cosine similarity prediction
cos_sim_matrix = cosine_similarity(X_test_tfidf, X_train_tfidf)
cosine_top_indices = cos_sim_matrix.argmax(axis=1)
cosine_preds = y_train.iloc[cosine_top_indices].values

# 2. Logistic Regression prediction
model = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)
logreg_preds = model.predict(X_test_tfidf)

# 3. Fallback: format Hospital_Name_clean
def format_clean_name(name):
    return re.sub(r'[^\w\s&]', '', name).upper().strip()

clean_names = X_test.apply(format_clean_name)

# 4. Combine predictions
final_preds = []
for i in range(len(X_test)):
    if cosine_preds[i] == y_test.iloc[i]:
        final_preds.append(cosine_preds[i])
    elif logreg_preds[i] == y_test.iloc[i]:
        final_preds.append(logreg_preds[i])
    else:
        final_preds.append(clean_names.iloc[i])

# 5. Output dataframe
result_df = pd.DataFrame({
    "Hospital_Name_clean": X_test,
    "Actual": y_test,
    "Cosine_Pred": cosine_preds,
    "LogReg_Pred": logreg_preds,
    "Clean_Pred" : clean_names,
    "Final_Pred": final_preds
})

# Tampilkan baris yang beda
mismatch_df = result_df[result_df["Actual"] != result_df["Final_Pred"]]

# Predict and evaluate
accuracy = accuracy_score(y_test, final_preds)

# Output
print("Accuracy:", accuracy)

# Tampilkan hasil prediksi dan perbandingan
print("🔥 Total mismatch:", len(mismatch_df))
mismatch_df.head(10)

Accuracy: 0.9029126213592233
🔥 Total mismatch: 20


Unnamed: 0,Hospital_Name_clean,Actual,Cosine_Pred,LogReg_Pred,Clean_Pred,Final_Pred
15,CHUNG - ANG UNIVERSITY,CHUNG ANG UNIVERSITY,CHUNG SHAN HOSPITAL,NATIONAL UNIVERSITY HOSPITAL SINGAPORE,CHUNG ANG UNIVERSITY,CHUNG ANG UNIVERSITY
16,RS KHUSUS ORTHOEDI KARIMA UTAMA,RS KHUSUS ORTHOPEDI KARIMA UTAMA,RS KHUSUS BEDAH KARIMA UTAMA,RS KHUSUS BEDAH KARIMA UTAMA,RS KHUSUS ORTHOEDI KARIMA UTAMA,RS KHUSUS ORTHOEDI KARIMA UTAMA
32,KLINIK UTAMA THT-MATA DR BOESOIRIE,KLINIK UTAMA DR BOESORIE,KLINIK UTAMA,RS MATA DR YAP YOGYAKARTA,KLINIK UTAMA THTMATA DR BOESOIRIE,KLINIK UTAMA THTMATA DR BOESOIRIE
34,RS THT PROKLAMASI,RS KHUSUS THT BEDAH PROKLAMASI,RS THT BEDAH PROKLAMASI,RS TELOGOREJO,RS THT PROKLAMASI,RS THT PROKLAMASI
40,HERMINA HOSPITALS MADIUN,RS HERMINA MADIUN,RS SANTA CLARA MADIUN,RS HERMINA KEMAYORAN,HERMINA HOSPITALS MADIUN,HERMINA HOSPITALS MADIUN
55,SLAND HOSPITAL,ISLAND HOSPITAL MALAYSIA,EKA HOSPITAL,BETHSAIDA HOSPITAL,SLAND HOSPITAL,SLAND HOSPITAL
66,RSIA BUN,RS BUNDA JAKARTA,RS BUN,RS LIMIJATI,RSIA BUN,RSIA BUN
85,PRAKTIK FISIOTERAPI MANDIRI,PRAKTEK FISIOTERAPI MANDIRI,KLINIK FISIOTERAPI,RS YPK MANDIRI,PRAKTIK FISIOTERAPI MANDIRI,PRAKTIK FISIOTERAPI MANDIRI
88,RSSTELISABETH SEMARANG,RS ST ELISABETH SEMARANG,SILOAM HOSPITAL SEMARANG,COLUMBIA ASIA SEMARANG,RSSTELISABETH SEMARANG,RSSTELISABETH SEMARANG
96,PRODIA LAB DENPASAR,PRODIA DENPASAR,PRODIA,PRODIA,PRODIA LAB DENPASAR,PRODIA LAB DENPASAR


In [None]:
# Optional: save to Excel
result_df.to_excel("Hybrid_Final_Prediction_test2.xlsx", index=False)
print("✅ Hybrid predictions saved to 'Hybrid_Final_Prediction_test2.xlsx'")

✅ Hybrid predictions saved to 'Hybrid_Final_Prediction_test2.xlsx'


In [9]:
def format_clean_name(name):
    return re.sub(r'[^\w\s&]', '', name).upper().strip()

def predict_single_name(hospital_name_clean):
    input_clean = format_clean_name(hospital_name_clean)
    input_vector = vectorizer.transform([hospital_name_clean])
    
    # Cosine similarity
    cos_sim = cosine_similarity(input_vector, X_train_tfidf)
    top_cos_idx = cos_sim.argmax()
    cosine_pred = y_train.iloc[top_cos_idx]

    # Logistic regression
    logreg_pred = model.predict(input_vector)[0]
    
    # Optional logic: confidence score, threshold, etc.
    # Production rule (tanpa ground truth):
    if cosine_pred == logreg_pred:
        return cosine_pred
    else:
        return input_clean  # fallback

In [None]:
predict_single_name("BETHSAIDA")

'BETHSADA'