In [24]:
# Install the fuzzy-matching dependencies used by this notebook (-q keeps pip quiet).
# NOTE(review): versions are not pinned; consider pinning them for reproducible runs.
%pip install fuzzywuzzy python-levenshtein -q

In [25]:
import pandas as pd
import chardet
from fuzzywuzzy import process, fuzz
import re
from tqdm import tqdm
import warnings

# Configuration: hide FutureWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)

In [26]:
# Helper functions
def clean_rank(rank_series):
    """Extract the first run of digits from every entry of a rank column.

    Every value is converted to text first; entries that contain no digit
    come back as NaN. The result is a numeric Series aligned with the input.
    """
    as_text = rank_series.astype(str)
    first_digits = as_text.str.extract(r'(\d+)')[0]
    return pd.to_numeric(first_digits, errors='coerce')

def split_uni_country(name_country):
    """Strip a trailing country from a THE "Name Country/Region" value.

    The split point is the last place where a lowercase ASCII letter is
    immediately followed by an uppercase letter and only letters/whitespace
    remain until the end of the string (e.g. "...OxfordUnited Kingdom").
    This assumes name and country are fused without a separator in the
    input file -- TODO confirm against the raw THE export.

    Args:
        name_country: the fused "name + country" text.

    Returns:
        The text before the split point, stripped. When no split point is
        found the whole value is returned stripped. Non-string input
        (e.g. NaN for a missing cell) is returned unchanged instead of
        raising inside ``re.match``.
    """
    if not isinstance(name_country, str):
        return name_country
    # The former `|\s` alternative could only match one leading whitespace
    # character, which made values such as " Harvard University" collapse to
    # an empty string; it is dropped so those values fall through to the
    # plain strip() below.
    match = re.match(r'^(.*[a-z])(?=[A-Z][a-zA-Z\s]*$)', name_country)
    return match.group(1).strip() if match else name_country.strip()

def hunt_for_rank(uni_name, name_list, rank_dict, threshold=88):
    """Look up a university's rank through fuzzy name matching.

    Args:
        uni_name: the name to search for.
        name_list: candidate names to match against.
        rank_dict: mapping from candidate name to its rank.
        threshold: minimum ``fuzz.token_sort_ratio`` score for the best
            candidate to be accepted.

    Returns:
        ``rank_dict``'s value for the best-scoring candidate when its score
        reaches ``threshold``; otherwise ``None``. ``None`` is also returned
        when there are no candidates or when ``uni_name`` is not a non-blank
        string.
    """
    # NaN (a missing cell) is truthy, so a plain `not uni_name` check would
    # let it through to the matcher; require an actual non-blank string.
    if not isinstance(uni_name, str) or not uni_name.strip():
        return None
    if not name_list:
        return None
    best_match = process.extractOne(uni_name, name_list, scorer=fuzz.token_sort_ratio)
    if best_match and best_match[1] >= threshold:
        return rank_dict.get(best_match[0])
    return None

# Data loading & cleaning
files = ['lpdp_cs.csv', 'QS_2026_cleaned.csv', 'cwur2025_cleaned.csv', 'the2025_cleaned.csv']
dataframes = {}
for f in files:
    # Detect the encoding from the raw bytes, then parse the CSV with it.
    with open(f, 'rb') as file:
        encoding = chardet.detect(file.read())['encoding']
    # Frames are keyed by the file name up to its first dot.
    dataframes[f.split('.')[0]] = pd.read_csv(f, encoding=encoding)

df_lpdp = dataframes['lpdp_cs']
df_qs = dataframes['QS_2026_cleaned']
df_cwur = dataframes['cwur2025_cleaned']
df_the = dataframes['the2025_cleaned']

# Reduce every rank column to its first number.
df_qs['Rank'] = clean_rank(df_qs['Rank'])
df_cwur['World Rank'] = clean_rank(df_cwur['World Rank'])
df_the['Rank'] = clean_rank(df_the['Rank'])
df_the['cleaned name'] = df_the['Name Country/Region'].apply(split_uni_country)


# Fuzzy matching: name -> rank lookup tables, one per ranking source.
qs_dict = df_qs.set_index('Institution')['Rank'].to_dict()
cwur_dict = df_cwur.set_index('Institution')['World Rank'].to_dict()
the_dict = df_the.set_index('cleaned name')['Rank'].to_dict()


# One output column per source. The candidate list is built once per source
# instead of once per LPDP row.
for rank_column, source_ranks in (
    ('qs rank 2026', qs_dict),
    ('cwur rank 2025', cwur_dict),
    ('the rank 2025', the_dict),
):
    candidate_names = list(source_ranks)
    df_lpdp[rank_column] = [
        hunt_for_rank(name, candidate_names, source_ranks) for name in df_lpdp['Name']
    ]

# Save & show the result
df_lpdp.to_csv('merged_rankings_best_result.csv', index=False)

print("hasil disimpan ke 'merged_rankings_best_result.csv'.")
print("\npratinjau 30 baris pertama:")
print(df_lpdp.head(30))

hasil disimpan ke 'merged_rankings_best_result.csv'.

pratinjau 30 baris pertama:
    No                                               Name           Negara  \
0    1                               University of Oxford          Inggris   
1    2                                 Harvard University  Amerika Serikat   
2    3                                Stanford University  Amerika Serikat   
3    4        Massachusetts Institute of Technology (MIT)  Amerika Serikat   
4    5                            University of Cambridge          Inggris   
5    6       California Institute of Technology (Caltech)  Amerika Serikat   
6    7                              University of Chicago  Amerika Serikat   
7    8                                    Yale University  Amerika Serikat   
8    9                               Princeton University  Amerika Serikat   
9   10                            Imperial College London          Inggris   
10  11                         University of Pennsylvania  A

In [27]:
# Verification helpers
def clean_rank_value(rank_str):
    """Return the first run of digits in a rank value as a float.

    Missing values and values without any digit yield ``None``.
    """
    if pd.isna(rank_str):
        return None
    digits = re.search(r'(\d+)', str(rank_str))
    if digits is None:
        return None
    return float(digits.group(1))

def verify_rank(university_name, df_raw, name_col, rank_col):
    """Look up a university's rank in a raw ranking table by name.

    The search key is the part of ``university_name`` before the first
    ``(``, stripped. A row whose name equals the key (ignoring case and
    surrounding whitespace) is preferred; otherwise the first row whose name
    contains the key (ignoring case) is used.

    Args:
        university_name: name to look for.
        df_raw: raw ranking DataFrame.
        name_col: column of ``df_raw`` holding the names.
        rank_col: column of ``df_raw`` holding the ranks.

    Returns:
        ``clean_rank_value`` of the selected row's rank, or ``None`` when the
        name is not a string, the search key is empty, or no row matches.
    """
    if not isinstance(university_name, str):
        return None
    needle = university_name.split('(')[0].strip()
    # An empty key is contained in every string and would always select the
    # first row of the table.
    if not needle:
        return None

    names = df_raw[name_col]
    is_exact = names.str.strip().str.lower().eq(needle.lower()).fillna(False).astype(bool)
    row = df_raw[is_exact]
    if row.empty:
        # regex=False: the key is literal text, so characters such as
        # '+', '.', '[' or ')' must not be interpreted as a pattern.
        row = df_raw[names.str.contains(needle, na=False, case=False, regex=False)]
    if not row.empty:
        return clean_rank_value(row.iloc[0][rank_col])
    return None

# Load inputs & run the verification
try:
    final_df = pd.read_csv('merged_rankings_best_result.csv')
    qs_raw = pd.read_csv('QS_2026_cleaned.csv')
    cwur_raw = pd.read_csv('cwur2025_cleaned.csv')
    the_raw = pd.read_csv('the2025_cleaned.csv')
except FileNotFoundError as e:
    print(f"verifikasi gagal: file '{e.filename}' tidak ditemukan.")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down, while re-raising just stops this cell with the real error.
    raise

# (label, merged column, raw table, raw name column, raw rank column)
verification_sources = (
    ('qs', 'qs rank 2026', qs_raw, 'Institution', 'Rank'),
    ('cwur', 'cwur rank 2025', cwur_raw, 'Institution', 'World Rank'),
    ('the', 'the rank 2025', the_raw, 'Name Country/Region', 'Rank'),
)

mismatches = []
for index, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="memverifikasi"):
    uni_name = row['Name']
    # Compare each merged rank with the rank found in the raw table.
    for source_label, merged_col, raw_df, name_col, rank_col in verification_sources:
        merged_rank = row[merged_col]
        # Nothing was merged for this source, so there is nothing to verify
        # (and no need to search the raw table).
        if pd.isna(merged_rank):
            continue
        original_rank = verify_rank(uni_name, raw_df, name_col, rank_col)
        if merged_rank != original_rank:
            mismatches.append({'universitas': uni_name, 'sumber': source_label, 'merge': merged_rank, 'asli': original_rank})

# Show the verification result
if mismatches:
    mismatch_df = pd.DataFrame(mismatches)
    mismatch_df.to_csv('ketidakcocokan_verifikasi.csv', index=False)
    print(f"\nverifikasi selesai: {len(mismatch_df)} ketidakcocokan ditemukan.")
    print("hasil disimpan ke 'ketidakcocokan_verifikasi.csv'.")
    print("\npratinjau data ketidakcocokan:")
    print(mismatch_df.head())
else:
    print("\nverifikasi selesai. tidak ada ketidakcocokan.")

memverifikasi: 100%|██████████| 158/158 [00:00<00:00, 180.21it/s]


verifikasi selesai: 54 ketidakcocokan ditemukan.
hasil disimpan ke 'ketidakcocokan_verifikasi.csv'.

pratinjau data ketidakcocokan:
                                         universitas sumber  merge   asli
0  ETH Zurich - Swiss Federal Institute of Techno...     qs    7.0    NaN
1                    UCL (University College London)   cwur   20.0  650.0
2                   University of Michigan-Ann Arbor   cwur   16.0    NaN
3                        The University of Edinburgh     qs   34.0    NaN
4                        The University of Edinburgh   cwur   51.0    NaN





In [28]:
import pandas as pd
import re

# Helper functions
def clean_rank(rank_series):
    """Turn a rank column into numbers by keeping the first digit run of each entry.

    Values are stringified before matching; entries with no digits become NaN.
    """
    digit_groups = rank_series.astype(str).str.extract(r'(\d+)')
    leading_number = digit_groups[0]
    numeric_rank = pd.to_numeric(leading_number, errors='coerce')
    return numeric_rank

def split_uni_country(name_country):
    """Strip a trailing country from a THE "Name Country/Region" value.

    The split point is the last place where a lowercase ASCII letter is
    immediately followed by an uppercase letter and only letters/whitespace
    remain until the end of the string (e.g. "...OxfordUnited Kingdom").
    This assumes name and country are fused without a separator in the
    input file -- TODO confirm against the raw THE export.

    Args:
        name_country: the fused "name + country" text.

    Returns:
        The text before the split point, stripped. When no split point is
        found the whole value is returned stripped. Non-string input
        (e.g. NaN for a missing cell) is returned unchanged instead of
        raising inside ``re.match``.
    """
    if not isinstance(name_country, str):
        return name_country
    # The former `|\s` alternative could only match one leading whitespace
    # character, which made values such as " Harvard University" collapse to
    # an empty string; it is dropped so those values fall through to the
    # plain strip() below.
    match = re.match(r'^(.*[a-z])(?=[A-Z][a-zA-Z\s]*$)', name_country)
    return match.group(1).strip() if match else name_country.strip()


# Note: 'merged_rankings_final_curated.csv' is the manually curated result of the previous cell.
# NOTE(review): the code below reads 'merged_rankings_best_result.csv', not the
# curated file named in the note above -- confirm which input is intended.
try:
    df_main = pd.read_csv('merged_rankings_best_result.csv')
    df_cs_raw = pd.read_csv('thecomputerscience2025.csv')
except FileNotFoundError as e:
    print(f"gagal: file '{e.filename}' tidak ditemukan.")
    # Stop the cell here; continuing would only fail later with a NameError
    # on the frames that could not be loaded.
    raise

# Clean the computer-science ranking
df_cs_raw['Rank'] = clean_rank(df_cs_raw['Rank'])
df_cs_raw['name'] = df_cs_raw['Name Country/Region'].apply(split_uni_country)

df_cs_cleaned = (
    df_cs_raw[['Rank', 'name']]
    .rename(columns={'Rank': 'the cs rank 2025'})
    # Keep one row per join key so the left join below cannot multiply rows
    # of df_main when two raw entries clean to the same name.
    .drop_duplicates(subset='name', keep='first')
)

# Merge & save
df_merged_final = pd.merge(df_main, df_cs_cleaned, left_on='Name', right_on='name', how='left')

final_filename = 'final_merged_with_cs_ranking.csv'
df_merged_final.to_csv(final_filename, index=False)

print(f"hasil akhir disimpan ke '{final_filename}'.")
print("\npratinjau data final dengan peringkat cs:")
print(df_merged_final.head(30))

hasil akhir disimpan ke 'final_merged_with_cs_ranking.csv'.

pratinjau data final dengan peringkat cs:
    No                                               Name           Negara  \
0    1                               University of Oxford          Inggris   
1    2                                 Harvard University  Amerika Serikat   
2    3                                Stanford University  Amerika Serikat   
3    4        Massachusetts Institute of Technology (MIT)  Amerika Serikat   
4    5                            University of Cambridge          Inggris   
5    6       California Institute of Technology (Caltech)  Amerika Serikat   
6    7                              University of Chicago  Amerika Serikat   
7    8                                    Yale University  Amerika Serikat   
8    9                               Princeton University  Amerika Serikat   
9   10                            Imperial College London          Inggris   
10  11                         Universi

In [30]:
# Load inputs & run the re-verification
try:
    final_df_with_cs = pd.read_csv('final_merged_with_cs_ranking.csv')
    the_cs_raw = pd.read_csv('thecomputerscience2025.csv')
except FileNotFoundError as e:
    print(f"re-verifikasi gagal: file '{e.filename}' tidak ditemukan.")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down, while re-raising just stops this cell with the real error.
    raise

the_cs_raw['Rank'] = clean_rank(the_cs_raw['Rank'])
the_cs_raw['cleaned name'] = the_cs_raw['Name Country/Region'].apply(split_uni_country)

# Cleaned name -> rank lookup built from the raw computer-science ranking.
the_cs_dict = the_cs_raw.set_index('cleaned name')['Rank'].to_dict()

cs_mismatches = []
for index, row in tqdm(final_df_with_cs.iterrows(), total=final_df_with_cs.shape[0], desc="re-verifying cs"):
    uni_name = row['Name']
    merged_cs_rank = row['the cs rank 2025']

    # Exact-name lookup of the original rank; None when the name is absent.
    # NOTE(review): the merged column was itself produced by an exact join on
    # the same cleaned names, so this check cannot detect universities that
    # were missed because their names differ between the two files.
    original_cs_rank = the_cs_dict.get(uni_name)

    if not pd.isna(merged_cs_rank) and merged_cs_rank != original_cs_rank:
        cs_mismatches.append({'universitas': uni_name, 'sumber': 'the cs', 'merge': merged_cs_rank, 'asli': original_cs_rank})

# Show the re-verification result
if cs_mismatches:
    mismatch_df_cs = pd.DataFrame(cs_mismatches)
    print(f"\nre-verifikasi selesai: {len(mismatch_df_cs)} ketidakcocokan cs ditemukan.")
    print(mismatch_df_cs.head())
else:
    print("\nre-verifikasi selesai. tidak ada ketidakcocokan.")

re-verifying cs: 100%|██████████| 158/158 [00:00<00:00, 16503.14it/s]


re-verifikasi selesai. tidak ada ketidakcocokan.



