In [9]:
import pandas as pd

# Load the datasets
country_profile_data = pd.read_csv('country_profile_variables.csv')
qs_ranking_data = pd.read_csv('qs-world-university-rankings-2017-to-2022-V2.csv')

# Extract the unique country names from each dataset
country_profile_countries = set(country_profile_data['country'].unique())
qs_ranking_countries = set(qs_ranking_data['country'].unique())

# Find mismatched country names
mismatched_countries = country_profile_countries.symmetric_difference(qs_ranking_countries)

# Convert the mismatched countries to a DataFrame for better visualization
mismatched_countries_df = pd.DataFrame(mismatched_countries, columns=['Mismatched Country Names'])

# Display the mismatched country names
pd.set_option('display.max_rows', None)
print(mismatched_countries_df)


                      Mismatched Country Names
0                        Sao Tome and Principe
1                                     Ethiopia
2                             China, Macao SAR
3                                      Myanmar
4                                        Tonga
5                                        Nauru
6                                       Malawi
7                                   Seychelles
8                     United States of America
9                                      Liberia
10                                     Algeria
11                                     Bolivia
12                                     Namibia
13                 Falkland Islands (Malvinas)
14                            French Polynesia
15                United States Virgin Islands
16                              Cayman Islands
17                                        Fiji
18                                    Dominica
19                   Sint Maarten (Dutch part)
20           

In [7]:
import pandas as pd

# Load the datasets
country_profile_data = pd.read_csv('country_profile_variables.csv')
qs_ranking_data = pd.read_csv('qs-world-university-rankings-2017-to-2022-V2.csv')

# Extract the unique country names from each dataset
country_profile_countries = set(country_profile_data['country'].unique())
qs_ranking_countries = set(qs_ranking_data['country'].unique())

# Find the union of all unique country names
all_countries = country_profile_countries.union(qs_ranking_countries)

# Create a DataFrame to store country names from both datasets
country_comparison = pd.DataFrame(all_countries, columns=['All Countries'])

# Add columns for country names as they appear in each file
country_comparison['In Country Profile'] = country_comparison['All Countries'].apply(
    lambda x: x if x in country_profile_countries else None
)
country_comparison['In QS Ranking'] = country_comparison['All Countries'].apply(
    lambda x: x if x in qs_ranking_countries else None
)

# Display the resulting DataFrame
print(country_comparison)


                        All Countries                In Country Profile  \
0                              Canada                            Canada   
1               Sao Tome and Principe             Sao Tome and Principe   
2                              Jordan                            Jordan   
3                        South Africa                      South Africa   
4                    China, Macao SAR                  China, Macao SAR   
..                                ...                               ...   
238  Democratic Republic of the Congo  Democratic Republic of the Congo   
239                        Bangladesh                        Bangladesh   
240                 Equatorial Guinea                 Equatorial Guinea   
241                             Italy                             Italy   
242                  Papua New Guinea                  Papua New Guinea   

    In QS Ranking  
0          Canada  
1            None  
2          Jordan  
3    South Africa  

In [6]:
import pandas as pd
import difflib

# Load the datasets
country_profile_data = pd.read_csv('country_profile_variables.csv')
qs_ranking_data = pd.read_csv('qs-world-university-rankings-2017-to-2022-V2.csv')

# Extract unique country names from each dataset
country_profile_countries = country_profile_data['country'].unique()
qs_ranking_countries = qs_ranking_data['country'].unique()

# Find mismatched country names using a dictionary
mismatch_dict = {}

for country in country_profile_countries:
    closest_matches = difflib.get_close_matches(country, qs_ranking_countries, n=1, cutoff=0.8)
    if closest_matches and country.lower() != closest_matches[0].lower():
        mismatch_dict[country] = closest_matches[0]

# Display mismatched country names
mismatch_dict_df = pd.DataFrame(list(mismatch_dict.items()), columns=['Country Profile', 'QS Ranking'])
print(mismatch_dict_df)

# Rename columns in the country profile data to match QS ranking data
country_profile_data['country'] = country_profile_data['country'].replace(mismatch_dict)

# Verify the renaming
country_profile_countries_updated = country_profile_data['country'].unique()
qs_ranking_countries_updated = qs_ranking_data['country'].unique()

# Find remaining mismatched country names
remaining_mismatches = set(country_profile_countries_updated).symmetric_difference(set(qs_ranking_countries_updated))
remaining_mismatches_df = pd.DataFrame(remaining_mismatches, columns=['Remaining Mismatched Country Names'])
print("-----------")
print(remaining_mismatches_df)


              Country Profile                 QS Ranking
0                     Iceland                    Ireland
1  Iran (Islamic Republic of)  Iran, Islamic Republic of
2                    Viet Nam                    Vietnam
-----------
    Remaining Mismatched Country Names
0                Sao Tome and Principe
1                             Ethiopia
2                     China, Macao SAR
3                              Myanmar
4                                Tonga
..                                 ...
150   Democratic Republic of the Congo
151                          Nicaragua
152                  Equatorial Guinea
153                            Eritrea
154                   Papua New Guinea

[155 rows x 1 columns]
