In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Load the indicator metadata workbook into a DataFrame.
# The path is relative, so it resolves against the kernel's current working directory.
file_path = '../../datasets/new_platform/about_our_data/about_data.xlsx'
indicator_df = pd.read_excel(file_path)
#indicator_df = ddt_df[['indicator_ddt_name', 'indicator_source', 'indicator_ddt_cat']].drop_duplicates()

In [23]:
# Renumber the rows 0..n-1 and discard the previous index in a single step.
indicator_df = indicator_df.reset_index(drop=True)

# Text used for the TF-IDF similarity search: indicator name followed by its source.
# Missing values become '' and everything is cast to str, so a blank or numeric
# cell cannot turn the whole 'combined' value into NaN (which the vectorizer rejects).
# NOTE: to append the year as well, use indicator_df['indicator_year'].astype(str);
# str(series) would stringify the entire column rather than each element.
indicator_df['combined'] = (
    indicator_df['indicator_ddt_name'].fillna('').astype(str)
    + ' '
    + indicator_df['indicator_source'].fillna('').astype(str)
)

In [24]:
# Preview the first rows of the frame, rendered as a Markdown table.
print(indicator_df.head().to_markdown())

|    |   Unnamed: 0 | indicator_ddt_name                 | indicator_source   |   indicator_year | web_link                                                        | combined                                      |
|---:|-------------:|:-----------------------------------|:-------------------|-----------------:|:----------------------------------------------------------------|:----------------------------------------------|
|  0 |            1 | Average Length of Stay             | UNWTO              |             2022 | https://www.unwto.org/tourism-statistics/key-tourism-statistics | Average Length of Stay UNWTO                  |
|  1 |            2 | Expenditure on Passenger Transport | UNWTO              |             2022 | https://www.unwto.org/tourism-statistics/key-tourism-statistics | Expenditure on Passenger Transport UNWTO      |
|  2 |            3 | Expenditure on Passenger Transport | World Bank         |             2020 | https://data.worldbank.org/                      

In [25]:
def find_similar_indicators(indicator_name, df, top_n=3):
    """Return the rows of ``df`` whose ``combined`` text is closest to ``indicator_name``.

    A TF-IDF vectorizer is fitted on ``df['combined']`` on every call, the query
    string is projected into the same space, and rows are ranked by cosine
    similarity to the query.

    The single best-scoring row is always dropped, on the assumption that the
    query is itself one of the rows of ``df`` and would otherwise match itself.
    NOTE(review): if the query text is not present in ``df`` this discards the
    genuinely best match -- confirm callers always pass a value taken from
    ``df['combined']``.

    Parameters
    ----------
    indicator_name : str
        Query text, in the same form as the values of ``df['combined']``.
    df : pandas.DataFrame
        Frame with a ``combined`` text column.
    top_n : int, default 3
        Number of similar rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of ``df``, ordered by ascending similarity
        (the closest match is the last row). Empty when ``top_n`` is below 1.
    """
    if top_n < 1:
        # A non-positive count would otherwise turn the slice below into
        # "everything except the last row".
        return df.iloc[0:0]

    # Vectorize the 'combined' column using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['combined'])

    # Project the query into the fitted vocabulary and score it against every row
    input_vec = vectorizer.transform([indicator_name])
    cosine_similarities = cosine_similarity(input_vec, tfidf_matrix).flatten()

    # argsort is ascending, so the tail holds the best matches. Skip the very
    # last position (assumed self-match) and keep the top_n entries before it.
    similar_indices = cosine_similarities.argsort()[-(top_n + 1):-1]

    return df.iloc[similar_indices]
# Example usage: query with the 'combined' text of the row at position 4.
input_indicator_name = indicator_df['combined'].iloc[4]  # Replace with your actual input

similar_indicators = find_similar_indicators(
    indicator_name=input_indicator_name,
    df=indicator_df,
)
similar_indicators

Unnamed: 0.1,Unnamed: 0,indicator_ddt_name,indicator_source,indicator_year,web_link,combined
20,21,Total Expenditure on Outbound Tourism,World Bank,2020.0,https://data.worldbank.org/,Total Expenditure on Outbound Tourism World Bank
17,18,Total Expenditure on Inbound Tourism,World Bank,2020.0,https://data.worldbank.org/,Total Expenditure on Inbound Tourism World Bank
3,4,Expenditure on Travel,UNWTO,2022.0,https://www.unwto.org/tourism-statistics/key-t...,Expenditure on Travel UNWTO


In [26]:
def obtain_sim_indic(row, df=None):
    """Fill ``sim_indic_1``..``sim_indic_3`` on ``row`` with similar indicator names.

    Parameters
    ----------
    row : pandas.Series or dict-like
        Must provide a ``combined`` entry; it is modified in place.
    df : pandas.DataFrame, optional
        Frame to search for similar indicators. Defaults to the notebook-level
        ``indicator_df`` when omitted.

    Returns
    -------
    The same ``row`` object, so the function can be used with
    ``DataFrame.apply(obtain_sim_indic, axis=1)``.
    """
    if df is None:
        # Fall back to the notebook's working frame.
        df = indicator_df

    indic = row['combined']
    sim_indics = find_similar_indicators(indic, df)['indicator_ddt_name'].tolist()

    # Pad with None so a frame that yields fewer than three matches
    # does not raise IndexError below.
    sim_indics += [None] * (3 - len(sim_indics))

    row['sim_indic_1'] = sim_indics[0]
    row['sim_indic_2'] = sim_indics[1]
    row['sim_indic_3'] = sim_indics[2]
    return row

In [27]:
# Create the three result columns up front, blank, so they can be filled row by row.
for new_column in ('sim_indic_1', 'sim_indic_2', 'sim_indic_3'):
    indicator_df[new_column] = ""

In [33]:
# Fill sim_indic_1..3 for every row with the names of its nearest neighbours.
# find_similar_indicators returns matches in ascending similarity, so the
# closest match ends up in sim_indic_3.
sim_columns = ['sim_indic_1', 'sim_indic_2', 'sim_indic_3']

for i, row in indicator_df.iterrows():
    sim_indics = find_similar_indicators(row['combined'], indicator_df)['indicator_ddt_name'].to_list()

    # Pad with None so a frame with fewer than four rows (fewer than three
    # matches returned) does not raise IndexError.
    sim_indics += [None] * (len(sim_columns) - len(sim_indics))

    # Assign values directly to the DataFrame using 'i' (the index label)
    for column, name in zip(sim_columns, sim_indics):
        indicator_df.loc[i, column] = name


In [34]:
# Display the frame to inspect the filled sim_indic_* columns.
indicator_df

Unnamed: 0.1,Unnamed: 0,indicator_ddt_name,indicator_source,indicator_year,web_link,combined,sim_indic_1,sim_indic_2,sim_indic_3
0,1,Average Length of Stay,UNWTO,2022.0,https://www.unwto.org/tourism-statistics/key-t...,Average Length of Stay UNWTO,"Official exchange rate (LCU per US$, period av...",Total Number of Rooms,Total Number of Establishments
1,2,Expenditure on Passenger Transport,UNWTO,2022.0,https://www.unwto.org/tourism-statistics/key-t...,Expenditure on Passenger Transport UNWTO,Total Expenditure on Inbound Tourism,Expenditure on Travel,Expenditure on Passenger Transport
2,3,Expenditure on Passenger Transport,World Bank,2020.0,https://data.worldbank.org/,Expenditure on Passenger Transport World Bank,Total Expenditure on Inbound Tourism,Expenditure on Travel,Expenditure on Passenger Transport
3,4,Expenditure on Travel,UNWTO,2022.0,https://www.unwto.org/tourism-statistics/key-t...,Expenditure on Travel UNWTO,Total Expenditure on Outbound Tourism,Total Expenditure on Inbound Tourism,Expenditure on Travel
4,5,Expenditure on Travel,World Bank,2020.0,https://data.worldbank.org/,Expenditure on Travel World Bank,Total Expenditure on Outbound Tourism,Total Expenditure on Inbound Tourism,Expenditure on Travel
...,...,...,...,...,...,...,...,...,...
70,71,Ookla Speedtest Global Index,Ookla,2024.0,https://www.speedtest.net/global-index,Ookla Speedtest Global Index Ookla,Africa Visa Openness Index,Corruption Index,Global Peace Index
71,72,Africa Visa Openness Index,African Development Bank,2023.0,https://www.visaopenness.org/fileadmin/uploads...,Africa Visa Openness Index African Development...,WEF Travel and Tourism Development Index (Value),WEF Travel and Tourism Development Index (Rank),Total Travelers Originating From Africa
72,73,Number of Conference Venues,ICCA,2024.0,https://portal.iccaworld.org/member-suppliers/,Number of Conference Venues ICCA,Total Number of Bed Places,Total Number of Establishments,Total Number of Rooms
73,74,Number of Airlines,IATA,2024.0,https://www.iata.org/en/about/members/airline-...,Number of Airlines IATA,Total Number of Bed Places,Total Number of Establishments,Total Number of Rooms


In [50]:
#joined_df_indic = pd.merge(ddt_df, indicator_df, on=['indicator_ddt_name', 'indicator_source', 'indicator_ddt_cat'])

In [35]:
# Remove the helper text column and the stray index column before saving.
# errors='ignore' keeps the cell re-runnable: a second execution, or a workbook
# that has no 'Unnamed: 0' column, no longer raises KeyError.
indicator_df = indicator_df.drop(columns=['combined', 'Unnamed: 0'], errors='ignore')

In [36]:
# Write the enriched table back to the same workbook it was loaded from,
# overwriting the input file.
# NOTE(review): to_excel writes the index as an extra column by default, which is
# presumably where the 'Unnamed: 0' column seen on load comes from -- consider
# index=False (and then stop dropping 'Unnamed: 0' unconditionally).
indicator_df.to_excel(file_path)

In [39]:
# List the distinct values of the indicator_source column.
indicator_df.indicator_source.unique()

array(['UNWTO', 'World Bank', 'World Economic Forum', 'Wikidata',
       'UNESCO', 'Vision of Humanity', 'Transparency International',
       'Ookla', 'African Development Bank', 'ICCA', 'IATA'], dtype=object)