In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
import matplotlib.pyplot as plt

# Read the DDT indicator workbook and discard the 'Unnamed: 0' column in one step
# (presumably an index column written out when the file was saved — TODO confirm).
df_complete = (
    pd.read_excel(r'../../datasets/ddt/ddt_complete.xlsx')
    .drop(columns=['Unnamed: 0'])
)
# Preview the first rows of the loaded indicator table
df_complete.head()

Unnamed: 0,iso3_country_code,indicator_year,indicator_value,indicator_ddt_name,indicator_source,indicator_ddt_cat,country_name,region_name,units
0,DZA,1995,520.0,Total International Arrivals,UNWTO,Inbound Tourism-Arrivals,Algeria,Northern Africa,Thousands
1,AGO,1995,9.0,Total International Overnight Visitors (Tourists),UNWTO,Inbound Tourism-Arrivals,Angola,Central Africa,Thousands
2,BEN,1995,580.0,Total International Arrivals,UNWTO,Inbound Tourism-Arrivals,Benin,Western Africa,Thousands
3,BEN,1995,138.0,Total International Overnight Visitors (Tourists),UNWTO,Inbound Tourism-Arrivals,Benin,Western Africa,Thousands
4,BWA,1995,636.0,Total International Arrivals,UNWTO,Inbound Tourism-Arrivals,Botswana,Southern Africa,Thousands


In [2]:
# Load the cultural/attractions tables from the dashboards workbook, overriding the
# header names with `names=`.
# NOTE(review): all five reads use sheet_name=0, so every frame is loaded from the
# same (first) sheet, and the first four also share identical column names
# (including 'parkLabel' for sports/protected/religious). Presumably each frame was
# meant to come from its own sheet — confirm the sheet order and fix the indices.
# None of these frames is used by the other cells visible in this notebook.
df_parks = pd.read_excel(r'../../datasets/dashboards/country_cultural.xlsx', sheet_name=0, names=['countryLabel', 'iso3', 'parkLabel'])
df_sports = pd.read_excel(r'../../datasets/dashboards/country_cultural.xlsx', sheet_name=0, names=['countryLabel', 'iso3', 'parkLabel'])
df_protected = pd.read_excel(r'../../datasets/dashboards/country_cultural.xlsx', sheet_name=0, names=['countryLabel', 'iso3', 'parkLabel'])
df_relig = pd.read_excel(r'../../datasets/dashboards/country_cultural.xlsx', sheet_name=0, names=['countryLabel', 'iso3', 'parkLabel'])
df_unesco= pd.read_excel(r'../../datasets/dashboards/country_cultural.xlsx', sheet_name=0, names=['countryLabel', 'iso3', 'category'])

In [19]:
# Raw supporting datasets that are combined into `df_combined` further down:
#   df_ookla        - internet speed per country (columns 'Country', 'Mbps' are used later)
#   df_global_peace - Global Peace Index 2024 ('score', 'ranking', 'region_name', 'country_name' are used later)
#   df_iata         - airline list ('Territory', 'Airline Name' are used later)
#   df_visa         - visa openness ('Score', 'Country' are used later)
df_ookla = pd.read_excel(r"../../datasets/general/ookla_speed_test.xlsx")
df_global_peace = pd.read_excel(r"../../datasets/general/global_peace_index_2024.xlsx")
df_iata = pd.read_excel(r"../../datasets/research/iata_afr_airlines.xlsx")
df_visa= pd.read_excel(r"../../datasets/research/business_report/visa_open.xlsx")


In [20]:
def _fill_with_group_mean(values):
    """Replace the missing entries of one group's Series with that group's mean."""
    return values.fillna(values.mean())


# Peace index: fill missing scores with the mean score of the country's region.
df_global_peace['score'] = (
    df_global_peace.groupby('region_name')['score'].transform(_fill_with_group_mean)
)

# Internet speed: trim country names, align column names with the peace table,
# and keep only the countries that appear in the peace table.
df_ookla = (
    df_ookla
    .assign(Country=lambda frame: frame['Country'].str.strip())
    .rename(columns={'Country': 'country_name', 'Mbps': 'internet_speed'})
    .loc[lambda frame: frame['country_name'].isin(df_global_peace['country_name'])]
)

# Airlines: count the non-null airline names per territory.
df_iata = (
    df_iata
    .groupby('Territory')[['Airline Name']]
    .count()
    .reset_index()
    .rename(columns={'Territory': 'country_name', 'Airline Name': 'num_airlines'})
)

# Join both tables onto the peace index by country name, then derive the features.
# NOTE(review): the joins are on country *names*; spelling differences between
# sources would silently leave NaN (speed) or 0 (airlines) — worth verifying.
df_combined = (
    df_global_peace
    .merge(df_ookla[['country_name', 'internet_speed']], on=['country_name'], how='left')
    .merge(df_iata[['country_name', 'num_airlines']], on=['country_name'], how='left')
    .assign(
        # countries without a speed measurement get their region's mean speed
        internet_speed=lambda frame: frame.groupby('region_name')['internet_speed'].transform(_fill_with_group_mean),
        # countries with no airline in the IATA list count as zero
        num_airlines=lambda frame: frame['num_airlines'].fillna(0),
        # reciprocal of the peace score
        peace_inverted=lambda frame: 1 / frame['score'],
    )
    .drop(columns=['ranking', 'score'])
)


In [23]:
# Attach the visa-openness score to the combined country table.
# The cell is written to be safe to re-run: `rename` ignores keys that are already
# renamed, and any 'visa_score' left over from a previous run is dropped before the
# merge so pandas does not create 'visa_score_x' / 'visa_score_y' duplicates
# (which would break the later selection of 'visa_score').
df_visa = df_visa.rename(columns={'Score': 'visa_score', 'Country': 'country_name'})
df_combined = (
    df_combined
    .drop(columns=['visa_score'], errors='ignore')
    .merge(df_visa[['visa_score', 'country_name']], on='country_name', how='left')
)

In [43]:
# Indicators retained for the analysis
indicators_of_interest = [
    'Total Expenditure on Inbound Tourism', 'Total Expenditure on Outbound Tourism',
    'Total International Arrivals',
    'Total Number of Establishments', 'Total Number of Rooms', 'Total Number of Bed Places',
    'Total (Employment)',
]

# Keep rows for those indicators that actually carry a value
has_wanted_indicator = df_complete['indicator_ddt_name'].isin(indicators_of_interest)
filtered_data = df_complete.loc[has_wanted_indicator].dropna(subset=['indicator_value'])

# Order each country's rows from the most recent year to the oldest
df_sorted = filtered_data.sort_values(
    by=['iso3_country_code', 'indicator_year'],
    ascending=[True, False],
)

# The first row per (country, indicator) pair is now its most recent observation.
# Note that "most recent" can still be old: the DZA spot-check further down shows
# a 1997 value retained for 'Total Number of Rooms'.
df_latest = df_sorted.drop_duplicates(
    subset=['iso3_country_code', 'indicator_ddt_name'],
    keep='first',
)


In [84]:
# Spot-check: show the rows retained in df_latest for Algeria (DZA).
df_latest.query("""iso3_country_code == 'DZA'  """)

Unnamed: 0,iso3_country_code,indicator_year,indicator_value,indicator_ddt_name,indicator_source,indicator_ddt_cat,country_name,region_name,units
2138,DZA,2022,1398.0,Total International Arrivals,UNWTO,Inbound Tourism-Arrivals,Algeria,Northern Africa,Thousands
24421,DZA,2022,1576.0,Total Number of Establishments,UNWTO,Tourism Industries,Algeria,Northern Africa,Units
24422,DZA,2022,145526.0,Total Number of Bed Places,UNWTO,Tourism Industries,Algeria,Northern Africa,Units
25438,DZA,2022,219.1,Total Expenditure on Inbound Tourism,UNWTO,Inbound Tourism-Expenditure,Algeria,Northern Africa,US$ Millions
26684,DZA,2022,443.5,Total Expenditure on Outbound Tourism,UNWTO,Outbound Tourism-Expenditure,Algeria,Northern Africa,US$ Millions
25248,DZA,2019,320.0,Total (Employment),UNWTO,Employment,Algeria,Northern Africa,Thousands
20390,DZA,1997,32837.0,Total Number of Rooms,UNWTO,Tourism Industries,Algeria,Northern Africa,Units


In [44]:
# Median year of the retained observations, per indicator.
df_latest.groupby('indicator_ddt_name')['indicator_year'].median()

indicator_ddt_name
Total (Employment)                       2019.5
Total Expenditure on Inbound Tourism     2022.0
Total Expenditure on Outbound Tourism    2022.0
Total International Arrivals             2020.0
Total Number of Bed Places               2021.0
Total Number of Establishments           2021.0
Total Number of Rooms                    2021.0
Name: indicator_year, dtype: float64

In [96]:
# Reshape to one row per country with one column per indicator.
# df_latest holds a single row per (country, indicator), so the mean aggregation
# (pivot_table's default, spelled out here) just passes that value through.
country_keys = ['country_name', 'region_name', 'iso3_country_code']
pivot_df = pd.pivot_table(
    df_latest,
    index=country_keys,
    columns=['indicator_ddt_name'],
    values='indicator_value',
    aggfunc='mean',
)
pivot_df = pivot_df.reset_index()

In [97]:
# Employment is not used as a clustering feature. `errors='ignore'` keeps this cell
# re-runnable: pivot_df is overwritten here, so on a second run the column is
# already gone and a plain drop would raise KeyError.
pivot_df = pivot_df.drop(columns=['Total (Employment)'], errors='ignore')
# Keep only countries that report a number of establishments.
pivot_df_fil = pivot_df.dropna(subset=['Total Number of Establishments'])

In [98]:
# Add the connectivity / peace / visa features to the tourism indicators, matched
# on the ISO3 country code.
enrichment_columns = ['internet_speed', 'num_airlines', 'peace_inverted', 'visa_score']
# Dropping any enrichment columns left from a previous run makes the cell re-runnable:
# merging onto a frame that already has them would produce '_x' / '_y' suffixed
# duplicates instead of the expected column names.
# NOTE(review): assumes df_combined has one row per iso3_country_code; duplicates
# there would duplicate country rows here — confirm.
pivot_df_fil = (
    pivot_df_fil
    .drop(columns=enrichment_columns, errors='ignore')
    .merge(
        df_combined[['iso3_country_code'] + enrichment_columns],
        on='iso3_country_code',
        how='left',
    )
)

# Clustering

In [99]:
# Preview the feature table (tourism indicators plus the merged country features).
pivot_df_fil.head()

Unnamed: 0,country_name,region_name,iso3_country_code,Total Expenditure on Inbound Tourism,Total Expenditure on Outbound Tourism,Total International Arrivals,Total Number of Bed Places,Total Number of Establishments,Total Number of Rooms,internet_speed,num_airlines,peace_inverted,visa_score
0,Algeria,Northern Africa,DZA,219.1,443.5,1398.0,145526.0,1576.0,32837.0,14.26,2.0,0.411015,0.113
1,Angola,Central Africa,AGO,24.3,1897.8,218000.0,18590.0,247.0,15728.0,18.58,1.0,0.490196,0.189
2,Benin,Western Africa,BEN,246.6,97.35,354.0,42245.0,1315.0,17959.0,22.48,0.0,0.433651,1.0
3,Botswana,Southern Africa,BWA,373.6,103.9,338.0,25330.0,933.0,12690.0,8.53,1.0,0.536769,0.321
4,Burkina Faso,Western Africa,BFA,144.8,151.9,67000.0,33184.0,1091.0,16847.0,45.8,0.0,0.336814,0.487


In [100]:
# Keep only complete rows: a country missing any single column is removed.
pivot_df_fil = pivot_df_fil.dropna(axis=0, how='any')

In [101]:
# K-Means clustering of countries on the standardized tourism / connectivity features.
# (KMeans and StandardScaler are imported in the first cell of the notebook.)

N_CLUSTERS = 4
RANDOM_STATE = 42
# Columns that identify a country and must not be used as clustering features
ID_COLUMNS = ['iso3_country_code', 'region_name', 'country_name']
# Chad's expenditure values are rescaled by this factor before clustering
# (presumably reported in US$ rather than US$ millions — TODO confirm with the source).
TCD_EXPENDITURE_DIVISOR = 1000000
EXPENDITURE_COLUMNS = [
    'Total Expenditure on Inbound Tourism',
    'Total Expenditure on Outbound Tourism',
]

# Work on a copy so pivot_df_fil stays untouched and this cell can be re-run safely
cluster_df = pivot_df_fil.copy()
is_chad = cluster_df['iso3_country_code'] == 'TCD'
for expenditure_column in EXPENDITURE_COLUMNS:
    cluster_df.loc[is_chad, expenditure_column] /= TCD_EXPENDITURE_DIVISOR
# NOTE(review): the previews show 'Total International Arrivals' values such as
# 218000.0 (AGO) and 67000.0 (BFA) next to 1398.0 (DZA); these may be unit
# inconsistencies of the same kind — verify before trusting the clusters.

# Normalize the data - important for K-Means, which is distance based
scaler = StandardScaler()
df_normalized = scaler.fit_transform(cluster_df.drop(columns=ID_COLUMNS))

# Perform K-Means clustering.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; pin it once the target version is decided.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
cluster_df['Cluster'] = kmeans.fit_predict(df_normalized)

# Mapping cluster numbers to descriptive categories.
# K-Means cluster ids carry no inherent order: this hand-made mapping is only valid
# for the exact data, seed and library version it was derived from, and must be
# re-checked whenever any of those change.
cluster_mapping = {
    2: 'Nascent Tourism Market',        # could also be called "undeveloped"
    0: 'Emerging Tourism Industry',     # could also be called "developing"
    1: 'Established Tourism Industry',
    3: 'Advancing Tourism Economy',     # could also be called "emerging tourism leader"
}
cluster_df['Tourism Category'] = cluster_df['Cluster'].map(cluster_mapping)
# Fail loudly instead of silently producing NaN categories if a cluster id has no label
assert cluster_df['Tourism Category'].notna().all(), "cluster id without a label in cluster_mapping"

# Creating a final DataFrame with country and tourism category
final_df = cluster_df[['iso3_country_code', 'Tourism Category']]

# Display the final DataFrame
final_df.head()


Unnamed: 0,iso3_country_code,Tourism Category
0,DZA,Advancing Tourism Economy
1,AGO,Advancing Tourism Economy
2,BEN,Emerging Tourism Industry
3,BWA,Advancing Tourism Economy
4,BFA,Nascent Tourism Market


In [104]:
# Preview the clustered table, including the 'Cluster' id and 'Tourism Category' label.
cluster_df.head()

Unnamed: 0,country_name,region_name,iso3_country_code,Total Expenditure on Inbound Tourism,Total Expenditure on Outbound Tourism,Total International Arrivals,Total Number of Bed Places,Total Number of Establishments,Total Number of Rooms,internet_speed,num_airlines,peace_inverted,visa_score,Cluster,Tourism Category
0,Algeria,Northern Africa,DZA,219.1,443.5,1398.0,145526.0,1576.0,32837.0,14.26,2.0,0.411015,0.113,3,Advancing Tourism Economy
1,Angola,Central Africa,AGO,24.3,1897.8,218000.0,18590.0,247.0,15728.0,18.58,1.0,0.490196,0.189,3,Advancing Tourism Economy
2,Benin,Western Africa,BEN,246.6,97.35,354.0,42245.0,1315.0,17959.0,22.48,0.0,0.433651,1.0,0,Emerging Tourism Industry
3,Botswana,Southern Africa,BWA,373.6,103.9,338.0,25330.0,933.0,12690.0,8.53,1.0,0.536769,0.321,3,Advancing Tourism Economy
4,Burkina Faso,Western Africa,BFA,144.8,151.9,67000.0,33184.0,1091.0,16847.0,45.8,0.0,0.336814,0.487,2,Nascent Tourism Market


In [117]:
# Export the clustered table for the new platform's bubble chart dataset.
# NOTE(review): to_excel writes the DataFrame index as an extra first column by
# default; when read back it shows up as 'Unnamed: 0' (the same kind of column the
# first cell has to drop). Consider index=False if no consumer relies on it — confirm.
cluster_df.to_excel(r'../../datasets/new_platform/bubble/cluster_analysis.xlsx')

In [114]:
# plotly.io is not used in this cell; the import is kept in case later export
# cells rely on `pio` being defined.
import plotly.io as pio
import plotly.express as px

# Treemap of countries grouped by tourism category: tile size is the inbound
# tourism expenditure, tile colour is the region (discrete palette).
fig = px.treemap(
    cluster_df,
    path=['Tourism Category', 'country_name'],
    values='Total Expenditure on Inbound Tourism',
    color='region_name',
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    title='Inbound Tourism Expenditure by Tourism Category and Country',
)
# The colouring is discrete, so no continuous colour bar is configured here
# (the previous "Recency" colour-bar title was a leftover that did not apply).
fig.update_layout(
    margin=dict(t=50, l=25, r=25, b=25),
    font=dict(family="Futura, sans-serif", size=12, color="black"),
)
fig.show()
