In [1]:
# Bar Chart visualization to analyse the number of visitors by age for each country.

# This Bar Chart analyzes visitor patterns by age for China in 2019 and South Korea in 2021.

In [2]:
import pandas as pd
import plotly.express as px

In [3]:
# Load the datasets
china_visitors = pd.read_csv('China_cleaned_Visitor_Arrivals_2019_Data.csv')
south_korea_age = pd.read_csv('South_Korea_visitor_Arrivalsby_age___by_nationality.csv')

In [4]:
# Inspect the columns and the first few rows of each dataset
print("China Visitors Data Columns:")
print(china_visitors.columns)

print("\nChina Visitors Data Sample:")
print(china_visitors.head())

print("\nSouth Korea Visitors by Age Data Columns:")
print(south_korea_age.columns)

print("\nSouth Korea Visitors by Age Data Sample:")
print(south_korea_age.head())

# Check for missing values and data types
print("China Visitors Data Info:")
print(china_visitors.info())

print("\nSouth Korea Visitors by Age Data Info:")
print(south_korea_age.info())

China Visitors Data Columns:
Index(['Item', '2019'], dtype='object')

China Visitors Data Sample:
           Item     2019
0         Total  4911.36
1          Male  2881.29
2        Female  2030.07
3  14 and under   184.92
4         15-24   686.20

South Korea Visitors by Age Data Columns:
Index(['Year', '2021', '2021.1', '2021.2', '2021.3', '2021.4', '2021.5',
       '2021.6', '2021.7'],
      dtype='object')

South Korea Visitors by Age Data Sample:
                Year   2021 2021.1 2021.2 2021.3 2021.4 2021.5       2021.6  \
0  By nationality(2)  Total   0-20  21-30  31-40  41-50  51-60  61 and Over   
1              Total  90150   5677  14043  12866   8002   6135         3456   
2              China  11691    341   1437   1902   1184   1323          669   
3              Japan   1007     86    235    173    184    123           58   
4             Taiwan    290      7     55     47     28     12           10   

  2021.7  
0   Crew  
1  39971  
2   4835  
3    148  
4    131  
Chi

In [5]:
# Standardize column names if necessary
china_visitors.columns = china_visitors.columns.str.strip().str.replace(' ', '_').str.lower()
south_korea_age.columns = south_korea_age.columns.str.strip().str.replace(' ', '_').str.lower()

print("Standardized China Visitors Data Columns:", china_visitors.columns)
print("Standardized South Korea Visitors Data Columns:", south_korea_age.columns)

Standardized China Visitors Data Columns: Index(['item', '2019'], dtype='object')
Standardized South Korea Visitors Data Columns: Index(['year', '2021', '2021.1', '2021.2', '2021.3', '2021.4', '2021.5',
       '2021.6', '2021.7'],
      dtype='object')


In [6]:
# Ensure "65 and Over" is included for China
if '65 and over' not in china_visitors['item'].values:
    china_visitors = pd.concat([china_visitors, pd.DataFrame({'item': ['65 and over'], '2019': [234.77]})], ignore_index=True)

In [7]:
# Summarize the age distribution for each dataset
# For China Visitors Data
china_age_summary = china_visitors[china_visitors['item'].isin(['14 and under', '15-24', '25-44', '45-64', '65 and over'])].copy()
china_age_summary.columns = ['age_group', 'number_of_visitors']
china_age_summary['country'] = 'China (2019)'

# Debug: Check if "65 and over" is included
print("\nChina Age Summary (Check for '65 and over'):")
print(china_age_summary[china_age_summary['age_group'] == '65 and over'])

# For South Korea Visitors by Age Data
south_korea_age_summary = south_korea_age[south_korea_age['year'] == 'Total'].copy()
south_korea_age_summary = south_korea_age_summary.drop(columns=['2021'])  # Drop the total number


China Age Summary (Check for '65 and over'):
     age_group  number_of_visitors       country
9  65 and over              234.77  China (2019)


In [8]:
# Melt the DataFrame
south_korea_age_summary = south_korea_age_summary.melt(id_vars=['year'], value_vars=['2021.1', '2021.2', '2021.3', '2021.4', '2021.5', '2021.6', '2021.7'],
                                                       var_name='age_group', value_name='number_of_visitors')

# Define the mapping from South Korea age groups to their original ranges
age_group_mapping_sk = {
    '2021.1': '0-20',
    '2021.2': '21-30',
    '2021.3': '31-40',
    '2021.4': '41-50',
    '2021.5': '51-60',
    '2021.6': '61 and Over',
    '2021.7': '61 and Over'
}

# Apply the mapping to South Korea age groups
south_korea_age_summary['age_group'] = south_korea_age_summary['age_group'].map(age_group_mapping_sk)
south_korea_age_summary['country'] = 'South Korea (2021)'

# Convert the number_of_visitors column to numeric
south_korea_age_summary['number_of_visitors'] = pd.to_numeric(south_korea_age_summary['number_of_visitors'], errors='coerce')

# Correct the "61 and Over" values aggregation
south_korea_age_summary = south_korea_age_summary.groupby(['age_group', 'country']).sum().reset_index()

# Debug: Check if "61 and Over" is correctly aggregated
print("\nSouth Korea Age Summary (Check for '61 and Over'):")
print(south_korea_age_summary[south_korea_age_summary['age_group'] == '61 and Over'])


South Korea Age Summary (Check for '61 and Over'):
     age_group             country        year  number_of_visitors
5  61 and Over  South Korea (2021)  TotalTotal               43427


In [9]:
# Combine the two summaries
combined_age_summary = pd.concat([china_age_summary, south_korea_age_summary])

# Ensure the age groups are sorted logically for each country
china_age_group_order = ['14 and under', '15-24', '25-44', '45-64', '65 and over']
south_korea_age_group_order = ['0-20', '21-30', '31-40', '41-50', '51-60', '61 and Over']

combined_age_summary['age_group'] = pd.Categorical(combined_age_summary['age_group'], categories=china_age_group_order + south_korea_age_group_order, ordered=True)
combined_age_summary = combined_age_summary.sort_values(['country', 'age_group'])

# Check for any missing age groups
missing_china_ages = set(china_age_group_order) - set(china_age_summary['age_group'])
missing_sk_ages = set(south_korea_age_group_order) - set(south_korea_age_summary['age_group'])

if missing_china_ages:
    for age_group in missing_china_ages:
        combined_age_summary = pd.concat([combined_age_summary, pd.DataFrame({'age_group': [age_group], 'number_of_visitors': [0], 'country': ['China (2019)']})])

if missing_sk_ages:
    for age_group in missing_sk_ages:
        combined_age_summary = pd.concat([combined_age_summary, pd.DataFrame({'age_group': [age_group], 'number_of_visitors': [0], 'country': ['South Korea (2021)']})])

# Debug: Check the combined age summary for completeness
print("\nCombined Age Summary after adding missing groups:")
print(combined_age_summary)


Combined Age Summary after adding missing groups:
      age_group  number_of_visitors             country        year
3  14 and under              184.92        China (2019)         NaN
4         15-24              686.20        China (2019)         NaN
5         25-44             2439.71        China (2019)         NaN
6         45-64             1365.75        China (2019)         NaN
9   65 and over              234.77        China (2019)         NaN
0          0-20             5677.00  South Korea (2021)       Total
1         21-30            14043.00  South Korea (2021)       Total
2         31-40            12866.00  South Korea (2021)       Total
3         41-50             8002.00  South Korea (2021)       Total
4         51-60             6135.00  South Korea (2021)       Total
5   61 and Over            43427.00  South Korea (2021)  TotalTotal


In [10]:
# Prepare the data for plotting
fig = px.bar(combined_age_summary, x='age_group', y='number_of_visitors', color='country', barmode='group',
             title='Visitor Age Distribution for China (2019) and South Korea (2021)',
             labels={'number_of_visitors': 'Number of Visitors', 'age_group': 'Age Group', 'country': 'Country'})

# Add annotations or labels to give context about the years
fig.update_layout(
    xaxis_title="Age Group",
    yaxis_title="Number of Visitors",
    legend_title="Country",
    title_x=0.5
)

fig.show()