In [1]:
# Bar Chart visualization to analyse the number of visitors by gender for each country.

# This Bar Chart analyzes visitor patterns by gender for China in 2019 and South Korea in 2021.

In [4]:
import pandas as pd
import plotly.express as px

In [5]:
# Load the datasets
china_visitors = pd.read_csv('China_cleaned_Visitor_Arrivals_2019_Data.csv')
south_korea_age = pd.read_csv('South_Korea_visitor_Arrivalsby_age___by_nationality.csv')
south_korea_gender = pd.read_csv('South_Korea_visitor_Arrivalsby_gender___by_nationality.csv')

In [6]:
# Inspect the columns and the first few rows of each dataset
print("China Visitors Data Columns:")
print(china_visitors.columns)

print("\nChina Visitors Data Sample:")
print(china_visitors.head())

print("\nSouth Korea Visitors by Age Data Columns:")
print(south_korea_age.columns)

print("\nSouth Korea Visitors by Age Data Sample:")
print(south_korea_age.head())

print("\nSouth Korea Visitors by Gender Data Columns:")
print(south_korea_gender.columns)

print("\nSouth Korea Visitors by Gender Data Sample:")
print(south_korea_gender.head())

China Visitors Data Columns:
Index(['Item', '2019'], dtype='object')

China Visitors Data Sample:
           Item     2019
0         Total  4911.36
1          Male  2881.29
2        Female  2030.07
3  14 and under   184.92
4         15-24   686.20

South Korea Visitors by Age Data Columns:
Index(['Year', '2021', '2021.1', '2021.2', '2021.3', '2021.4', '2021.5',
       '2021.6', '2021.7'],
      dtype='object')

South Korea Visitors by Age Data Sample:
                Year   2021 2021.1 2021.2 2021.3 2021.4 2021.5       2021.6  \
0  By nationality(2)  Total   0-20  21-30  31-40  41-50  51-60  61 and Over   
1              Total  90150   5677  14043  12866   8002   6135         3456   
2              China  11691    341   1437   1902   1184   1323          669   
3              Japan   1007     86    235    173    184    123           58   
4             Taiwan    290      7     55     47     28     12           10   

  2021.7  
0   Crew  
1  39971  
2   4835  
3    148  
4    131  

So

In [7]:
# Check for missing values and data types
print("China Visitors Data Info:")
print(china_visitors.info())

print("\nSouth Korea Visitors by Gender Data Info:")
print(south_korea_gender.info())

# Standardize column names if necessary
china_visitors.columns = china_visitors.columns.str.strip().str.replace(' ', '_').str.lower()
south_korea_gender.columns = south_korea_gender.columns.str.strip().str.replace(' ', '_').str.lower()

print("Standardized China Visitors Data Columns:", china_visitors.columns)
print("Standardized South Korea Visitors by Gender Data Columns:", south_korea_gender.columns)

China Visitors Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Item    9 non-null      object 
 1   2019    9 non-null      float64
dtypes: float64(1), object(1)
memory usage: 276.0+ bytes
None

South Korea Visitors by Gender Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Year    63 non-null     object
 1   2021    63 non-null     object
 2   2021.1  63 non-null     object
 3   2021.2  63 non-null     object
dtypes: object(4)
memory usage: 2.1+ KB
None
Standardized China Visitors Data Columns: Index(['item', '2019'], dtype='object')
Standardized South Korea Visitors by Gender Data Columns: Index(['year', '2021', '2021.1', '2021.2'], dtype='object')


In [8]:
# Summarize the number of visitors by gender for each dataset
# Adjust the column names based on the actual data structure

# For China Visitors Data
china_gender_summary = china_visitors[china_visitors['item'].isin(['Male', 'Female'])].copy()
china_gender_summary.columns = ['gender', 'number_of_visitors']
china_gender_summary['country'] = 'China (2019)'

# For South Korea Visitors by Gender Data
south_korea_gender_summary = south_korea_gender[['year', '2021.1', '2021.2']].copy()
south_korea_gender_summary = south_korea_gender_summary[south_korea_gender_summary['year'] == 'Total']
south_korea_gender_summary = south_korea_gender_summary.melt(id_vars=['year'], value_vars=['2021.1', '2021.2'],
                                                             var_name='gender', value_name='number_of_visitors')
south_korea_gender_summary['gender'] = south_korea_gender_summary['gender'].replace({'2021.1': 'Male', '2021.2': 'Female'})
south_korea_gender_summary['country'] = 'South Korea (2021)'

# Combine the two summaries
combined_gender_summary = pd.concat([china_gender_summary, south_korea_gender_summary[['gender', 'number_of_visitors', 'country']]])

# Debug print statements
print("\nCombined Gender Summary:")
print(combined_gender_summary)


Combined Gender Summary:
   gender number_of_visitors             country
1    Male            2881.29        China (2019)
2  Female            2030.07        China (2019)
0    Male              32854  South Korea (2021)
1  Female              17325  South Korea (2021)


In [9]:
# Visualization
# Bar chart for gender comparison for both China and South Korea
fig_combined_gender = px.bar(combined_gender_summary, x='gender', y='number_of_visitors', color='country',
                             title='Visitor Arrivals by Gender for China (2019) and South Korea (2021)',
                             labels={'number_of_visitors': 'Number of Visitors', 'gender': 'Gender', 'country': 'Country'})

# Add annotations or labels to give context about the years
fig_combined_gender.update_layout(
    xaxis_title="Gender",
    yaxis_title="Number of Visitors",
    legend_title="Country",
    title_x=0.5
)

fig_combined_gender.show()