In [17]:
# Question 3: Seasonal Tourism Patterns Preprocessing
# Are certain regions more affected by seasonal tourism trends than others?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("Libraries imported successfully!")


Libraries imported successfully!


In [18]:
# Load the trends data
df_trends = pd.read_csv('../TouristArrival/trends.csv')

print("Data loaded successfully!")
print(f"\nDataset shape: {df_trends.shape}")
print(f"\nFirst few rows:")
df_trends.head()


Data loaded successfully!

Dataset shape: (104, 15)

First few rows:


Unnamed: 0,date,Paris,Barcelona,Tokyo,New York,London,Rome,Amsterdam,Sydney,Bangkok,Istanbul,Cairo,Rio de Janeiro,Venice,Los Angeles
0,2023-12-31,0.36,0.34,0.34,0.33,0.47,0.57,0.56,0.56,0.5,0.31,0.24,0.21,0.66,0.33
1,2024-01-07,0.35,0.34,0.29,0.33,0.42,0.56,0.46,0.49,0.46,0.29,0.3,0.22,0.55,0.29
2,2024-01-14,0.36,0.35,0.32,0.27,0.41,0.54,0.51,0.42,0.41,0.34,0.3,0.2,0.58,0.29
3,2024-01-21,0.34,0.37,0.28,0.28,0.42,0.53,0.45,0.42,0.42,0.28,0.26,0.2,0.57,0.27
4,2024-01-28,0.37,0.37,0.3,0.26,0.43,0.55,0.47,0.41,0.42,0.32,0.22,0.18,0.59,0.28


In [19]:
# Check data info and missing values
print("Data Info:")
print(df_trends.info())
print("\n" + "="*50)
print("\nMissing Values:")
print(df_trends.isnull().sum())
print("\n" + "="*50)
print("\nData Statistics:")
df_trends.describe()


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            104 non-null    object 
 1   Paris           104 non-null    float64
 2   Barcelona       104 non-null    float64
 3   Tokyo           104 non-null    float64
 4   New York        104 non-null    float64
 5   London          104 non-null    float64
 6   Rome            104 non-null    float64
 7   Amsterdam       104 non-null    float64
 8   Sydney          104 non-null    float64
 9   Bangkok         104 non-null    float64
 10  Istanbul        104 non-null    float64
 11  Cairo           104 non-null    float64
 12  Rio de Janeiro  104 non-null    float64
 13  Venice          104 non-null    float64
 14  Los Angeles     104 non-null    float64
dtypes: float64(14), object(1)
memory usage: 12.3+ KB
None


Missing Values:
date              0
Paris             0
Ba

Unnamed: 0,Paris,Barcelona,Tokyo,New York,London,Rome,Amsterdam,Sydney,Bangkok,Istanbul,Cairo,Rio de Janeiro,Venice,Los Angeles
count,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0
mean,0.45625,0.413077,0.423269,0.374231,0.496442,0.617788,0.489904,0.467788,0.473846,0.366538,0.359519,0.290288,0.597019,0.361827
std,0.163221,0.15855,0.152808,0.171054,0.13451,0.117018,0.118498,0.105995,0.127822,0.15837,0.210036,0.176489,0.114348,0.162569
min,0.3,0.25,0.28,0.24,0.37,0.38,0.32,0.35,0.34,0.21,0.15,0.11,0.34,0.22
25%,0.36,0.33,0.33,0.27,0.4175,0.5575,0.4275,0.4,0.39,0.28,0.24,0.19,0.54,0.27
50%,0.39,0.36,0.35,0.3,0.44,0.59,0.45,0.42,0.43,0.315,0.27,0.22,0.58,0.29
75%,0.445,0.39,0.43,0.36,0.48,0.66,0.51,0.53,0.4925,0.3525,0.34,0.28,0.65,0.3875
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Step 2: Reshape Data

Convert dataset from wide format to long format with columns: **date | city | tourism_index**

**Why:** Required for time-series aggregation and grouping


In [20]:
# Step 2: Reshape data from wide to long format
# Convert from: date | Paris | Barcelona | Tokyo | ... (wide format)
# To: date | city | tourism_index (long format)

# First, ensure date is datetime
df_trends['date'] = pd.to_datetime(df_trends['date'])

# Get all city columns (all columns except 'date')
city_columns = [col for col in df_trends.columns if col != 'date']

# Melt the dataframe to long format
df_long = pd.melt(
    df_trends,
    id_vars=['date'],
    value_vars=city_columns,
    var_name='city',
    value_name='tourism_index'
)

# Sort by date and city for better organization
df_long = df_long.sort_values(['date', 'city']).reset_index(drop=True)

print("Data reshaped successfully!")
print(f"\nOriginal shape (wide): {df_trends.shape}")
print(f"New shape (long): {df_long.shape}")
print(f"\nColumns: {list(df_long.columns)}")
print(f"\nFirst 15 rows:")
print(df_long.head(15))
print(f"\nLast 15 rows:")
print(df_long.tail(15))


Data reshaped successfully!

Original shape (wide): (104, 15)
New shape (long): (1456, 3)

Columns: ['date', 'city', 'tourism_index']

First 15 rows:
         date            city  tourism_index
0  2023-12-31       Amsterdam           0.56
1  2023-12-31         Bangkok           0.50
2  2023-12-31       Barcelona           0.34
3  2023-12-31           Cairo           0.24
4  2023-12-31        Istanbul           0.31
5  2023-12-31          London           0.47
6  2023-12-31     Los Angeles           0.33
7  2023-12-31        New York           0.33
8  2023-12-31           Paris           0.36
9  2023-12-31  Rio de Janeiro           0.21
10 2023-12-31            Rome           0.57
11 2023-12-31          Sydney           0.56
12 2023-12-31           Tokyo           0.34
13 2023-12-31          Venice           0.66
14 2024-01-07       Amsterdam           0.46

Last 15 rows:
           date            city  tourism_index
1441 2025-12-14          Venice           0.38
1442 2025-12-21      

In [21]:
# Verify the reshaped data structure
print("Data Structure Verification:")
print(f"\n1. Column names: {list(df_long.columns)}")
print(f"\n2. Data types:")
print(df_long.dtypes)
print(f"\n3. Number of unique cities: {df_long['city'].nunique()}")
print(f"   Cities: {sorted(df_long['city'].unique())}")
print(f"\n4. Date range: {df_long['date'].min()} to {df_long['date'].max()}")
print(f"   Total dates: {df_long['date'].nunique()}")
print(f"\n5. Total records: {len(df_long)}")
print(f"   Expected: {df_long['date'].nunique()} dates × {df_long['city'].nunique()} cities = {df_long['date'].nunique() * df_long['city'].nunique()}")
print(f"\n6. Missing values:")
print(df_long.isnull().sum())
print(f"\n7. Tourism index statistics:")
print(df_long['tourism_index'].describe())


Data Structure Verification:

1. Column names: ['date', 'city', 'tourism_index']

2. Data types:
date             datetime64[ns]
city                     object
tourism_index           float64
dtype: object

3. Number of unique cities: 14
   Cities: ['Amsterdam', 'Bangkok', 'Barcelona', 'Cairo', 'Istanbul', 'London', 'Los Angeles', 'New York', 'Paris', 'Rio de Janeiro', 'Rome', 'Sydney', 'Tokyo', 'Venice']

4. Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00
   Total dates: 104

5. Total records: 1456
   Expected: 104 dates × 14 cities = 1456

6. Missing values:
date             0
city             0
tourism_index    0
dtype: int64

7. Tourism index statistics:
count    1456.000000
mean        0.441985
std         0.174166
min         0.110000
25%         0.310000
50%         0.410000
75%         0.550000
max         1.000000
Name: tourism_index, dtype: float64


In [22]:
# Extract temporal features for seasonal analysis
# Add temporal features to the long format data
df_long['year'] = df_long['date'].dt.year
df_long['month'] = df_long['date'].dt.month
df_long['quarter'] = df_long['date'].dt.quarter
df_long['day_of_year'] = df_long['date'].dt.dayofyear
df_long['week_of_year'] = df_long['date'].dt.isocalendar().week

print("Temporal features added to long format data!")
print(f"\nDate range: {df_long['date'].min()} to {df_long['date'].max()}")
print(f"\nColumns: {list(df_long.columns)}")
print(f"\nSample with new features:")
df_long[['date', 'city', 'tourism_index', 'year', 'month', 'quarter', 'week_of_year']].head(10)


Temporal features added to long format data!

Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

Columns: ['date', 'city', 'tourism_index', 'year', 'month', 'quarter', 'day_of_year', 'week_of_year']

Sample with new features:


Unnamed: 0,date,city,tourism_index,year,month,quarter,week_of_year
0,2023-12-31,Amsterdam,0.56,2023,12,4,52
1,2023-12-31,Bangkok,0.5,2023,12,4,52
2,2023-12-31,Barcelona,0.34,2023,12,4,52
3,2023-12-31,Cairo,0.24,2023,12,4,52
4,2023-12-31,Istanbul,0.31,2023,12,4,52
5,2023-12-31,London,0.47,2023,12,4,52
6,2023-12-31,Los Angeles,0.33,2023,12,4,52
7,2023-12-31,New York,0.33,2023,12,4,52
8,2023-12-31,Paris,0.36,2023,12,4,52
9,2023-12-31,Rio de Janeiro,0.21,2023,12,4,52


In [23]:
# Map cities to continents
# Define city to continent mapping
city_to_continent = {
    # Europe
    'Paris': 'Europe',
    'Barcelona': 'Europe',
    'London': 'Europe',
    'Rome': 'Europe',
    'Amsterdam': 'Europe',
    'Venice': 'Europe',
    
    # Asia
    'Tokyo': 'Asia',
    'Bangkok': 'Asia',
    'Istanbul': 'Asia',  # Transcontinental, but primarily in Asia
    
    # North America
    'New York': 'North America',
    'Los Angeles': 'North America',
    
    # South America
    'Rio de Janeiro': 'South America',
    
    # Africa
    'Cairo': 'Africa',
    
    # Oceania
    'Sydney': 'Oceania'
}

print("City to Continent Mapping:")
for city, continent in city_to_continent.items():
    print(f"{city:20s} -> {continent}")

# Add continent column to df_long
df_long['continent'] = df_long['city'].map(city_to_continent)

# Verify all cities in the dataset are mapped
cities_in_data = df_long['city'].unique()
print(f"\nCities in dataset: {sorted(cities_in_data)}")
print(f"\nAll cities mapped: {all(city in city_to_continent for city in cities_in_data)}")
print(f"\nSample data with continent:")
df_long[['date', 'city', 'continent', 'tourism_index']].head(10)


City to Continent Mapping:
Paris                -> Europe
Barcelona            -> Europe
London               -> Europe
Rome                 -> Europe
Amsterdam            -> Europe
Venice               -> Europe
Tokyo                -> Asia
Bangkok              -> Asia
Istanbul             -> Asia
New York             -> North America
Los Angeles          -> North America
Rio de Janeiro       -> South America
Cairo                -> Africa
Sydney               -> Oceania

Cities in dataset: ['Amsterdam', 'Bangkok', 'Barcelona', 'Cairo', 'Istanbul', 'London', 'Los Angeles', 'New York', 'Paris', 'Rio de Janeiro', 'Rome', 'Sydney', 'Tokyo', 'Venice']

All cities mapped: True

Sample data with continent:


Unnamed: 0,date,city,continent,tourism_index
0,2023-12-31,Amsterdam,Europe,0.56
1,2023-12-31,Bangkok,Asia,0.5
2,2023-12-31,Barcelona,Europe,0.34
3,2023-12-31,Cairo,Africa,0.24
4,2023-12-31,Istanbul,Asia,0.31
5,2023-12-31,London,Europe,0.47
6,2023-12-31,Los Angeles,North America,0.33
7,2023-12-31,New York,North America,0.33
8,2023-12-31,Paris,Europe,0.36
9,2023-12-31,Rio de Janeiro,South America,0.21


In [24]:
# Data is already in long format from Step 2
# Verify the current structure
print("Current data structure (already in long format):")
print(f"\nColumns: {list(df_long.columns)}")
print(f"\nShape: {df_long.shape}")
print(f"\nSample data:")
df_long.head(10)


Current data structure (already in long format):

Columns: ['date', 'city', 'tourism_index', 'year', 'month', 'quarter', 'day_of_year', 'week_of_year', 'continent']

Shape: (1456, 9)

Sample data:


Unnamed: 0,date,city,tourism_index,year,month,quarter,day_of_year,week_of_year,continent
0,2023-12-31,Amsterdam,0.56,2023,12,4,365,52,Europe
1,2023-12-31,Bangkok,0.5,2023,12,4,365,52,Asia
2,2023-12-31,Barcelona,0.34,2023,12,4,365,52,Europe
3,2023-12-31,Cairo,0.24,2023,12,4,365,52,Africa
4,2023-12-31,Istanbul,0.31,2023,12,4,365,52,Asia
5,2023-12-31,London,0.47,2023,12,4,365,52,Europe
6,2023-12-31,Los Angeles,0.33,2023,12,4,365,52,North America
7,2023-12-31,New York,0.33,2023,12,4,365,52,North America
8,2023-12-31,Paris,0.36,2023,12,4,365,52,Europe
9,2023-12-31,Rio de Janeiro,0.21,2023,12,4,365,52,South America


In [25]:
# Check for missing values in the long format
print("Missing values check:")
print(df_long.isnull().sum())
print(f"\nTotal missing values: {df_long.isnull().sum().sum()}")

# Check for any invalid tourism_index values (should be between 0 and 1 based on normalization)
print(f"\nTourism index statistics:")
print(df_long['tourism_index'].describe())
print(f"\nValues outside [0, 1] range: {((df_long['tourism_index'] < 0) | (df_long['tourism_index'] > 1)).sum()}")

# Handle any missing values if present (forward fill for time series)
if df_long['tourism_index'].isnull().sum() > 0:
    print("\nFilling missing values using forward fill...")
    df_long = df_long.sort_values(['city', 'date'])
    df_long['tourism_index'] = df_long.groupby('city')['tourism_index'].ffill()
    df_long['tourism_index'] = df_long.groupby('city')['tourism_index'].bfill()
    print("Missing values handled!")


Missing values check:
date             0
city             0
tourism_index    0
year             0
month            0
quarter          0
day_of_year      0
week_of_year     0
continent        0
dtype: int64

Total missing values: 0

Tourism index statistics:
count    1456.000000
mean        0.441985
std         0.174166
min         0.110000
25%         0.310000
50%         0.410000
75%         0.550000
max         1.000000
Name: tourism_index, dtype: float64

Values outside [0, 1] range: 0


In [26]:
# Split data by continents
continents = df_long['continent'].unique()
continent_data = {}

for continent in continents:
    continent_data[continent] = df_long[df_long['continent'] == continent].copy()
    print(f"\n{continent}:")
    print(f"  - Number of cities: {continent_data[continent]['city'].nunique()}")
    print(f"  - Cities: {list(continent_data[continent]['city'].unique())}")
    print(f"  - Number of records: {len(continent_data[continent])}")
    print(f"  - Date range: {continent_data[continent]['date'].min()} to {continent_data[continent]['date'].max()}")

print(f"\n\nTotal continents: {len(continents)}")
print(f"Continents: {sorted(continents)}")



Europe:
  - Number of cities: 6
  - Cities: ['Amsterdam', 'Barcelona', 'London', 'Paris', 'Rome', 'Venice']
  - Number of records: 624
  - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

Asia:
  - Number of cities: 3
  - Cities: ['Bangkok', 'Istanbul', 'Tokyo']
  - Number of records: 312
  - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

Africa:
  - Number of cities: 1
  - Cities: ['Cairo']
  - Number of records: 104
  - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

North America:
  - Number of cities: 2
  - Cities: ['Los Angeles', 'New York']
  - Number of records: 208
  - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

South America:
  - Number of cities: 1
  - Cities: ['Rio de Janeiro']
  - Number of records: 104
  - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

Oceania:
  - Number of cities: 1
  - Cities: ['Sydney']
  - Number of records: 104
  - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00


Total continents: 6
Continents: ['

In [27]:
# Calculate seasonal statistics by continent
# This will help identify which regions are more affected by seasonal trends

seasonal_stats = []

for continent in continents:
    continent_df = continent_data[continent]
    
    # Calculate statistics
    stats = {
        'continent': continent,
        'num_cities': continent_df['city'].nunique(),
        'mean_trend': continent_df['tourism_index'].mean(),
        'std_trend': continent_df['tourism_index'].std(),
        'min_trend': continent_df['tourism_index'].min(),
        'max_trend': continent_df['tourism_index'].max(),
        'trend_range': continent_df['tourism_index'].max() - continent_df['tourism_index'].min(),
        'coefficient_of_variation': (continent_df['tourism_index'].std() / continent_df['tourism_index'].mean()) * 100
    }
    
    # Calculate seasonal variation (difference between max and min by month)
    monthly_stats = continent_df.groupby('month')['tourism_index'].agg(['mean', 'std', 'min', 'max'])
    stats['monthly_range'] = monthly_stats['max'].max() - monthly_stats['min'].min()
    stats['monthly_std_mean'] = monthly_stats['std'].mean()
    
    # Calculate quarterly variation
    quarterly_stats = continent_df.groupby('quarter')['tourism_index'].agg(['mean', 'std', 'min', 'max'])
    stats['quarterly_range'] = quarterly_stats['max'].max() - quarterly_stats['min'].min()
    
    seasonal_stats.append(stats)

df_seasonal_stats = pd.DataFrame(seasonal_stats)
df_seasonal_stats = df_seasonal_stats.sort_values('coefficient_of_variation', ascending=False)

print("Seasonal Statistics by Continent:")
print("="*80)
print(df_seasonal_stats.to_string(index=False))


Seasonal Statistics by Continent:
    continent  num_cities  mean_trend  std_trend  min_trend  max_trend  trend_range  coefficient_of_variation  monthly_range  monthly_std_mean  quarterly_range
South America           1    0.290288   0.176489       0.11        1.0         0.89                 60.797891           0.89          0.119570             0.89
       Africa           1    0.359519   0.210036       0.15        1.0         0.85                 58.421472           0.85          0.136314             0.85
North America           2    0.368029   0.166578       0.22        1.0         0.78                 45.262245           0.78          0.105501             0.78
         Asia           3    0.421218   0.152901       0.21        1.0         0.79                 36.299627           0.79          0.111480             0.79
       Europe           6    0.511747   0.153780       0.25        1.0         0.75                 30.049928           0.75          0.131184             0.75
      

In [28]:
# Create aggregated dataframes for each continent (monthly averages)
continent_monthly_avg = {}

for continent in continents:
    continent_df = continent_data[continent]
    
    # Calculate monthly average tourism_index values for each continent
    monthly_avg = continent_df.groupby(['year', 'month', 'continent'])['tourism_index'].mean().reset_index()
    monthly_avg['date'] = pd.to_datetime(monthly_avg[['year', 'month']].assign(day=1))
    
    continent_monthly_avg[continent] = monthly_avg.sort_values('date')
    
    print(f"\n{continent} - Monthly Average Trends:")
    print(continent_monthly_avg[continent].head(10))

# Combine all continent monthly averages
df_continent_monthly = pd.concat(continent_monthly_avg.values(), ignore_index=True)
df_continent_monthly = df_continent_monthly.sort_values(['continent', 'date'])

print(f"\n\nCombined continent monthly data shape: {df_continent_monthly.shape}")
df_continent_monthly.head(15)



Europe - Monthly Average Trends:
   year  month continent  tourism_index       date
0  2023     12    Europe       0.493333 2023-12-01
1  2024      1    Europe       0.453750 2024-01-01
2  2024      2    Europe       0.465417 2024-02-01
3  2024      3    Europe       0.472000 2024-03-01
4  2024      4    Europe       0.490000 2024-04-01
5  2024      5    Europe       0.482500 2024-05-01
6  2024      6    Europe       0.464333 2024-06-01
7  2024      7    Europe       0.477083 2024-07-01
8  2024      8    Europe       0.462917 2024-08-01
9  2024      9    Europe       0.438667 2024-09-01

Asia - Monthly Average Trends:
   year  month continent  tourism_index       date
0  2023     12      Asia       0.383333 2023-12-01
1  2024      1      Asia       0.344167 2024-01-01
2  2024      2      Asia       0.336667 2024-02-01
3  2024      3      Asia       0.353333 2024-03-01
4  2024      4      Asia       0.355000 2024-04-01
5  2024      5      Asia       0.370000 2024-05-01
6  2024      6  


North America - Monthly Average Trends:
   year  month      continent  tourism_index       date
0  2023     12  North America        0.33000 2023-12-01
1  2024      1  North America        0.28375 2024-01-01
2  2024      2  North America        0.28250 2024-02-01
3  2024      3  North America        0.27600 2024-03-01
4  2024      4  North America        0.27625 2024-04-01
5  2024      5  North America        0.29875 2024-05-01
6  2024      6  North America        0.29500 2024-06-01
7  2024      7  North America        0.30000 2024-07-01
8  2024      8  North America        0.29875 2024-08-01
9  2024      9  North America        0.29000 2024-09-01

South America - Monthly Average Trends:
   year  month      continent  tourism_index       date
0  2023     12  South America         0.2100 2023-12-01
1  2024      1  South America         0.2000 2024-01-01
2  2024      2  South America         0.1925 2024-02-01
3  2024      3  South America         0.1960 2024-03-01
4  2024      4  South 

Unnamed: 0,year,month,continent,tourism_index,date
50,2023,12,Africa,0.24,2023-12-01
51,2024,1,Africa,0.27,2024-01-01
52,2024,2,Africa,0.2575,2024-02-01
53,2024,3,Africa,0.232,2024-03-01
54,2024,4,Africa,0.2475,2024-04-01
55,2024,5,Africa,0.23,2024-05-01
56,2024,6,Africa,0.198,2024-06-01
57,2024,7,Africa,0.2325,2024-07-01
58,2024,8,Africa,0.22,2024-08-01
59,2024,9,Africa,0.252,2024-09-01


In [29]:
# Create city-level monthly averages for detailed analysis
city_monthly_avg = df_long.groupby(['city', 'continent', 'year', 'month'])['tourism_index'].mean().reset_index()
city_monthly_avg['date'] = pd.to_datetime(city_monthly_avg[['year', 'month']].assign(day=1))
city_monthly_avg = city_monthly_avg.sort_values(['continent', 'city', 'date'])

print("City-level Monthly Averages:")
print(f"Shape: {city_monthly_avg.shape}")
city_monthly_avg.head(20)


City-level Monthly Averages:
Shape: (350, 6)


Unnamed: 0,city,continent,year,month,tourism_index,date
75,Cairo,Africa,2023,12,0.24,2023-12-01
76,Cairo,Africa,2024,1,0.27,2024-01-01
77,Cairo,Africa,2024,2,0.2575,2024-02-01
78,Cairo,Africa,2024,3,0.232,2024-03-01
79,Cairo,Africa,2024,4,0.2475,2024-04-01
80,Cairo,Africa,2024,5,0.23,2024-05-01
81,Cairo,Africa,2024,6,0.198,2024-06-01
82,Cairo,Africa,2024,7,0.2325,2024-07-01
83,Cairo,Africa,2024,8,0.22,2024-08-01
84,Cairo,Africa,2024,9,0.252,2024-09-01


In [30]:
# Calculate seasonal variation metrics for each continent
# This measures how much each region is affected by seasonal trends

seasonal_variation = []

for continent in continents:
    continent_df = continent_data[continent]
    
    # Calculate coefficient of variation for each month
    monthly_cv = continent_df.groupby('month')['tourism_index'].apply(
        lambda x: (x.std() / x.mean() * 100) if x.mean() > 0 else 0
    )
    
    # Calculate peak and low seasons
    monthly_avg = continent_df.groupby('month')['tourism_index'].mean()
    peak_month = monthly_avg.idxmax()
    low_month = monthly_avg.idxmin()
    peak_value = monthly_avg.max()
    low_value = monthly_avg.min()
    
    variation = {
        'continent': continent,
        'avg_monthly_cv': monthly_cv.mean(),
        'max_monthly_cv': monthly_cv.max(),
        'peak_season_month': peak_month,
        'low_season_month': low_month,
        'peak_value': peak_value,
        'low_value': low_value,
        'seasonal_amplitude': peak_value - low_value,
        'seasonal_amplitude_pct': ((peak_value - low_value) / low_value * 100) if low_value > 0 else 0
    }
    
    seasonal_variation.append(variation)

df_seasonal_variation = pd.DataFrame(seasonal_variation)
df_seasonal_variation = df_seasonal_variation.sort_values('seasonal_amplitude_pct', ascending=False)

print("Seasonal Variation Metrics by Continent:")
print("="*100)
print(df_seasonal_variation.to_string(index=False))


Seasonal Variation Metrics by Continent:
    continent  avg_monthly_cv  max_monthly_cv  peak_season_month  low_season_month  peak_value  low_value  seasonal_amplitude  seasonal_amplitude_pct
       Africa       32.104205       59.648856                 11                 6    0.601111   0.212000            0.389111              183.542977
South America       35.837581       63.055863                  8                 4    0.497778   0.187500            0.310278              165.481481
North America       24.823579       49.570381                  8                 2    0.577222   0.266250            0.310972              116.797079
         Asia       24.764498       43.810358                  8                 6    0.594444   0.336000            0.258444               76.917989
       Europe       25.215465       35.555095                  8                12    0.698519   0.448889            0.249630               55.610561
      Oceania       15.716906       34.639475              

In [31]:
# Summary of preprocessing steps
print("="*80)
print("PREPROCESSING SUMMARY")
print("="*80)
print("\n1. Data Loading:")
print(f"   - Loaded {df_trends.shape[0]} rows and {df_trends.shape[1]} columns")
print(f"   - Date range: {df_trends['date'].min()} to {df_trends['date'].max()}")

print("\n2. Data Transformation (Step 2 - Reshape):")
print(f"   - Converted to long format: {df_long.shape[0]} rows")
print(f"   - Format: date | city | tourism_index")
print(f"   - Added temporal features: year, month, quarter, week_of_year")
print(f"   - Mapped {len(city_to_continent)} cities to {len(continents)} continents")

print("\n3. Data Quality:")
print(f"   - Missing values handled: {df_long.isnull().sum().sum() == 0}")
print(f"   - Tourism index values range: [{df_long['tourism_index'].min():.3f}, {df_long['tourism_index'].max():.3f}]")

print("\n4. Data Split by Continents:")
for continent in sorted(continents):
    cities = continent_data[continent]['city'].unique()
    print(f"   - {continent}: {len(cities)} cities - {', '.join(cities)}")

print("\n5. Created Analysis-Ready Datasets:")
print("   - df_long: Long format with all data points (date | city | tourism_index)")
print("   - continent_data: Dictionary with data split by continent")
print("   - df_continent_monthly: Monthly averages by continent")
print("   - city_monthly_avg: Monthly averages by city")
print("   - df_seasonal_stats: Statistical summary by continent")
print("   - df_seasonal_variation: Seasonal variation metrics by continent")

print("\n" + "="*80)
print("Preprocessing complete! Data is ready for seasonal pattern analysis.")
print("="*80)


PREPROCESSING SUMMARY

1. Data Loading:
   - Loaded 104 rows and 15 columns
   - Date range: 2023-12-31 00:00:00 to 2025-12-21 00:00:00

2. Data Transformation (Step 2 - Reshape):
   - Converted to long format: 1456 rows
   - Format: date | city | tourism_index
   - Added temporal features: year, month, quarter, week_of_year
   - Mapped 14 cities to 6 continents

3. Data Quality:
   - Missing values handled: True
   - Tourism index values range: [0.110, 1.000]

4. Data Split by Continents:
   - Africa: 1 cities - Cairo
   - Asia: 3 cities - Bangkok, Istanbul, Tokyo
   - Europe: 6 cities - Amsterdam, Barcelona, London, Paris, Rome, Venice
   - North America: 2 cities - Los Angeles, New York
   - Oceania: 1 cities - Sydney
   - South America: 1 cities - Rio de Janeiro

5. Created Analysis-Ready Datasets:
   - df_long: Long format with all data points (date | city | tourism_index)
   - continent_data: Dictionary with data split by continent
   - df_continent_monthly: Monthly averages by c

In [32]:
# Save all preprocessed datasets as CSV files for use in EDA notebook
import os

# Create output directory if it doesn't exist
output_dir = 'Question3/preprocessed_data'
os.makedirs(output_dir, exist_ok=True)

# Save main datasets
df_long.to_csv(f'{output_dir}/df_long.csv', index=False)
df_continent_monthly.to_csv(f'{output_dir}/df_continent_monthly.csv', index=False)
city_monthly_avg.to_csv(f'{output_dir}/city_monthly_avg.csv', index=False)
df_seasonal_stats.to_csv(f'{output_dir}/df_seasonal_stats.csv', index=False)
df_seasonal_variation.to_csv(f'{output_dir}/df_seasonal_variation.csv', index=False)

# Save continent-specific data
for continent, data in continent_data.items():
    continent_filename = continent.replace(' ', '_').lower()
    data.to_csv(f'{output_dir}/continent_{continent_filename}.csv', index=False)

print("="*80)
print("DATASETS SAVED SUCCESSFULLY")
print("="*80)
print(f"\nAll datasets saved to: {output_dir}/")
print("\nSaved files:")
print("  - df_long.csv")
print("  - df_continent_monthly.csv")
print("  - city_monthly_avg.csv")
print("  - df_seasonal_stats.csv")
print("  - df_seasonal_variation.csv")
for continent in sorted(continent_data.keys()):
    continent_filename = continent.replace(' ', '_').lower()
    print(f"  - continent_{continent_filename}.csv")
print("\n" + "="*80)


DATASETS SAVED SUCCESSFULLY

All datasets saved to: Question3/preprocessed_data/

Saved files:
  - df_long.csv
  - df_continent_monthly.csv
  - city_monthly_avg.csv
  - df_seasonal_stats.csv
  - df_seasonal_variation.csv
  - continent_africa.csv
  - continent_asia.csv
  - continent_europe.csv
  - continent_north_america.csv
  - continent_oceania.csv
  - continent_south_america.csv



In [33]:
# Display final data structures for verification
print("Final Data Structures:")
print("\n1. Long Format Data (df_long):")
print(df_long.head(10))
print(f"\nShape: {df_long.shape}")

print("\n\n2. Continent Monthly Averages (df_continent_monthly):")
print(df_continent_monthly.head(10))
print(f"\nShape: {df_continent_monthly.shape}")

print("\n\n3. Seasonal Statistics (df_seasonal_stats):")
print(df_seasonal_stats)

print("\n\n4. Seasonal Variation Metrics (df_seasonal_variation):")
print(df_seasonal_variation)


Final Data Structures:

1. Long Format Data (df_long):
        date            city  tourism_index  year  month  quarter  \
0 2023-12-31       Amsterdam           0.56  2023     12        4   
1 2023-12-31         Bangkok           0.50  2023     12        4   
2 2023-12-31       Barcelona           0.34  2023     12        4   
3 2023-12-31           Cairo           0.24  2023     12        4   
4 2023-12-31        Istanbul           0.31  2023     12        4   
5 2023-12-31          London           0.47  2023     12        4   
6 2023-12-31     Los Angeles           0.33  2023     12        4   
7 2023-12-31        New York           0.33  2023     12        4   
8 2023-12-31           Paris           0.36  2023     12        4   
9 2023-12-31  Rio de Janeiro           0.21  2023     12        4   

   day_of_year  week_of_year      continent  
0          365            52         Europe  
1          365            52           Asia  
2          365            52         Europe  
3