In [5]:
import pandas as pd
from pathlib import Path

# Auto-detect processed.csv location
candidates = [
    Path("data/processed.csv"),
    Path("../data/processed.csv"),
    Path("../../data/processed.csv"),
]

csv_path = next((p for p in candidates if p.exists()), None)

if csv_path is None:
    raise FileNotFoundError("processed.csv not found. Check project structure.")

df = pd.read_csv(csv_path)

print("Using file:", csv_path.resolve())
print("Total rows:", len(df))

unique_count = df.drop_duplicates(["country", "year", "month"]).shape[0]
print("Unique (country,year,month):", unique_count)

duplicates = df[df.duplicated(["country", "year", "month"], keep=False)]

print("Duplicate rows found:", len(duplicates))

if not duplicates.empty:
    print("\nDuplicate details:")
    print(
        duplicates
        .sort_values(["country", "year", "month"])
        .head(20)
    )
else:
    print("No duplicates detected.")

Using file: C:\Users\dell\OneDrive\Desktop\L4S1\ML & Pattern Recognition\tourism-forecast-ml\data\processed.csv
Total rows: 11572
Unique (country,year,month): 11572
Duplicate rows found: 0
No duplicates detected.


In [6]:
years = sorted(df["year"].unique())

print("Years in dataset:")
print(years)

Years in dataset:
[np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]


In [7]:
print("\nMonths captured per year:")
for y in sorted(df["year"].unique()):
    months_count = df[df["year"] == y]["month"].nunique()
    print(f"{y} → {months_count} months")


Months captured per year:
2019 → 12 months
2020 → 12 months
2021 → 12 months
2022 → 12 months
2023 → 12 months
2024 → 12 months
2025 → 12 months


In [8]:
month_counts = (
    df.groupby(["country", "year"])["month"]
      .nunique()
      .reset_index(name="months_present")
)

incomplete = month_counts[month_counts["months_present"] != 12]

print("Total country-years:", len(month_counts))
print("Incomplete country-years:", len(incomplete))

incomplete.head(20)

Total country-years: 968
Incomplete country-years: 15


Unnamed: 0,country,year,months_present
22,ANDORRA,2020,11
192,BURUNDI,2019,10
226,CENTRAL AFRICAN,2020,2
231,CHAD,2019,11
266,CONGO,2019,11
296,CUBA,2021,11
324,DJIBOUTI,2020,11
337,DOMINICAN,2020,2
365,EQUATORIAL,2020,2
415,GABON,2020,10


In [9]:
year_summary = (
    df.groupby("year")
      .agg(
          countries=("country", "nunique"),
          rows=("country", "count"),
          months=("month", "nunique")
      )
      .reset_index()
)

print(year_summary)

   year  countries  rows  months
0  2019         69   824      12
1  2020        187  2208      12
2  2021        190  2278      12
3  2022         73   875      12
4  2023        190  2279      12
5  2024        193  2316      12
6  2025         66   792      12


In [10]:
country_per_year = df.groupby("year")["country"].nunique()

print(country_per_year)

year
2019     69
2020    187
2021    190
2022     73
2023    190
2024    193
2025     66
Name: country, dtype: int64


In [12]:
import pandas as pd
from pathlib import Path
import os

def find_processed_csv() -> Path:
    
    candidates = [
        Path("data/processed.csv"),         
        Path("../data/processed.csv"),      
        Path("../../data/processed.csv"),   
    ]
    for p in candidates:
        if p.exists():
            return p
    raise FileNotFoundError("processed.csv not found. Check where you are running from.")

RAW_PROCESSED = find_processed_csv()
OUT_CLEANED = RAW_PROCESSED.parent / "cleaned.csv"

print("Using processed file:", RAW_PROCESSED.resolve())
print("Output cleaned file:", OUT_CLEANED.resolve())

df = pd.read_csv(RAW_PROCESSED)
df.head()

Using processed file: C:\Users\dell\OneDrive\Desktop\L4S1\ML & Pattern Recognition\tourism-forecast-ml\data\processed.csv
Output cleaned file: C:\Users\dell\OneDrive\Desktop\L4S1\ML & Pattern Recognition\tourism-forecast-ml\data\cleaned.csv


Unnamed: 0,date,year,month,country,arrivals
0,2021-01-01,2021,1,A AFGHANISTAN,0.0
1,2021-02-01,2021,2,A AFGHANISTAN,0.0
2,2021-03-01,2021,3,A AFGHANISTAN,1.0
3,2021-04-01,2021,4,A AFGHANISTAN,1.0
4,2021-05-01,2021,5,A AFGHANISTAN,1.0


In [13]:
import os
os.getcwd()

'c:\\Users\\dell\\OneDrive\\Desktop\\L4S1\\ML & Pattern Recognition\\tourism-forecast-ml\\notebooks'

In [14]:
import os
os.chdir("..") 
print(os.getcwd())

c:\Users\dell\OneDrive\Desktop\L4S1\ML & Pattern Recognition\tourism-forecast-ml


In [18]:
from pathlib import Path

# Ensure directory exists before saving
OUT_CLEANED = Path(OUT_CLEANED)
OUT_CLEANED.parent.mkdir(parents=True, exist_ok=True)

df_clean.to_csv(OUT_CLEANED, index=False)

print("Saved:", OUT_CLEANED.resolve())

Saved: C:\Users\dell\OneDrive\Desktop\L4S1\data\cleaned.csv


In [20]:
OUT_CLEANED.parent.mkdir(parents=True, exist_ok=True)

In [21]:
from pathlib import Path

BASE_DIR = Path.cwd()

# Move upward until we find project root
while not (BASE_DIR / "data").exists() and BASE_DIR != BASE_DIR.parent:
    BASE_DIR = BASE_DIR.parent

OUT_CLEANED = BASE_DIR / "data" / "cleaned.csv"
OUT_CLEANED.parent.mkdir(exist_ok=True)

df_clean.to_csv(OUT_CLEANED, index=False)

print("Saved at:", OUT_CLEANED.resolve())

Saved at: C:\Users\dell\OneDrive\Desktop\L4S1\data\cleaned.csv


In [22]:
import pandas as pd
from pathlib import Path

# Auto-detect cleaned.csv
candidates = [
    Path("data/cleaned.csv"),
    Path("../data/cleaned.csv"),
    Path("../../data/cleaned.csv"),
]

csv_path = next((p for p in candidates if p.exists()), None)

if csv_path is None:
    raise FileNotFoundError("cleaned.csv not found.")

print("Using:", csv_path.resolve())

df = pd.read_csv(csv_path)

Using: C:\Users\dell\OneDrive\Desktop\L4S1\data\cleaned.csv


In [23]:
print("Shape:", df.shape)

Shape: (11436, 5)


In [24]:
print("Columns:", df.columns.tolist())

Columns: ['date', 'year', 'month', 'country', 'arrivals']


In [25]:
df.head(10)

Unnamed: 0,date,year,month,country,arrivals
0,2021-01-01,2021,1,A AFGHANISTAN,0.0
1,2021-02-01,2021,2,A AFGHANISTAN,0.0
2,2021-03-01,2021,3,A AFGHANISTAN,1.0
3,2021-04-01,2021,4,A AFGHANISTAN,1.0
4,2021-05-01,2021,5,A AFGHANISTAN,1.0
5,2021-06-01,2021,6,A AFGHANISTAN,1.0
6,2021-07-01,2021,7,A AFGHANISTAN,0.0
7,2021-08-01,2021,8,A AFGHANISTAN,1.0
8,2021-09-01,2021,9,A AFGHANISTAN,4.0
9,2021-10-01,2021,10,A AFGHANISTAN,0.0


In [26]:
df.tail(10)

Unnamed: 0,date,year,month,country,arrivals
11426,2024-03-01,2024,3,ZIMBABWE,25.0
11427,2024-04-01,2024,4,ZIMBABWE,16.0
11428,2024-05-01,2024,5,ZIMBABWE,11.0
11429,2024-06-01,2024,6,ZIMBABWE,14.0
11430,2024-07-01,2024,7,ZIMBABWE,13.0
11431,2024-08-01,2024,8,ZIMBABWE,11.0
11432,2024-09-01,2024,9,ZIMBABWE,3.0
11433,2024-10-01,2024,10,ZIMBABWE,14.0
11434,2024-11-01,2024,11,ZIMBABWE,26.0
11435,2024-12-01,2024,12,ZIMBABWE,22.0


In [27]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 11436 entries, 0 to 11435
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      11436 non-null  str    
 1   year      11436 non-null  int64  
 2   month     11436 non-null  int64  
 3   country   11436 non-null  str    
 4   arrivals  11436 non-null  float64
dtypes: float64(1), int64(2), str(2)
memory usage: 446.8 KB


In [28]:
print("Duplicates:",
      df.duplicated(["country", "year", "month"]).sum())

Duplicates: 0


In [29]:
month_counts = (
    df.groupby(["country", "year"])["month"]
      .nunique()
)

print("Any incomplete:",
      (month_counts != 12).sum())

Any incomplete: 0


In [30]:
print("Total countries:", df["country"].nunique())

Total countries: 241


In [31]:
df.shape
df["country"].nunique()
df["year"].unique()

array([2021, 2019, 2020, 2022, 2023, 2024, 2025])

In [33]:
df_clean.to_csv("../data/cleaned.csv", index=False)
print("Saved successfully.")

Saved successfully.


In [35]:
from pathlib import Path

save_path = Path("../data/cleaned.csv")

save_path.parent.mkdir(parents=True, exist_ok=True)

df_clean.to_csv(save_path, index=False)

print("Saved at:", save_path.resolve())
print("Rows saved:", len(df_clean))

Saved at: C:\Users\dell\OneDrive\Desktop\L4S1\data\cleaned.csv
Rows saved: 11436


In [36]:
print("df_clean shape:", df_clean.shape)
df_clean.head()

df_clean shape: (11436, 5)


Unnamed: 0,date,year,month,country,arrivals
0,2021-01-01,2021,1,A AFGHANISTAN,0.0
1,2021-02-01,2021,2,A AFGHANISTAN,0.0
2,2021-03-01,2021,3,A AFGHANISTAN,1.0
3,2021-04-01,2021,4,A AFGHANISTAN,1.0
4,2021-05-01,2021,5,A AFGHANISTAN,1.0


In [37]:
from pathlib import Path
import pandas as pd

# Force project root
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"

cleaned_path = DATA_DIR / "cleaned.csv"

df_clean.to_csv(cleaned_path, index=False)

print("Saved to:", cleaned_path.resolve())
print("Rows saved:", len(df_clean))

Saved to: C:\Users\dell\OneDrive\Desktop\L4S1\data\cleaned.csv
Rows saved: 11436


In [38]:
import os
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\dell\OneDrive\Desktop\L4S1\ML & Pattern Recognition


In [39]:
import os
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\dell\OneDrive\Desktop\L4S1\ML & Pattern Recognition


In [40]:
import pandas as pd

df = pd.read_csv("../data/cleaned.csv")

print("Shape:", df.shape)
df.head()

Shape: (11436, 5)


Unnamed: 0,date,year,month,country,arrivals
0,2021-01-01,2021,1,A AFGHANISTAN,0.0
1,2021-02-01,2021,2,A AFGHANISTAN,0.0
2,2021-03-01,2021,3,A AFGHANISTAN,1.0
3,2021-04-01,2021,4,A AFGHANISTAN,1.0
4,2021-05-01,2021,5,A AFGHANISTAN,1.0


In [41]:
print("Columns:", df.columns.tolist())
print()
df.info()

Columns: ['date', 'year', 'month', 'country', 'arrivals']

<class 'pandas.DataFrame'>
RangeIndex: 11436 entries, 0 to 11435
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      11436 non-null  str    
 1   year      11436 non-null  int64  
 2   month     11436 non-null  int64  
 3   country   11436 non-null  str    
 4   arrivals  11436 non-null  float64
dtypes: float64(1), int64(2), str(2)
memory usage: 446.8 KB


In [42]:
print("Duplicates:",
      df.duplicated(["country", "year", "month"]).sum())

Duplicates: 0


In [43]:
print("Null values:")
print(df.isna().sum())

Null values:
date        0
year        0
month       0
country     0
arrivals    0
dtype: int64


In [44]:
print("Years:", sorted(df["year"].unique()))

Years: [np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]


In [45]:
df.describe()

Unnamed: 0,year,month,arrivals
count,11436.0,11436.0,11436.0
mean,2022.03043,6.5,509.855894
std,1.789671,3.452203,2433.740302
min,2019.0,1.0,0.0
25%,2020.0,3.75,0.0
50%,2022.0,6.5,3.0
75%,2024.0,9.25,68.0
max,2025.0,12.0,52881.0
