In [1]:
#importing all of the libraries that will be needed
import pandas as pd 
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)
import statistics as stats
from IPython.display import display, HTML

In [2]:
# Inserting some images that is related to the assignment topic

# Paths to my image files
image_paths = ['11.jpg', '12.jpg', '13.jpg']

# Generate HTML code to display images side by side
images_html = ''.join([f'<img src="{path}" style="width: 310px; margin: 0px; float: left;" />' for path in image_paths])

# Display HTML code
display(HTML(images_html))

In [3]:
#Read files from the URL into the pandas DataFrame
# make sure to reference where you got the dataswt (CSO - Tourism)
url = "airandtravel1.csv"
df = pd.read_csv(url) 

## Where did Ireland's Tourists Come From? Cleaning and Preperation
This section will aim use data preperation methods such a `.head()` , `.shape`, `describe`, `info.()` etc to clean and sort the data first

In [4]:
# is there an increase in certain countries? Let's experiment 
# which countries spent the most money, are they increasling spending money YoY
# how much money can ireland expect in 2024/2025 : ML

#Using .head() to get a sense for the data, its structure and content
df.head()

Unnamed: 0,STATISTIC Label,Year,Month,Country,Direction,UNIT,VALUE
0,Air and Sea Travel,2010,January,Great Britain,Arrivals,Thousand,361.5
1,Air and Sea Travel,2010,January,Great Britain,Departures,Thousand,407.6
2,Air and Sea Travel,2010,January,Other UK (1),Arrivals,Thousand,0.9
3,Air and Sea Travel,2010,January,Other UK (1),Departures,Thousand,1.1
4,Air and Sea Travel,2010,January,Belgium,Arrivals,Thousand,12.5


In [5]:
df.shape

(6084, 7)

In [6]:
# Count occurrences of "thousand" in the 'UNIT' feature. If 100% of data in this unit is thousand we will drop the column
#Reason for this is it is ir
thousand_count = (df['UNIT'] == 'Thousand').sum()

# Calculating total number of observations
total_rows = len(df)

# Calculate percentage of times out of 100 that 'Thousand' was the unit of measurement 
percentage_thousand = (thousand_count / total_rows) * 100

print("Percentage of 'thousand' in the 'UNIT' column:", percentage_thousand)

# for report - need reference 
#it's perfectly fine to drop a column if it contains the exact same value for all rows in the DataFrame, especially if that value doesn't provide any additional information beyond what's already known or can be communicated through metadata or other means.
#In your case, if the 'UNIT' column contains the same value ('thousands') for all rows and this information is already conveyed to the reader or user of the data, there's no need to keep that column in the DataFrame. You can safely drop it to simplify the structure of your DataFrame and make it more efficient.
#Dropping such redundant columns can improve the clarity and efficiency of your data analysis and visualization processes


Percentage of 'thousand' in the 'UNIT' column: 100.0


In [7]:
# Drop the 'UNIT' column
df.drop(columns=['UNIT'], inplace=True)

In [8]:
# only keep the observations that are arrivals
df_update = df[df['Direction'] == 'Arrivals']

In [9]:
# check to see the head 
df_update.head()

Unnamed: 0,STATISTIC Label,Year,Month,Country,Direction,VALUE
0,Air and Sea Travel,2010,January,Great Britain,Arrivals,361.5
2,Air and Sea Travel,2010,January,Other UK (1),Arrivals,0.9
4,Air and Sea Travel,2010,January,Belgium,Arrivals,12.5
6,Air and Sea Travel,2010,January,Germany,Arrivals,45.3
8,Air and Sea Travel,2010,January,Spain,Arrivals,68.6


In [10]:
#Here i am dropping any data before 2021, as I was to work with the timeframe 2021-2023
df_filtered = df_update[df_update['Year'] >= 2021]

# Reset index 
df_filtered.reset_index(drop=True, inplace=True)

#my df is now called df_filtered

In [11]:
df_filtered.shape

(666, 6)

In [12]:
# This provides me with information such as the datatypes. 
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  666 non-null    object 
 1   Year             666 non-null    int64  
 2   Month            666 non-null    object 
 3   Country          666 non-null    object 
 4   Direction        666 non-null    object 
 5   VALUE            664 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 31.3+ KB


In [13]:
df_filtered.describe()
# because only value and year is an integer type (float) I am only returned with two coulmn of data for my statistical analsis 

Unnamed: 0,Year,VALUE
count,666.0,664.0
mean,2022.054054,132.956627
std,0.868889,322.234994
min,2021.0,0.0
25%,2021.0,11.25
50%,2022.0,34.6
75%,2023.0,91.75
max,2024.0,2270.7


In [14]:
df_filtered.isnull().sum()

STATISTIC Label    0
Year               0
Month              0
Country            0
Direction          0
VALUE              2
dtype: int64

In [15]:
# Checking against some malform data - get reference about malform data
mal = ["n.a", "?", "NA", "n/a", "na", "--"]
df_filtered = pd.read_csv("airandtravel1.csv" , na_values=mal)

In [16]:
df_filtered.isnull().sum()

STATISTIC Label    0
Year               0
Month              0
Country            0
Direction          0
UNIT               0
VALUE              4
dtype: int64

In [17]:
mean_value = df_filtered['VALUE'].mean()

# Fill NaN values with the mean of VALUE - give reference in report as to why this is important
df_filtered['VALUE'].fillna(mean_value, inplace=True)

In [18]:
# Now my dataset is clean and there are no NaN, mal or missing values
df_filtered.isnull().sum()

STATISTIC Label    0
Year               0
Month              0
Country            0
Direction          0
UNIT               0
VALUE              0
dtype: int64

## Where did Ireland's Tourists Come From? Statistical Anaylsis

## Where did Ireland's Tourists Come From? Visualisation

In [None]:
from geopy.geocoders import Nominatim
import time  # Import time module for adding delays

# Assuming filtered_df is your DataFrame
# Initialize the geocoder with a unique user agent
geolocator = Nominatim(user_agent="my_geocoder")

# Function to get latitude and longitude for a country
def get_lat_long(country):
    # Add a delay to respect rate limits
    time.sleep(1)  # Sleep for 1 second between requests
    location = geolocator.geocode(country)
    if location:
        return location.latitude, location.longitude
    else:
        return None, None

# Add latitude and longitude columns to the DataFrame
df_filtered['Latitude'], df_filtered['Longitude'] = zip(*df_filtered['Country'].apply(get_lat_long))


In [None]:
df_filtered.head()

In [None]:
import folium
from folium.plugins import HeatMap


# Create a folium map centered on the mean latitude and longitude
#m = folium.Map(location=[nasa_df['latitude'].mean(), nasa_df['longitude'].mean()], zoom_start=5)
m = folium.Map(location=[0, 0], zoom_start=4)

# Add a heatmap layer using the latitude and longitude data
heat_data = [[row['Latitude'], row['Longitude']] for index, row in df_filtered.iterrows()]
HeatMap(heat_data).add_to(m)

# Save the map to an HTML file
m.save('heatmap.html')

m

#Blue areas signify regions with fewer meteorite landings.
#Red areas signify regions with more meteorite landings.


In [None]:
# other questions: where are they staying , counties, accomodations 
