In [1]:
#importing all of the libraries that will be needed
import pandas as pd 
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)
import statistics as stats
from IPython.display import display, HTML

In [2]:
# Inserting some images that is related to the assignment topic

# Paths to my image files
image_paths = ['11.jpg', '12.jpg', '13.jpg']

# Generate HTML code to display images side by side
images_html = ''.join([f'<img src="{path}" style="width: 310px; margin: 0px; float: left;" />' for path in image_paths])

# Display HTML code
display(HTML(images_html))

In [3]:
#Read files from the URL into the pandas DataFrame
# make sure to reference where you got the dataswt (CSO - Tourism)
url = "airandtravel1.csv"
travel_df = pd.read_csv(url) 

## Where did Ireland's Tourists Come From? Cleaning and Preperation
This section will aim use data preperation methods such a `.head()` , `.shape`, `describe`, `info.()` etc to clean and sort the data first

In [4]:
# is there an increase in certain countries? Let's experiment 
# which countries spent the most money, are they increasling spending money YoY
# how much money can ireland expect in 2024/2025 : ML

#Using .head() to get a sense for the data, its structure and content
travel_df.head()

Unnamed: 0,STATISTIC Label,Year,Month,Country,Direction,UNIT,VALUE
0,Air and Sea Travel,2010,January,Great Britain,Arrivals,Thousand,361.5
1,Air and Sea Travel,2010,January,Great Britain,Departures,Thousand,407.6
2,Air and Sea Travel,2010,January,Other UK (1),Arrivals,Thousand,0.9
3,Air and Sea Travel,2010,January,Other UK (1),Departures,Thousand,1.1
4,Air and Sea Travel,2010,January,Belgium,Arrivals,Thousand,12.5


In [5]:
travel_df.shape

(6084, 7)

In [6]:
#Here i am dropping any data before 2021, as I was to work with the timeframe 2021-2023
filtered_df = travel_df[travel_df['Year'] >= 2021]

# Reset index 
filtered_df.reset_index(drop=True, inplace=True)

#my df is now called filtered_df

In [7]:
filtered_df.shape

(1332, 7)

In [8]:
# This provides me with information such as the datatypes. 
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332 entries, 0 to 1331
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  1332 non-null   object 
 1   Year             1332 non-null   int64  
 2   Month            1332 non-null   object 
 3   Country          1332 non-null   object 
 4   Direction        1332 non-null   object 
 5   UNIT             1332 non-null   object 
 6   VALUE            1328 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 73.0+ KB


In [9]:
filtered_df.describe()
# because only value and year is an integer type (float) I am only returned with two coulmn of data for my statistical analsis 

Unnamed: 0,Year,VALUE
count,1332.0,1328.0
mean,2022.054054,132.897816
std,0.868563,321.662704
min,2021.0,0.0
25%,2021.0,11.075
50%,2022.0,34.6
75%,2023.0,91.125
max,2024.0,2270.7


In [10]:
filtered_df.isnull().sum()

STATISTIC Label    0
Year               0
Month              0
Country            0
Direction          0
UNIT               0
VALUE              4
dtype: int64

In [11]:
# Checking against some malform data - get reference about malform data
mal = ["n.a", "?", "NA", "n/a", "na", "--"]
filtered_df = pd.read_csv("airandtravel1.csv" , na_values=mal)

In [12]:
# No malform data recognised as result is the same as above. 
filtered_df.isnull().sum()

STATISTIC Label    0
Year               0
Month              0
Country            0
Direction          0
UNIT               0
VALUE              4
dtype: int64

In [13]:
mean_value = filtered_df['VALUE'].mean()

# Fill NaN values with the mean of VALUE - give reference in report as to why this is important
filtered_df['VALUE'].fillna(mean_value, inplace=True)

In [14]:
# Now my dataset is clean and there are no NaN, mal or missing values
filtered_df.isnull().sum()

STATISTIC Label    0
Year               0
Month              0
Country            0
Direction          0
UNIT               0
VALUE              0
dtype: int64

## Where did Ireland's Tourists Come From? Visualisation

In [16]:
import pandas as pd
from geopy.geocoders import Nominatim
import time  # Import time module for adding delays

# Assuming filtered_df is your DataFrame
# Initialize the geocoder with a unique user agent
geolocator = Nominatim(user_agent="my_geocoder")

# Function to get latitude and longitude for a country
def get_lat_long(country):
    # Add a delay to respect rate limits
    time.sleep(1)  # Sleep for 1 second between requests
    location = geolocator.geocode(country)
    if location:
        return location.latitude, location.longitude
    else:
        return None, None

# Add latitude and longitude columns to the DataFrame
filtered_df['Latitude'], filtered_df['Longitude'] = zip(*filtered_df['Country'].apply(get_lat_long))


In [17]:
filtered_df.head()

Unnamed: 0,STATISTIC Label,Year,Month,Country,Direction,UNIT,VALUE,Latitude,Longitude
0,Air and Sea Travel,2010,January,Great Britain,Arrivals,Thousand,361.5,54.315159,-1.918153
1,Air and Sea Travel,2010,January,Great Britain,Departures,Thousand,407.6,54.315159,-1.918153
2,Air and Sea Travel,2010,January,Other UK (1),Arrivals,Thousand,0.9,,
3,Air and Sea Travel,2010,January,Other UK (1),Departures,Thousand,1.1,,
4,Air and Sea Travel,2010,January,Belgium,Arrivals,Thousand,12.5,50.640281,4.666715


In [None]:
# other questions: where are they staying , counties, accomodations 
