### Exercise 1: Basic Exploration
**Develop a function `get_column_names(df)`** that takes a pandas DataFrame as input and returns a list of column names.

In [86]:
import pandas as pd

# Required dataset for all functions (but we just have to run it once)
unesco = pd.read_csv("unesco_heritage_sites.csv")

In [2]:
def get_column_names(df):
    """Return the column labels of ``df`` as a plain Python list."""
    # Iterating the column Index yields the labels in their on-frame order.
    return list(df.columns)

In [3]:
get_column_names(unesco)

['SiteID',
 'Name',
 'Country',
 'Category',
 'YearInscribed',
 'VisitorsPerYear',
 'Area_km2',
 'Threatened']

### Exercise 2: Filtering Data
**Develop a function `filter_by_category(df, category)`** that takes a pandas DataFrame and a category (either `"Cultural"`, `"Natural"`, or, in our case, `"Mixed"`) and returns a new DataFrame containing only the rows that match the given category in the `Category` column.

In [7]:
def filter_by_category(df, category):
    """Return the rows of ``df`` whose 'Category' value equals ``category``.

    The input frame is left untouched; the original row labels are kept.
    """
    is_match = df['Category'] == category  # boolean mask, one entry per row
    return df.loc[is_match]

In [8]:
filter_by_category(unesco,'Cultural')

Unnamed: 0,SiteID,Name,Country,Category,YearInscribed,VisitorsPerYear,Area_km2,Threatened
0,1,Great Wall of China,China,Cultural,1987,10.0,21196.0,No
1,2,Machu Picchu,Peru,Cultural,1983,1.5,32.0,No
2,3,Pyramids of Giza,Egypt,Cultural,1979,14.0,16.0,No
4,5,Colosseum,Italy,Cultural,1980,7.6,0.02,No
6,7,Stonehenge,UK,Cultural,1986,1.3,0.03,No
7,8,Taj Mahal,India,Cultural,1983,8.0,0.17,No
9,10,Angkor Wat,Cambodia,Cultural,1992,2.6,162.6,No
11,12,Acropolis of Athens,Greece,Cultural,1987,2.0,3.04,No
13,14,Petra,Jordan,Cultural,1985,1.0,264.0,No
14,15,Eiffel Tower,France,Cultural,1991,6.9,0.01,No


### Exercise 3: Counting Rows
**Develop a function `count_sites_per_country(df, country)`** that takes a pandas DataFrame and a country name as input and returns the number of heritage sites located in that country.

In [9]:
def count_sites_per_country(df, country):
    """Return how many rows of ``df`` have ``country`` in the 'Country' column."""
    in_country = df['Country'] == country
    # Each True in the mask counts as 1; int() turns the numpy result into a plain int.
    return int(in_country.sum())

In [13]:
count_sites_per_country(unesco, 'China')

1

### Exercise 4: Sorting Data
**Develop a function `get_top_visited_sites(df, n)`** that takes a DataFrame and a number `n` as input and returns the top `n` most visited heritage sites sorted in descending order.

In [105]:
def get_top_visited_sites(df, n):
    """Return the names of the ``n`` most visited sites, most visited first.

    Sites are ranked by the 'VisitorsPerYear' column; the result is a list
    of values taken from the 'Name' column.
    """
    # Rank every site from most to fewest visitors, keeping only the names.
    ranked_names = df.sort_values('VisitorsPerYear', ascending=False)['Name']
    # Take the first n entries of the ranking.
    return ranked_names.iloc[:n].tolist()

In [106]:
print(get_top_visited_sites(unesco, 5))

['Pyramids of Giza', 'Great Wall of China', 'Taj Mahal', 'Colosseum', 'Eiffel Tower']


### Exercise 5: Categorizing Sites by Visitors
**Develop a function `categorize_sites_by_visitors(df)`** that adds a new column `"VisitorCategory"` to the DataFrame based on the number of visitors per year:
- `"High"`: More than 5 million visitors
- `"Medium"`: Between 1 and 5 million visitors
- `"Low"`: Less than 1 million visitors  

The function should return the modified DataFrame.

In [59]:
def categorize_sites_by_visitors(df):
    """Add a 'VisitorCategory' column to ``df`` and return the same frame.

    Each 'VisitorsPerYear' value is converted to float and labelled:
    "High" above 5.0, "Medium" from 1.0 up to and including 5.0, and "Low"
    otherwise. The column is written onto ``df`` itself (no copy is made).
    """
    def label_for(visitors):
        # float() guards against values stored as text
        amount = float(visitors)
        if amount > 5.0:
            return "High"
        return "Medium" if amount >= 1.0 else "Low"

    # One label per row, in row order, assigned as the new column.
    df["VisitorCategory"] = [label_for(value) for value in df["VisitorsPerYear"]]
    return df

In [None]:
categorize_sites_by_visitors(unesco)

### Exercise 6: Counting Threatened Sites
**Develop a function `count_threatened_sites(df)`** that returns the total number of threatened heritage sites.

In [62]:
def count_threatened_sites(df):
    """Return the number of rows whose 'Threatened' value is the string "Yes"."""
    is_threatened = df["Threatened"] == "Yes"
    # Summing the boolean mask counts the True entries; int() gives a plain int.
    return int(is_threatened.sum())

In [63]:
count_threatened_sites(unesco)

2

### Exercise 7: Finding Sites Inscribed Before a Given Year
**Develop a function `sites_before_year(df, year)`** that takes a DataFrame and a year as input, returning a new DataFrame containing only the sites inscribed before that year.

In [64]:
def sites_before_year(df, year):
    """Return the rows of ``df`` whose 'YearInscribed' is strictly less than ``year``.

    The input frame is left untouched; the original row labels are kept.
    """
    inscribed_earlier = df["YearInscribed"] < year  # strict: `year` itself is excluded
    return df.loc[inscribed_earlier]

In [66]:
sites_before_year(unesco, 1980)

Unnamed: 0,SiteID,Name,Country,Category,YearInscribed,VisitorsPerYear,Area_km2,Threatened,VisitorCategory
2,3,Pyramids of Giza,Egypt,Cultural,1979,14.0,16.0,No,High
3,4,Grand Canyon,USA,Natural,1979,6.0,4927.0,No,High
8,9,Galápagos Islands,Ecuador,Natural,1978,0.3,8010.0,Yes,Low
10,11,Yellowstone National Park,USA,Natural,1978,4.0,8983.0,No,Medium


### Exercise 8: Adding New Sites from a Dictionary (See dictionary defined in the next cell)
**Develop a function `add_new_sites(df, new_sites_dict)`** that takes a DataFrame and a dictionary containing new heritage sites. The function should:
1. Convert the dictionary into a new DataFrame.
2. Concatenate it with the original DataFrame.
3. Return the updated DataFrame.

In [68]:
# For exercise 8
# Sample input for add_new_sites: one key per column, one list entry per new site.
# NOTE: this schema differs from the unesco DataFrame used in the other exercises:
#   - "Year Inscribed", "Visitors Per Year" and "Area (sq km)" are spelled differently
#     from the unesco columns 'YearInscribed', 'VisitorsPerYear' and 'Area_km2';
#   - there is no "SiteID" entry;
#   - "Threatened" holds booleans, whereas the unesco column holds "Yes"/"No" strings;
#   - "Visitors Per Year" looks like raw head counts, whereas the unesco column
#     presumably holds millions (the exercise 5 thresholds are in millions) — TODO confirm.

new_sites_dict = {
    "Name": ["Ancient Ruins of Tikal", "Great Zimbabwe National Monument"],
    "Country": ["Guatemala", "Zimbabwe"],
    "Category": ["Cultural", "Cultural"],
    "Year Inscribed": [1979, 1986],
    "Visitors Per Year": [200000, 50000],
    "Threatened": [False, False],
    "Area (sq km)": [16, 7.2]
}

In [69]:
def add_new_sites(df, dict):
    """Append the sites described by a dictionary to a DataFrame.

    Args:
        df: DataFrame holding the existing sites; it is not modified.
        dict: Mapping of column name -> list of values, with one list entry
            per new site. (The name shadows the builtin, but it is kept so
            that existing keyword callers keep working.)

    Returns:
        A new DataFrame with the rows of ``df`` followed by the new rows,
        renumbered 0..n-1. Columns present in only one of the two inputs
        are filled with NaN for the rows that come from the other input.
    """
    new_df = pd.DataFrame(dict)
    # ignore_index=True renumbers the combined rows. Without it the new rows
    # keep their own labels (0, 1, ...), which duplicate labels already used
    # by `df` and make label-based lookups such as .loc[0] return several rows.
    combined_df = pd.concat([df, new_df], ignore_index=True)
    return combined_df

In [None]:
add_new_sites(unesco, new_sites_dict)

# Note: The keys of new_sites_dict do not match the unesco column names ("Year Inscribed" vs "YearInscribed",
# "Visitors Per Year" vs "VisitorsPerYear", "Area (sq km)" vs "Area_km2"), so concat creates three extra columns and
# fills the non-matching cells with NaN (Not a Number, i.e. missing). Renaming the keys first would align the columns.

Unnamed: 0,SiteID,Name,Country,Category,YearInscribed,VisitorsPerYear,Area_km2,Threatened,VisitorCategory,Year Inscribed,Visitors Per Year,Area (sq km)
0,1.0,Great Wall of China,China,Cultural,1987.0,10.0,21196.0,No,High,,,
1,2.0,Machu Picchu,Peru,Cultural,1983.0,1.5,32.0,No,Medium,,,
2,3.0,Pyramids of Giza,Egypt,Cultural,1979.0,14.0,16.0,No,High,,,
3,4.0,Grand Canyon,USA,Natural,1979.0,6.0,4927.0,No,High,,,
4,5.0,Colosseum,Italy,Cultural,1980.0,7.6,0.02,No,High,,,
5,6.0,Serengeti National Park,Tanzania,Natural,1981.0,1.5,14763.0,No,Medium,,,
6,7.0,Stonehenge,UK,Cultural,1986.0,1.3,0.03,No,Medium,,,
7,8.0,Taj Mahal,India,Cultural,1983.0,8.0,0.17,No,High,,,
8,9.0,Galápagos Islands,Ecuador,Natural,1978.0,0.3,8010.0,Yes,Low,,,
9,10.0,Angkor Wat,Cambodia,Cultural,1992.0,2.6,162.6,No,Medium,,,


### Exercise 9: Finding the Largest Site per Country
**Develop a function `largest_site_per_country(df)`** that returns a new DataFrame containing only the largest heritage site (in terms of area) for each country.

In [133]:
# This question has little visible effect on our unesco list, because most countries appear only once
# (the USA is an exception: it has both Grand Canyon and Yellowstone National Park).
# The general solution is a little tricky. You can do it by iterating across rows and keeping a list of
# the current largest site per country, but it becomes quite ugly.
# This is a clever solution:

def largest_site_per_country(df):
    """Return one row per country: the site with the largest 'Area_km2'.

    Args:
        df: DataFrame with at least the 'Country' and 'Area_km2' columns;
            it is not modified.

    Returns:
        A new DataFrame with a single row for each distinct 'Country',
        ordered by ascending area. A site whose area is missing (NaN) is
        chosen only when its country has no site with a known area.
    """
    # Sort smallest -> largest. Missing areas must go FIRST: by default pandas
    # sorts NaN to the end, where keep='last' below would wrongly pick a site
    # with an unknown area as the "largest" one.
    sorted_df = df.sort_values('Area_km2', na_position='first')
    # For each country keep only the last occurrence, i.e. the biggest site.
    new_df = sorted_df.drop_duplicates('Country', keep='last')
    return new_df

In [None]:
largest_site_per_country(unesco)