# HUFFLEPUFF BUSINESS TRAVEL ANALYSIS

# BUSINESS PROBLEM #1
- We want our employees to only travel to green countries

## HYPOTHESIS 
- Some countries have lower air travel CO2 emissions per passenger
- In some countries the share of CO2 emissions created by domestic flights surpasses the share created by international flights
- Some countries have lower total CO2 emissions

- Benjamin    : Air travel
- Ricardo     : Energy 
- Anna        : Air pollution 
- Xinly       : Plastic pollution
- Jp          : Deforestation

In [1]:
#TODO rename owid codes to iso codes
#TODO export to sql from python

In [2]:
#imports
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pycountry

In [None]:
#!pip install pycountry

### POPULATION CLEANING

In [3]:
# Import the raw population data from the local sources/ folder.
# The column-renaming cell below expects the headers 'Entity', 'Code', 'Year' and
# 'Population - Sex: all - Age: all - Variant: estimates' in this file.
populations_df = pd.read_csv('sources/population.csv')

def import_csv(filename):
    """Read a CSV file from the ``sources/`` folder into a DataFrame.

    ``filename`` is the file name relative to ``sources/`` (for example
    ``'population.csv'``). The result of ``pd.read_csv`` is returned as is.
    """
    csv_path = 'sources/' + filename
    return pd.read_csv(csv_path)

In [4]:
# Per-dataset description of the raw CSV files.
# Each dataset entry holds the file name and, for every cleaned column name,
# the header used in the raw file ("originalName") and a target type hint.
schema = {
    "population": {
        "filename": "population.csv",
        "columns": {
            "country": {
                "originalName": "Entity",
                "type": "object",
            },
            "population": {
                "originalName": "Population - Sex: all - Age: all - Variant: estimates",
                "type": "int",
            },
            "code": {
                "originalName": "Code",
                "type": "object",
            },
            "year": {
                "originalName": "Year",
                "type": "int",
            },
        },
    },
    "deforestation": {
        "filename": "imported-deforestation.csv",
        "columns": {
            "country": {
                "originalName": "Entity",
                "type": "object",
            },
            "imported_deforestation": {
                "originalName": "imported_deforestation",
                "type": "float",
            },
            "code": {
                "originalName": "ISO_A3",
                "type": "object",
            },
            "year": {
                "originalName": "Year",
                "type": "int",
            },
        },
    },
}

In [5]:
#functions
def rename_columns(df, new_names_dict):
    """Return a copy of ``df`` with its columns renamed.

    Args:
        df: DataFrame whose columns should be renamed.
        new_names_dict: Mapping of current column name -> new column name.
            Columns that are not listed keep their name.

    Returns:
        A new DataFrame with the renamed columns; ``df`` is not modified.
    """
    # Use the parameter itself: the name ``new_names`` is not defined in this
    # function, so referring to it raised a NameError on every call.
    return df.rename(columns=new_names_dict)

def clean_owid_data(schema) -> pd.DataFrame:
    """Load one OWID dataset and rename its columns as described by ``schema``.

    Args:
        schema: Description of a single dataset: a dict with a ``"filename"``
            entry (file name inside ``sources/``) and a ``"columns"`` entry
            that maps each new column name to a dict holding the column's
            ``"originalName"`` in the CSV.
            NOTE(review): assumes one dataset entry is passed (for example
            the value stored under ``"population"``), not the whole
            multi-dataset mapping — confirm against the intended caller.

    Returns:
        The imported DataFrame with the listed columns renamed. The
        ``"type"`` hints of the schema are not applied here.

    Raises:
        KeyError: If ``"filename"``, ``"columns"`` or an ``"originalName"``
            entry is missing from ``schema``.
    """
    df = import_csv(schema["filename"])
    # Invert the schema (new name -> spec) into the old -> new mapping that
    # DataFrame.rename expects.
    new_names = {
        spec["originalName"]: new_name
        for new_name, spec in schema["columns"].items()
    }
    return df.rename(columns=new_names)

In [None]:
# rename columns
# Maps the raw headers to short lowercase names. col_names is reused further
# down when the deforestation file is imported.
col_names = {'Entity': 'country', 'Code': 'code', 'Year': 'year','Population - Sex: all - Age: all - Variant: estimates': 'population'}
populations_df = populations_df.rename(columns=col_names)
populations_df.head()



In [7]:
#population by country latest year
# groupby('country')['year'].idxmax() gives, per country, the row label of the
# highest year; .loc then keeps exactly those rows.
populations_df_latest = populations_df.loc[populations_df.groupby('country')['year'].idxmax()]
populations_df_latest.head()

def get_latest_year_data(df, group_column, year_column='year'):
    """Keep, for every group, only the row with the highest year.

    Args:
        df: DataFrame holding one row per group and year.
        group_column: Name of the column to group by (e.g. ``'country'``).
        year_column: Name of the column holding the year. Defaults to
            ``'year'``.

    Returns:
        A DataFrame with the rows of ``df`` whose ``year_column`` is the
        maximum within their ``group_column`` group, ordered by group key.
    """
    # Honour the arguments: the column names used to be hard-coded to
    # 'country' and 'year', so both parameters were silently ignored.
    latest_row_labels = df.groupby(group_column)[year_column].idxmax()
    return df.loc[latest_row_labels]

In [None]:
# population by country latest year
# NOTE(review): this cell only touches populations_df and repeats the earlier
# latest-year population computation; it looks redundant — confirm before removing.
populations_df_latest = populations_df.loc[populations_df.groupby('country')['year'].idxmax()]
populations_df_latest.head()

In [9]:
#drop nan values
# dropna() with its default arguments removes every row that has a missing
# value in any column.
populations_df_latest = populations_df_latest.dropna()
populations_df_latest.head()

def drop_nan_values(df):
    """Return ``df`` without the rows that contain at least one missing value.

    The input DataFrame is left unchanged; a new DataFrame is returned.
    """
    return df.dropna()

In [10]:
#export clean population csv
# index=False keeps the DataFrame's row labels out of the file.
# NOTE(review): presumably the sources/clean/ folder has to exist beforehand — confirm.
populations_df_latest.to_csv('sources/clean/population-clean.csv', index=False)

def export_csv(df, file_name):
    """Write ``df`` to ``file_name`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=file_name, index=False)

### DEFORESTATION CLEANING

In [None]:
#import deforestation data
forest_df = pd.read_csv('sources/imported-deforestation.csv')
# Reuses col_names, which was written for the population file.
# NOTE(review): the schema dict names 'ISO_A3' as this file's code column; if
# that is the real header, no 'code' column is created here and the later
# merge on 'code' would fail — confirm against the CSV.
forest_df = forest_df.rename(columns=col_names)
forest_df.head()

In [None]:
#imported deforestation by country latest year
# For every country keep the row whose 'year' is the highest one available.
forest_df_latest = forest_df.loc[forest_df.groupby('country')['year'].idxmax()]
forest_df_latest.head()

In [13]:
#export clean deforestation csv
# Unlike the population export, no dropna() is applied to this frame first.
forest_df_latest.to_csv('sources/clean/imported-deforestation-clean.csv', index=False)

### DEFORESTATION ANALYSIS

In [None]:
# Join the latest deforestation and population rows on the country code
# (inner join: only codes present in both frames are kept).
deforestation_final = pd.merge(forest_df_latest, populations_df_latest, on='code')

# Normalise imported deforestation by population size.
per_capita = deforestation_final['imported_deforestation'] / deforestation_final['population']
deforestation_final['imported_deforestation_per_capita'] = per_capita

min_deforestation = per_capita.min()
max_deforestation = per_capita.max()

# Min-max scale onto 1..10 (1 = lowest per-capita value, 10 = highest),
# then round to whole scores.
raw_score = 1 + 9 * (per_capita - min_deforestation) / (max_deforestation - min_deforestation)
deforestation_final['score'] = raw_score.round().astype(int)

# Show the 25 highest-scoring rows.
deforestation_final.sort_values('score', ascending=False).head(25)

### DEFORESTATION MAPPING

In [15]:
# Load the country shapes from the local shapefile with geopandas.
# The merge cells below join on this layer's 'ISO_A3' column.
world = gpd.read_file('maps/110m_cultural/ne_110m_admin_0_countries.shp')

In [16]:
# Merge with the world GeoDataFrame
# how='left' keeps every country shape; shapes whose ISO_A3 has no matching
# 'code' get NaN in the deforestation columns.
merged = world.merge(deforestation_final, how='left', left_on='ISO_A3', right_on='code')

In [None]:
# Draw the deforestation score as a filled world map.
fig, ax = plt.subplots(figsize=(20, 10))

# Colour countries by score; countries without a score are light grey.
merged.plot(
    column='score',
    cmap='OrRd',
    legend=False,
    ax=ax,
    missing_kwds={'color': 'lightgrey'},
)

# Hide the axis frame and ticks.
ax.set_axis_off()

ax.set_title('World Map')
plt.show()

### AIR POLLUTION CLEANING

In [18]:
# Load the AQI dataset as raw CSV straight from the GitHub URL below
# (this cell needs network access).
file_path = 'https://raw.githubusercontent.com/jipijipi/sql-database/main/sources/AQI%20and%20Lat%20Long%20of%20Countries.csv'
df = pd.read_csv(file_path)

In [19]:
# Make sure AQI Value is numeric; anything unparsable becomes NaN.
df['AQI Value'] = pd.to_numeric(df['AQI Value'], errors='coerce')

# Average AQI per country, as a two-column DataFrame.
mean_aqi_by_country = df.groupby('Country')['AQI Value'].mean()
country_aqi = mean_aqi_by_country.reset_index()

# Min-max scale the country averages onto 1..10
# (1 = lowest mean AQI, 10 = highest).
aqi_values = country_aqi['AQI Value']
min_aqi = aqi_values.min()
max_aqi = aqi_values.max()

pollution_score = 1 + 9 * (aqi_values - min_aqi) / (max_aqi - min_aqi)
country_aqi['Pollution Score'] = pollution_score

# Whole-number version of the score.
country_aqi['Rounded Pollution Score'] = pollution_score.round().astype(int)

In [20]:
#Clean column names
# Lowercase every header and replace spaces with underscores, e.g.
# 'AQI Value' -> 'aqi_value' and 'Pollution Score' -> 'pollution_score'.
country_aqi.columns = country_aqi.columns.str.lower().str.replace(' ', '_')


In [21]:
# Function to get the 3-letter country code
def get_country_code(country_name):
    """Return the 3-letter (``alpha_3``) code pycountry finds for a country.

    ``None`` is returned when pycountry's lookup raises ``LookupError`` for
    ``country_name``.
    """
    try:
        match = pycountry.countries.lookup(country_name)
    except LookupError:
        return None
    return match.alpha_3

# Add a new column with the 3-letter country code
# Names that get_country_code cannot resolve end up as None in 'code'.
country_aqi['code'] = country_aqi['country'].apply(get_country_code)

### AIR POLLUTION MAP

In [None]:
# Merge with the world GeoDataFrame.
# how='left' keeps every country shape, including those without AQI data.
merged = world.merge(country_aqi, how='left', left_on='ISO_A3', right_on='code')

# Plot the map of air pollution scores
fig, ax = plt.subplots(1, 1, figsize=(20, 10))

# Plot the countries with data; countries without a score are light grey.
# `legend` is an on/off switch, not a column reference: the misspelled string
# passed before only worked because a non-empty string is truthy.
merged.plot(column='pollution_score', cmap='OrRd', legend=True, ax=ax, missing_kwds={'color': 'lightgrey'})

ax.axis('off')

plt.title('Air pollution score')
plt.show()

### AIR POLLUTION LOAD IN SQL

In [None]:

# Imports for this section: none of these names are imported by the notebook's
# import cell, so the code below raised NameError when run top to bottom.
import os
from urllib.parse import quote

from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from .env file
load_dotenv()
password = os.getenv("PASSWORD_A")  # Retrieve the password from the .env file
if password is None:
    # Fail with a clear message instead of trying to log in with the literal
    # text "None" as the password.
    raise RuntimeError("PASSWORD_A is not set; add it to the .env file")

bd = "business_trips"  # Name of the database
# Percent-encode the password so characters such as '@', ':' or '/' cannot be
# mistaken for URL delimiters.
connection_string = f'mysql+pymysql://root:{quote(password, safe="")}@localhost/{bd}'  # Connection string
engine = create_engine(connection_string)  # Create the SQLAlchemy engine

# Read the CSV file into a DataFrame
# NOTE(review): absolute path on one developer's machine — this cell will not
# run elsewhere until the file is moved into the project folder.
df = pd.read_csv("/Users/annapisarek/Downloads/AQI_and_Country_Codes_clean.csv")

# Write the table; if_exists='replace' drops and recreates an existing
# air_pollution table.
df.to_sql("air_pollution", con=engine, if_exists='replace', index=False)

# Release the engine's pooled connections once the load is done.
engine.dispose()
