# HUFFLEPUFF BUSINESS TRAVEL ANALYSIS

## SOURCES


# BUSINESS PROBLEM #1
- We want our employees to only travel to green countries

## HYPOTHESIS 
- Some countries have lower air travel CO2 emissions per passenger
- In some countries the share of CO2 emissions created by domestic flights surpasses the share created by international flights
- Some countries have lower total CO2 emissions

- Benjamin    : Air travel
- Ricardo     : Energy 
- Anna        : Air pollution 
- Xinly       : Plastic pollution 
- Jp          : Deforestation 

In [117]:
#TODO rename owid codes to iso codes
#TODO export to sql from python

In [1]:
#imports
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
#!pip install pycountry
import pycountry

### POPULATION CLEANING

In [119]:
# Load the raw population table from the local sources folder
population_csv = 'sources/population.csv'
populations_df = pd.read_csv(population_csv)

In [None]:
# Short column names; this mapping is reused when the deforestation table is loaded
col_names = {
    'Entity': 'country',
    'Code': 'code',
    'Year': 'year',
    'Population - Sex: all - Age: all - Variant: estimates': 'population',
}
populations_df = populations_df.rename(columns=col_names)
populations_df.head()


In [None]:
# Population by country, latest year: keep the row with the highest year per country
latest_year_rows = populations_df.groupby('country')['year'].idxmax()
populations_df_latest = populations_df.loc[latest_year_rows]
populations_df_latest.head()

In [None]:
# Discard every row that has a missing value in any column
populations_df_latest = populations_df_latest.dropna(axis=0, how='any')
populations_df_latest.head()


In [123]:
# Write the cleaned population table to disk (the row index is not written)
clean_population_csv = 'sources/clean/population-clean.csv'
populations_df_latest.to_csv(clean_population_csv, index=False)

### DEFORESTATION CLEANING

In [None]:
# Load the imported-deforestation table and apply the shared column renaming
forest_csv = 'sources/imported-deforestation.csv'
forest_df = pd.read_csv(forest_csv).rename(columns=col_names)
forest_df.head()

In [None]:
# Imported deforestation by country, latest year: keep the row with the highest year per country
latest_forest_rows = forest_df.groupby('country')['year'].idxmax()
forest_df_latest = forest_df.loc[latest_forest_rows]
forest_df_latest.head()

In [126]:
# Write the cleaned deforestation table to disk (the row index is not written)
clean_forest_csv = 'sources/clean/imported-deforestation-clean.csv'
forest_df_latest.to_csv(clean_forest_csv, index=False)

### MAPPING

In [127]:
# Read the country polygons from the local Natural Earth shapefile
world_shapefile = 'maps/110m_cultural/ne_110m_admin_0_countries.shp'
world = gpd.read_file(world_shapefile)

In [128]:
# Merge with the world GeoDataFrame
# Some Natural Earth releases store the placeholder '-99' in ISO_A3 for several
# countries (e.g. France, Norway), which would leave them without data on the map.
# For those rows only, fall back to the ADM0_A3 code when the column is available.
join_code = world['ISO_A3']
if 'ADM0_A3' in world.columns:
    join_code = join_code.where(join_code != '-99', world['ADM0_A3'])

# Left join keeps every country shape, including those with no deforestation row
merged = world.assign(join_code=join_code).merge(
    forest_df_latest, how='left', left_on='join_code', right_on='code'
)

In [None]:
# Choropleth of imported deforestation on a single wide figure
fig, ax = plt.subplots(figsize=(20, 10))

# Shade each country by its value; countries without a value are drawn light grey
merged.plot(
    column='imported_deforestation',
    cmap='OrRd',
    legend=False,
    ax=ax,
    missing_kwds={'color': 'lightgrey'},
)

# Hide the axis frame and ticks, then label the figure
ax.set_axis_off()
ax.set_title('World Map')
plt.show()

### AIR POLLUTION CLEANING

In [130]:
# Read the AQI dataset straight from the project's GitHub repository
file_path = 'https://raw.githubusercontent.com/jipijipi/sql-database/main/sources/AQI%20and%20Lat%20Long%20of%20Countries.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [131]:
# Convert AQI Value to numeric; non-numeric entries are coerced to NaN
df['AQI Value'] = pd.to_numeric(df['AQI Value'], errors='coerce')

# Group by country and calculate the mean AQI value per country
# (mean() skips NaN readings)
country_aqi = df.groupby('Country')['AQI Value'].mean().reset_index()

# A country whose readings were all coerced to NaN has a NaN mean. It cannot be
# scored and would make the integer cast below raise, so it is left out.
country_aqi = country_aqi.dropna(subset=['AQI Value']).reset_index(drop=True)

# Min-Max normalization to scale AQI values between 1 and 10
min_aqi = country_aqi['AQI Value'].min()
max_aqi = country_aqi['AQI Value'].max()
aqi_range = max_aqi - min_aqi

if aqi_range == 0:
    # Every country has the same mean: avoid 0/0 and give all of them the lowest score
    country_aqi['Pollution Score'] = 1.0
else:
    country_aqi['Pollution Score'] = 1 + 9 * (country_aqi['AQI Value'] - min_aqi) / aqi_range

# Round and convert the Pollution Score to integers
country_aqi['Rounded Pollution Score'] = country_aqi['Pollution Score'].round().astype(int)

In [132]:
# Resolve a country name to its 3-letter code via pycountry
def get_country_code(country_name):
    """Return the ``alpha_3`` code pycountry finds for *country_name*.

    Returns ``None`` when the lookup raises ``LookupError``.
    """
    try:
        match = pycountry.countries.lookup(country_name)
    except LookupError:
        return None
    return match.alpha_3

# Look up the 3-letter code for every country name (None where pycountry finds no match)
country_aqi['Country Code'] = country_aqi['Country'].map(get_country_code)

### ENERGY

In [3]:
import numpy as np
import pymysql, os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import pandas as pd

In [None]:
# imports energy file
import re

energy = pd.read_csv("sources/owid-energy-data.csv")

## starts cleaning data

# Name prefixes of entries that are not individual countries (continents,
# income groups, organisations, ...). They are kept as literal text and
# escaped below, so the dots in 'U.S.' only match real dots.
non_country_prefixes = [
    'Asia', 'Africa', 'South Am', 'South and C', 'Europe', 'Oceania', 'America', 'CIS',
    'Central & South', 'United States Pac', 'United States Territor', 'World', 'U.S.',
    'Upper-middle', 'United States Virgin', 'Middle Africa', 'Wake', 'Middle East',
    'Eastern Africa', 'Western Africa', 'G7', 'Lower-middle', 'Low-income', 'Latin America',
    'G20', 'Non-OPEC', 'Non-OECD', 'North Ame', 'High-inc', 'OPEC', 'OECD', 'Central America',
    'Persian G', 'Central and South America', 'Australia and New Zealand', 'ASEAN', 'EU',
    'Antarctica', 'Pacific', 'Caribbean', 'Ember',
]

# Anchored, non-capturing alternation: a capturing group would make
# str.contains emit a "pattern has match groups" UserWarning.
pattern = '^(?:' + '|'.join(re.escape(prefix) for prefix in non_country_prefixes) + ')'

# Create a mask for country entries (True where the name matches no prefix, ignoring case)
mask = ~energy['country'].str.contains(pattern, case=False, regex=True)

# Filter the DataFrame to keep only country entries
energy = energy[mask].reset_index(drop=True)

In [7]:
## keep only the rows whose year is after 2000 (working on an independent copy)
en2 = energy.loc[energy["year"] > 2000].copy()

In [8]:
# Drop every column that holds more than 2000 missing values
nan_per_column = en2.isna().sum(axis=0)
sparse_columns = list(nan_per_column[nan_per_column > 2000].index)
en3 = en2.drop(columns=sparse_columns)

In [9]:
# Transpose so that every original row becomes a column, then drop the
# columns (original rows) that hold more than 2000 missing values.
# NOTE(review): after the transpose a column has only one entry per remaining
# field, so this threshold may never be reached - confirm the intended limit.
en4 = en3.T.copy()

nan_per_row = en4.isna().sum(axis=0)
sparse_rows = list(nan_per_row[nan_per_row > 2000].index)
en4.drop(columns=sparse_rows, inplace=True)



In [None]:
## scoring system: score = 10 - min(round(greenscore/10 + biofuel_share_elec/20 + population/1e9), 10)
# NOTE(review): the earlier note on this formula also mentioned a bound of 1
# on the result; the code applies no such bound - confirm which is intended.
en5 = en4.T.copy()

# Drop the columns that are not used as scoring parameters
unused_markers = ('_electricity', 'prod', 'primary')
unused_columns = [col for col in en5.columns if any(marker in col for marker in unused_markers)]
en5.drop(columns=unused_columns, inplace=True)

#display(en5.columns)
# Two candidate "green" shares; the larger one is used
low_carbon_shares = ['hydro_share_elec', 'nuclear_share_elec', 'solar_share_elec', 'wind_share_elec']
en5["greenscore1"] = en5[low_carbon_shares].sum(axis=1)
en5["greenscore2"] = en5['renewables_share_elec']
en5["greenscore"] = en5[['greenscore1', 'greenscore2']].max(axis=1)

# Remainder of the 100 % that is not counted as green
en5["blackscore"] = 100 - en5["greenscore"].fillna(0)

# Missing inputs count as 0; the rounded sum is capped at 10 before being subtracted from 10
raw_score = (
    en5["greenscore"].fillna(0) / 10
    + en5["biofuel_share_elec"].fillna(0) / 20
    + en5["population"].fillna(0) / 1000000000
)
en5["score"] = 10 - raw_score.round(0).clip(upper=10)


#en5_2 = en5.groupby("country")["score"].mean()

#en5_2


In [None]:
## Loads SQL
from urllib.parse import quote_plus

# Read PASSWORD from the environment (load_dotenv also picks up a local .env file)
load_dotenv()
password = os.getenv("PASSWORD")
if password is None:
    # Fail with a clear message instead of a TypeError on string concatenation
    raise RuntimeError("PASSWORD is not set; define it in the environment or in a .env file")

bd = "energy"
# Percent-encode the password so characters such as '@', ':' or '/' cannot break the URL
connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/' + bd
engine = create_engine(connection_string)
engine

In [None]:
## export the scored table to CSV and to the SQL database

filename = "energy_clean1"

# CSV copy: missing values replaced by 0
en6 = en5.fillna(0)
en6.to_csv(f"sources/{filename}.csv")

# SQL table: replaces any existing table of the same name
# NOTE(review): the SQL table is written from en5 (missing values kept), while
# the CSV is written from en6 (missing values set to 0) - confirm this is intended.
en5.to_sql(filename, con=engine, if_exists='replace')



In [None]:
import pymysql
from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey
from sqlalchemy import inspect
from sqlalchemy.sql import text

## groupby country: rounded average score per country, lowest first

statement="SELECT country,iso_code, round(avg(score),0) AS en_score FROM energy_clean1 GROUP BY country, iso_code ORDER BY en_score ASC;"

with engine.connect() as con:
    try:
        # Execute the query to fetch results
        en_score = pd.read_sql(statement, con)

    except Exception as e:
        # Report the failure, then re-raise: continuing would leave en_score
        # undefined and hide the real error behind a NameError below.
        print(f"An error occurred: {e}")
        raise

en_score

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
def _ranked_energy_query(direction):
    """Return the per-country energy summary query limited to five rows.

    direction: "ASC" or "DESC", the sort order applied to en_score.
    Raises ValueError for any other value.
    """
    if direction not in ("ASC", "DESC"):
        raise ValueError(f"direction must be 'ASC' or 'DESC', got {direction!r}")
    return (
            "SELECT "
                "country, "
                "iso_code, "
                "ROUND(AVG(score),0) AS en_score, "
                "ROUND(AVG(greenscore),0) AS gr_score, "
                "AVG(biofuel_share_elec) AS avg_biofuel, "
                "AVG(coal_share_elec) AS avg_coal, "
                "AVG(hydro_share_elec) AS avg_hydro, "
                "AVG(gas_share_elec) AS avg_gas, "
                "AVG(nuclear_share_elec) AS avg_nuclear, "
                "AVG(oil_share_elec) AS avg_oil, "
                "AVG(solar_share_elec) AS avg_solar, "
                "AVG(wind_share_elec) AS avg_wind "
            "FROM "
                "energy_clean1 "
            "GROUP BY "
                "country, "
                "iso_code "
            "ORDER BY "
                f"en_score {direction} "
            "LIMIT 5"
            ";"
            )


# Five countries with the lowest average score
statement2 = _ranked_energy_query("ASC")

# Five countries with the highest average score
statement3 = _ranked_energy_query("DESC")

with engine.connect() as con:
    try:
        # Execute both queries to fetch results
        graph1 = pd.read_sql(statement2, con)
        graph2 = pd.read_sql(statement3, con)

    except Exception as e:
        # Report the failure, then re-raise: the concat below needs both
        # frames, so continuing would only fail later with a NameError.
        print(f"An error occurred: {e}")
        raise

# Stack the two result sets into one table with a fresh index
graphics = pd.concat([graph1, graph2], ignore_index=True)
graphics

In [None]:
# Reshape to long form: one row per (country, energy source) pair
share_columns = ['avg_biofuel', 'avg_coal', 'avg_hydro', 'avg_gas',
                 'avg_nuclear', 'avg_oil', 'avg_solar', 'avg_wind']
df_melted = pd.melt(
    graphics,
    id_vars='country',
    value_vars=share_columns,
    var_name='Energy Source',
    value_name='Average Value',
)

# Grouped bars: one group per country, one bar per energy source
sns.barplot(data=df_melted, x='country', y='Average Value', hue='Energy Source')

# Titles, axis labels and legend
plt.title('Average Energy Shares by Country')
plt.xlabel('Country')
plt.ylabel('Average Value (%)')
plt.xticks(rotation=45)
plt.legend(title='Energy Source')

# Fit the layout and render
plt.tight_layout()
plt.show()

In [None]:
# Averages over the whole energy_clean1 table: the rounded mean score
# followed by the mean share of each energy source.
energy_sources = ("biofuel", "coal", "hydro", "gas", "nuclear", "oil", "solar", "wind")
select_items = ["ROUND(AVG(score),0) AS en_score"]
select_items += [f"AVG({source}_share_elec) AS avg_{source}" for source in energy_sources]

statement4 = "SELECT " + ", ".join(select_items) + " FROM energy_clean1 "

with engine.connect() as con:
    try:
        # Execute the query to fetch results
        graph4 = pd.read_sql(statement4, con)

    except Exception as e:
        # Report the failure, then re-raise: continuing would leave graph4
        # undefined and hide the real error behind a NameError below.
        print(f"An error occurred: {e}")
        raise

graph4

In [None]:
# Build one table per energy type from the first row of graph4
renewable_columns = ('avg_hydro', 'avg_solar', 'avg_wind', 'avg_biofuel')
non_renewable_columns = ('avg_coal', 'avg_gas', 'avg_oil', 'avg_nuclear')

renewable_df = pd.DataFrame({
    'Source': ['Hydro', 'Solar', 'Wind', '"Biofuel"'],
    'Average Value': [graph4.loc[0, column] for column in renewable_columns],
})
non_renewable_df = pd.DataFrame({
    'Source': ['Coal', 'Gas', 'Oil', 'Nuclear'],
    'Average Value': [graph4.loc[0, column] for column in non_renewable_columns],
})

# Tag each table with its energy type
renewable_df['Type'] = 'Renewable'
non_renewable_df['Type'] = 'Non-Renewable'

# Stack both tables: renewable rows first, then non-renewable
final_df = pd.concat([renewable_df, non_renewable_df], ignore_index=True)

# One bar per source, coloured by energy type
sns.barplot(data=final_df, x='Source', y='Average Value', hue='Type')

# Titles and axis labels
plt.title('Average Energy Shares: Renewable vs Non-Renewable')
plt.xlabel('Energy Source')
plt.ylabel('Average Value (%)')
plt.xticks(rotation=45)

# Fit the layout, add the legend and render
plt.tight_layout()
plt.legend(title='Energy Type')
plt.show()

In [None]:
# Sum the first-row averages of graph4 for each energy type
renewable_parts = ['avg_biofuel', 'avg_hydro', 'avg_solar', 'avg_wind']
non_renewable_parts = ['avg_coal', 'avg_gas', 'avg_oil', 'avg_nuclear']

total_renewable = sum(graph4[column][0] for column in renewable_parts)
total_non_renewable = sum(graph4[column][0] for column in non_renewable_parts)

# Two-row table feeding the pie chart
pie_data = pd.DataFrame({
    'Type': ['Renewable', 'Non-Renewable'],
    'Average Value': [total_renewable, total_non_renewable]
})

# Square figure for the pie
plt.figure(figsize=(8, 8))

# Slices labelled by type, with the percentage printed to one decimal place
plt.pie(
    pie_data['Average Value'],
    labels=pie_data['Type'],
    autopct='%1.1f%%',
    startangle=140,
)

plt.title('Renewable vs Non-Renewable Energy Averages')

# Equal aspect ratio ensures that the pie is drawn as a circle
plt.axis('equal')
plt.show()

In [None]:
# Resolve a country name to its 3-letter code via pycountry
def get_country_code(country_name):
    """Return the ``alpha_3`` code pycountry finds for *country_name*.

    Returns ``None`` when the lookup raises ``LookupError``.
    """
    try:
        match = pycountry.countries.lookup(country_name)
    except LookupError:
        return None
    return match.alpha_3

# Look up the 3-letter code for every country name (None where pycountry finds no match)
en_score['code'] = en_score['country'].map(get_country_code)
en_score

In [25]:
# Dump the scores to CSV so that codes can be fixed by hand
checkout_csv = "checkout.csv"
en_score.to_csv(checkout_csv)

In [None]:
import geopandas as gpd
import matplotlib.colors as mcolors
import pycountry

# Scores are read back from checkout2.csv
# (presumably the hand-corrected version of checkout.csv - confirm)
en_score2 = pd.read_csv("checkout2.csv")

# Country polygons from the local Natural Earth shapefile
world = gpd.read_file('maps/110m_cultural/ne_110m_admin_0_countries.shp')

# Left join keeps every country shape, including those without a score
# NOTE(review): this join uses SOV_A3, while the deforestation map joins on
# ISO_A3 - confirm that the codes in checkout2.csv match SOV_A3 values.
m1 = world.merge(en_score2, how='left', left_on='SOV_A3', right_on='code')

# Two-colour gradient: the lowest score maps to green, the highest to red
colors = ['#1dbf1d', '#e30f0b']
custom_cmap = mcolors.LinearSegmentedColormap.from_list("custom_reds", colors)

# Single wide figure for the map
fig, ax = plt.subplots(figsize=(20, 10))

# Shade each country by its score; countries without a score are drawn dark grey
m1.plot(
    column='en_score',
    cmap=custom_cmap,
    legend=False,
    ax=ax,
    missing_kwds={'color': 'darkgrey'},
)

# Colorbar spanning the observed score range
score_norm = plt.Normalize(vmin=m1['en_score'].min(), vmax=m1['en_score'].max())
sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=score_norm)
sm.set_array([])  # Needed for older versions of matplotlib
plt.colorbar(sm, ax=ax, label='Energy Score')

# Hide the axis frame, set the title and render
ax.set_axis_off()
plt.title('World Map by Green Energy')
plt.show()