## Measuring environmental impact

In [None]:
!pip install geopy

In [None]:
# Import libraries.
import os
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.distance import geodesic
from geopy.geocoders import Nominatim

In [None]:
# Folder (relative to the notebook) that holds every dataset read below.
ROOTPATH_DATA = r"../datasets"
# The main training table is stored as a semicolon-separated file.
main_df = pd.read_csv(f"{ROOTPATH_DATA}/train-data.csv", sep=";")

In [None]:
# Report (rows, columns) of the main table, then preview its first 3 rows.
print((len(main_df.index), len(main_df.columns)))
main_df.iloc[:3]

In [None]:
# Load every CSV of the extra dataset, keyed by its file name without extension.
csv_files = glob.glob(fr"{ROOTPATH_DATA}/extra-dataset/*.csv")
dfs = {}
for csv_path in csv_files:
    stem, _extension = os.path.splitext(os.path.basename(csv_path))
    dfs[stem] = pd.read_csv(csv_path, sep=",")

# NOTE(review): this unpacking depends on the order in which glob returns the
# files, which is not guaranteed, and on exactly four CSVs being present.
# Looking the frames up by their key in `dfs` would be safer — confirm the
# file names before changing it.
gscpi_df, wbeco_df, lpi_df, wbinfla_df = dfs.values()

In [None]:
# Report (rows, columns) of the World-Bank economics table and show 2 rows.
print((len(wbeco_df.index), len(wbeco_df.columns)))
wbeco_df.iloc[:2]

In [None]:
# Report (rows, columns) of the LPI table and show its first 2 rows.
print((len(lpi_df.index), len(lpi_df.columns)))
lpi_df.iloc[:2]

In [None]:
# Preview the first 2 rows of the inflation table.
wbinfla_df.iloc[:2]

In [None]:
def get_location(country_name, geolocator=None):
    """Geocode a country name into a ``(latitude, longitude)`` tuple.

    Args:
        country_name: Query handed to the geocoder. ``None``, NaN and blank
            strings are treated as "no location" and are never sent out.
        geolocator: Optional object exposing ``geocode(query)``. Pass one
            shared instance when geocoding many names so a client is not
            rebuilt for every call. Defaults to a new ``Nominatim`` client.

    Returns:
        ``(latitude, longitude)``, or ``None`` when the name is missing or
        the geocoder finds no match.
    """
    # Missing values (e.g. NaN cells from a DataFrame column) cannot be
    # geocoded; answer locally instead of issuing a pointless request.
    is_nan = isinstance(country_name, float) and country_name != country_name
    is_blank = isinstance(country_name, str) and not country_name.strip()
    if country_name is None or is_nan or is_blank:
        return None

    if geolocator is None:
        geolocator = Nominatim(user_agent="hickathon")
    location = geolocator.geocode(country_name)
    return (location.latitude, location.longitude) if location else None


# Small sample: the first five distinct countries of the inflation table.
sample_countries = wbinfla_df["Country"].unique()[:5]
test_df = pd.DataFrame(sample_countries, columns=["Country"])
# Geocode each sampled country (one lookup per row).
test_df["Coordinates"] = test_df["Country"].apply(get_location)
# Pair every row with another country by reversing the coordinate column:
# after the index reset, row i holds the coordinates of row n-1-i.
series = test_df["Coordinates"].iloc[::-1].reset_index()
test_df["Coordinates2"] = series["Coordinates"]

In [None]:
# Show the reversed coordinate column.
test_df.loc[:, "Coordinates2"]

In [None]:
def compute_distance(row):
    """Return the geodesic distance in kilometres between a row's two points.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``"Coordinates"`` and ``"Coordinates2"``, each holding a
            ``(latitude, longitude)`` pair or a missing value.

    Returns:
        The distance in kilometres, or NaN when either point is missing.
    """
    endpoints = (row["Coordinates"], row["Coordinates2"])
    for point in endpoints:
        # A failed geocoding leaves None (or NaN) in the column; geodesic()
        # cannot handle that, so report "unknown distance" instead of raising.
        if point is None or (isinstance(point, float) and point != point):
            return float("nan")
    return geodesic(*endpoints).kilometers

# NOTE(review): rows whose geocoding failed hold None in "Coordinates" /
# "Coordinates2"; drop them first (test_df.dropna(subset=[...])) if
# compute_distance is not expected to cope with missing points.

# Row-wise distance between the two coordinate columns, in kilometres.
distances_km = test_df.apply(compute_distance, axis=1)
test_df["Distance"] = distances_km

test_df

## Supply chain data

In [None]:
# De-duplicate the training table. The explicit copy makes main_df2 an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
main_df2 = main_df.drop_duplicates().copy()

In [None]:
# Report (rows, columns) of the de-duplicated table.
print((len(main_df2.index), len(main_df2.columns)))

In [None]:
# Lookup table of country codes; the merges below use its "Code" and "Name"
# columns. Read from the shared dataset root like the other inputs.
# NOTE(review): read_csv parses the literal text "NA" as a missing value by
# default; if the code "NA" occurs in this file, pass keep_default_na=False.
country_code_df = pd.read_csv(f"{ROOTPATH_DATA}/country_codes.txt")

# Task 1: derive "Site Code" from the first two characters of "Site".
# assign() builds a new frame instead of writing into main_df2 in place, so
# no SettingWithCopyWarning is raised on the de-duplicated table.
main_df2 = main_df2.assign(**{'Site Code': main_df2['Site'].str[:2]})

# Task 2: attach the country name of each site as "Site Country", then drop
# the helper code columns used for the join.
merged_df = (
    main_df2
    .merge(country_code_df, left_on='Site Code', right_on='Code', how='left')
    .drop(columns=["Code", "Site Code"])
    .rename(columns={'Name': 'Site Country'})
)

# Task 3: replace the "Country" code column with the matching country name.
merged_df = (
    merged_df
    .merge(country_code_df, left_on='Country', right_on='Code', how='left')
    .drop(columns=["Code", "Country"])
    .rename(columns={'Name': 'Country'})
)

In [None]:
# main_df2 = merged_df.copy()
# main_df2

In [None]:
# Comma-separated training file stored under the shared dataset root.
train_df = pd.read_csv(f"{ROOTPATH_DATA}/train.csv", sep=",")

In [None]:
# train_df["Country"].value_counts().plot(kind="bar")

In [None]:
# train_df["Customer Persona proxy"].value_counts()

In [None]:
# train_df["Product  Line proxy"].value_counts().plot(kind="bar")

In [None]:
# Count the rows of train_df whose "Country" value is missing.
train_df["Country"].isnull().sum()

In [None]:
# Rows whose site code is "SG", and how often each "Country" value occurs
# among them.
mask_singapore = main_df2["Site Code"].eq("SG")
main_df2.loc[mask_singapore, "Country"].value_counts()

## Global Supply Chain Pressure Index

In [None]:
# Parse the period column into timestamps so it can be used on a time axis.
gscpi_df['Year-Month'] = pd.to_datetime(gscpi_df['Year-Month'])

print("GSCPI shape:", gscpi_df.shape)

print(f"Missing values\n{gscpi_df.isna().sum()}")

# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
gscpi_df.info()

In [None]:
# Reference dates every 4 months, from September 2020 up to the latest
# period available in the GSCPI table.
date_list = pd.date_range(
    start='2020-09',
    end=gscpi_df['Year-Month'].max(),
    freq='4MS',
)

# GSCPI time series.
gscpi_dates = gscpi_df['Year-Month']
gscpi_values = gscpi_df['GSCPI']
plt.plot(gscpi_dates, gscpi_values, marker='o', label='GSCPI')

# One dashed red vertical line per reference date.
for marker_date in date_list:
    plt.axvline(marker_date, color='r', linestyle='--', linewidth=0.8)

# Labels, legend and layout.
plt.title('GSCPI')
plt.xlabel('Time')
plt.ylabel('GSCPI')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## World-Bank economics data

In [None]:
# Shape of the World-Bank economics table, a blank line, then the number of
# missing values per column.
print("WBEco data shape:", wbeco_df.shape, end="\n\n")
missing_per_column = wbeco_df.isna().sum()
print(f"Missing values count\n{missing_per_column}")

In [None]:
# Keep only the rows describing the "World" aggregate.
world_df = wbeco_df.loc[wbeco_df['Country'] == 'World']

# Columns whose name contains 'growth'.
growth_columns = world_df.filter(like='growth')

# Mean of every growth column per year.
avg_growth_by_year = (
    world_df
    .groupby('Year')[growth_columns.columns]
    .mean()
    .reset_index()
)

# Long format: one row per (year, indicator) pair.
avg_growth_melted = avg_growth_by_year.melt(
    id_vars='Year',
    var_name='Indicator',
    value_name='Average Growth Rate',
)

# Grouped bar chart: one group per year, one bar per indicator.
plt.figure(figsize=(12, 8))
sns.barplot(
    data=avg_growth_melted,
    x='Year',
    y='Average Growth Rate',
    hue='Indicator',
)
plt.title('Average Growth Rate Over Time for World')
plt.xlabel('Year')
plt.ylabel('Average Growth Rate')
plt.legend(title='Indicator', loc='upper right')
plt.show()

## World Inflation

In [None]:
# Make sure the period column holds timestamps.
wbinfla_df['Year-Month'] = pd.to_datetime(wbinfla_df['Year-Month'])

# Mean of both price indices per period (averaged over all rows sharing the
# same 'Year-Month').
price_indices = ['Energy Price Index', 'Headline Consumer Price Index']
grouped_df = (
    wbinfla_df
    .groupby('Year-Month')[price_indices]
    .mean()
    .reset_index()
)

# One line per index, labelled with the index name.
plt.figure(figsize=(10, 6))
for index_name in price_indices:
    plt.plot(
        grouped_df['Year-Month'],
        grouped_df[index_name],
        label=index_name,
        marker='o',
    )

plt.title('Average Energy Price Index and Headline Consumer Price Index over Year-Month')
plt.xlabel('Year-Month')
plt.ylabel('Index')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()

# Render the figure.
plt.show()