# Transportation CO2 Emission  

In [None]:
!pip install geopy

In [None]:
# Import libraries.
# import os
# import glob
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from geopy.distance import geodesic
from geopy.geocoders import Nominatim

## Import data

In [None]:
# Folder holding the input CSV files, relative to this notebook.
ROOTPATH_DATA = r"../datasets"

# Read the comma-separated training set into a DataFrame.
train_df = pd.read_csv(f"{ROOTPATH_DATA}/train.csv", sep=",")

In [None]:
# Convert the later month columns to nullable integers.
# Values that cannot be parsed as numbers are coerced to <NA> rather than raising.
columns_to_clean = ['Month 2', 'Month 3', 'Month 4']
months_numeric = train_df[columns_to_clean].apply(pd.to_numeric, errors='coerce')
train_df[columns_to_clean] = months_numeric.astype('Int64')

# Upper-case the country code and site identifier columns.
for text_column in ('Country', 'Site'):
    train_df[text_column] = train_df[text_column].str.upper()

In [None]:
def _strip_quotes(column):
    """Remove literal double quotes and surrounding whitespace from a string column."""
    return column.str.replace('"', '', regex=False).str.strip()


# Load the country reference table: name, ISO alpha-2 code and average coordinates.
# The path is built from ROOTPATH_DATA so all datasets are read from the same folder.
country_code_df = pd.read_csv(
    ROOTPATH_DATA + "/countries_codes_and_coordinates.csv",
    usecols=["Country", "Alpha-2 code", "Latitude (average)", "Longitude (average)"],
)

# Clean the alpha-2 code (the raw field carries double quotes and padding).
country_code_df["Alpha-2 code"] = _strip_quotes(country_code_df["Alpha-2 code"]).astype(str)

# Parse the coordinates as plain numbers instead of evaluating file contents as expressions.
# Presumably these fields are quoted like the alpha-2 code -- TODO confirm against the CSV;
# stripping quotes is a no-op when they are absent.
latitudes = pd.to_numeric(_strip_quotes(country_code_df["Latitude (average)"])).astype(float)
longitudes = pd.to_numeric(_strip_quotes(country_code_df["Longitude (average)"])).astype(float)
country_code_df["Coordinates"] = list(zip(latitudes, longitudes))

# Keep only the combined coordinate column and use the short names expected by later merges.
country_code_df = (
    country_code_df
    .drop(columns=["Longitude (average)", "Latitude (average)"])
    .rename(columns={"Alpha-2 code": "Code", "Country": "Name"})
)

country_code_df.head()

In [None]:
def extract_country_code(site):
    """Return the country code encoded as the prefix of a site identifier.

    The code is the text before the first underscore of ``site``. The prefix
    'OOS' is mapped to 'US' (the only exceptional case).

    Parameters
    ----------
    site : str or missing value
        Site identifier such as 'FR_3'.

    Returns
    -------
    The country code as a string. Non-string input (NaN/None for a missing
    site) is returned unchanged instead of raising, so that a later left
    merge simply finds no match for it.
    """
    if not isinstance(site, str):
        return site
    prefix = site.split('_', 1)[0]
    return 'US' if prefix == 'OOS' else prefix

# Derive the site's country code from the 'Site' column and upper-case the destination code.
train_df['Site Country'] = train_df['Site'].apply(extract_country_code)
train_df['Country'] = train_df['Country'].str.upper()

# First lookup: replace the site country code by the country name and attach its coordinates.
train_df = (
    train_df
    .merge(country_code_df, how='left', left_on='Site Country', right_on='Code')
    .drop(columns=['Site Country', 'Code'])
    .rename(columns={'Name': 'Site Country', 'Coordinates': 'Coordinates Site'})
)

# Second lookup: same treatment for the destination country code.
train_df = (
    train_df
    .merge(country_code_df, how='left', left_on='Country', right_on='Code')
    .drop(columns=['Country', 'Code'])
    .rename(columns={'Name': 'Country', 'Coordinates': 'Coordinates Country'})
)

# Preview the enriched frame.
train_df.head(5)

In [None]:
# Replace the two verbose country names by their common short form (applied frame-wide).
short_names = {"Viet Nam": "Vietnam",
               "Taiwan, Province of China": "Taiwan"}
train_df = train_df.replace(short_names)

# List the distinct destination and site country names after the replacement.
for name_column in ("Country", "Site Country"):
    print(train_df[name_column].sort_values().unique())

In [None]:
# Quick look at the first two rows after name normalisation.
train_df.head(n=2)

## Compute distance

In [None]:
# # Define geodesic function to get location (coordinates) and compute distance.


# # Get location function.
# def get_location(country_name, user_agent="hi!ckathon"):
#     geolocator = Nominatim(user_agent=user_agent)
#     location = geolocator.geocode(country_name)
#     return (location.latitude, location.longitude) if location else None


# # Compute distance function.
# # def compute_distance(row, column1, column2):
# #     location1 = row[column1]
# #     location2 = row[column2]
# #     return geodesic(location1, location2).kilometers

In [None]:
# # Compute "Delivery Distance".
# train_df["Destination Coordinates"] = train_df.apply(
#     compute_distance, "Origin Coordinates", "Destination Coordinates"
# )

In [None]:
# country_coordinates_df = pd.read_csv(ROOTPATH_DATA+"/country_coordinates.csv", index_col=0)

## Plot Transportation Lines Graph

In [None]:
# Aggregate sales data by product, site country and destination country.
# `projection` is defined here because no earlier cell defines it; it holds exactly the
# columns the aggregation needs.
# NOTE(review): 'Month 4' is cleaned earlier but is not part of 'Sales' -- confirm this is intended.
group_keys = ['id_product', 'Site Country', 'Country']
month_columns = ['Month 1', 'Month 2', 'Month 3']
projection = group_keys + month_columns

aggregated_df = (
    train_df[projection]
    .groupby(group_keys)
    .agg({month: 'sum' for month in month_columns})
    .reset_index()
)

# Total sales over the three months, then drop the individual month columns.
aggregated_df['Sales'] = aggregated_df[month_columns].sum(axis=1)
aggregated_df = aggregated_df.drop(columns=month_columns)

aggregated_df.head(5)

In [None]:
# One row per distinct (site country, destination country) pair with both coordinate tuples.
# The de-duplication must be applied to this same frame; the previous code referenced an
# undefined `coordinates_df`, so the lookup table was never reduced to unique pairs.
country_coordinates_df = (
    train_df[["Site Country", "Country", "Coordinates Site", "Coordinates Country"]]
    .drop_duplicates()
)
print(country_coordinates_df.shape)
country_coordinates_df.head()

In [None]:
# Give the join keys a 'Left' suffix so they stay distinct from the lookup table's columns.
left_key_names = {'Site Country': 'Site CountryLeft', 'Country': 'CountryLeft'}
aggregated_df = aggregated_df.rename(columns=left_key_names)

# Show the resulting column types.
aggregated_df.dtypes

In [None]:
# Attach the coordinates to the aggregated sales by matching the
# (site country, destination country) pair; rows without a match are kept.
aggregated_df = aggregated_df.merge(
    country_coordinates_df,
    how='left',
    left_on=['Site CountryLeft', 'CountryLeft'],
    right_on=['Site Country', 'Country'],
)

In [None]:
# Preview the merged frame rather than rendering the whole table.
aggregated_df.head(5)

In [None]:
# Rename a leftover generic 'Coordinates' column, if any, to the name used by the cells below.
# `rename` ignores labels that are absent, so this is a no-op when the merge already produced
# 'Coordinates Country'. The previous `drop(columns={...})` call tried to drop the dict key
# 'Coordinates' and raised KeyError because that column does not exist.
aggregated_df = aggregated_df.rename(columns={'Coordinates': 'Coordinates Country'})

aggregated_df.head(5)

In [None]:
# Rows for which a coordinate lookup failed.
# Missing values are real NaN objects, so they must be detected with isna();
# comparing against the string 'nan' never matches them.
missing_coordinates = aggregated_df[['Coordinates Country', 'Coordinates Site']].isna().any(axis=1)
nan_rows = aggregated_df[missing_coordinates]

print(nan_rows)


In [None]:
# Function to calculate distance using Haversine formula
def calculate_distance(row):
    return geodesic((row['Coordinates Country']), (row['Coordinates Site'])).kilometers

# Compute the site-to-destination distance (km) for every aggregated row.
aggregated_df['distance'] = aggregated_df.apply(calculate_distance, axis=1)

# Preview only a few rows; asking for thousands floods the notebook output.
aggregated_df.head(5)

In [None]:
# aggregated_df['CoordinatesCountry'] = aggregated_df['CoordinatesCountry'].apply(lambda x: eval(x))
# aggregated_df['CoordinatesSiteCountry'] = aggregated_df['CoordinatesSiteCountry'].apply(lambda x: eval(x))

# # Function to calculate distance using Haversine formula
# def calculate_distance(row):
#     return geodesic(row['CoordinatesCountry'], row['CoordinatesSiteCountry']).kilometers

# # Apply the function to create a new 'distance' column
# aggregated_df['distance'] = aggregated_df.apply(calculate_distance, axis=1)
# aggregated_df.head(5)

In [None]:
# DON'T UNCOMMENT, this was used to make the file countries_distance

# save_df = aggregated_df.copy()

# columns_to_drop = ['id_product', 'Sales', 'CoordinatesCountry', 'CoordinatesSiteCountry']

# # Drop the specified columns
# save_df.drop(columns=columns_to_drop, inplace=True)

# columns_to_check_duplicates = ['Site Country', 'Country', 'distance']

# # Keep only unique rows based on the specified columns
# save_df = save_df.drop_duplicates(subset=columns_to_check_duplicates)

# save_df.head(5)
# save_df.to_csv('countries_distance.csv', index=False)


In [None]:
# countries = pd.DataFrame(train_df["Country"].sort_values().unique(), columns=["Country"])
# countries = countries[~countries["Country"].str.contains("Taiwan")]
# countries["Coordinates"] = countries["Country"].apply(get_location)
# countries.to_csv("../datasets/country_coordinates.csv")

Source: [Our World in Data – carbon footprint by travel mode](https://ourworldindata.org/grapher/carbon-footprint-travel-mode)

|Transportation Mode| Consumption Emission|
|------|-------------|
|Plane | 175 gCO2e/km|
|Motorbike | 92 gCO2e/km|
|Petrol car | 90 gCO2e/km|
|Diesel car| 90 gCO2e/km|
|Train| 28 gCO2e/km|
|Electric car| 29 gCO2e/km|
|Coach| 22 gCO2e/km|
|Long Haul Truck| 57 gCO2e/km|
|Roll-on/Roll-off Ferry| 52 gCO2e/km|

In [None]:
# Average emission factor over Plane, Roll-on/Roll-off Ferry, Long Haul Truck, Train and
# Petrol car from the table above: (175 + 52 + 57 + 28 + 90) / 5 = 80.4
vehicle_co2_emission = 80.4
# NOTE(review): reference value the result is divided by; its unit is not stated here -- confirm.
french_co2_emission = 4.46


def _emission_ratio(row):
    """Transport emission for the row's distance, per sale when sales are non-zero, over the French reference."""
    transport_emission = row['distance'] * vehicle_co2_emission
    if row['Sales'] != 0:
        # Spread the transport emission over the units sold; zero sales leave it undivided.
        transport_emission = transport_emission / row['Sales']
    return transport_emission / french_co2_emission


aggregated_df['CO2 Emission Ratio French'] = aggregated_df.apply(_emission_ratio, axis=1)


In [None]:
# First five rows, now including the emission ratio column.
aggregated_df.head()

In [None]:
# # Find the minimum distance value
# min_distance = aggregated_df['distance'].min()

# # Find the value that is immediately greater than the minimum distance value
# next_value = aggregated_df.loc[aggregated_df['distance'] > min_distance, 'distance'].min()

# # Display the result
# print(f'The value that is immediately greater than the minimum distance is: {next_value}')

In [None]:
# Smallest strictly positive distance.
is_positive_distance = aggregated_df["distance"] > 0
aggregated_df.loc[is_positive_distance, "distance"].min()

In [None]:
# Scatter plot of total sales against the CO2 emission ratio.
fig, ax = plt.subplots()
ax.scatter(aggregated_df['Sales'], aggregated_df['CO2 Emission Ratio French'])
ax.set_title('Sales vs. CO2 Emission')
ax.set_xlabel('Sales')
ax.set_ylabel('CO2 Emission Ratio French')
plt.show()

In [None]:
# Bar chart of total sales per destination country, sorted in ascending order.
sales_by_country = aggregated_df.groupby('Country')['Sales'].sum().sort_values()
ax = sales_by_country.plot(kind='bar', color='skyblue')
ax.set_title('Sales by Country')
ax.set_xlabel('Country')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Bar chart of the 'Sales' total grouped by site country.
stocks_by_country = aggregated_df.groupby('Site Country')['Sales'].sum()
ax = stocks_by_country.plot(kind='bar', color='skyblue')
ax.set_title('Total Stocks per Country')
ax.set_xlabel('Country')
ax.set_ylabel('Total Stocks in Sites')
plt.show()

In [None]:
# First five site-country totals.
stocks_by_country.head()