## Pre-processing Phase

# TO use only when we want to derive new data from the actual dataset 

In [2]:
import pandas as pd

# Load the CSV file
dataset_path = "../datasets/co-emissions-per-capita.csv"

df = pd.read_csv(dataset_path, sep=',')  # Change sep to ';' if needed

# Display the first few rows to verify
print(df.head())

num_rows, num_columns = df.shape

print(f"Number of rows: {num_rows}")
print(f"Number of coloumns: {num_columns}")


        Entity Code  Year  Annual CO₂ emissions (per capita)
0  Afghanistan  AFG  1949                           0.001992
1  Afghanistan  AFG  1950                           0.011266
2  Afghanistan  AFG  1951                           0.012098
3  Afghanistan  AFG  1952                           0.011946
4  Afghanistan  AFG  1953                           0.013685
Number of rows: 26600
Number of coloumns: 4


## Example

In [2]:

selection = (df['Entity'] == 'Afghanistan') & (df['Year'] == 1950)
afghanistan_1950_data = df[selection]


print(afghanistan_1950_data.head())


        Entity Code  Year  Annual CO₂ emissions (per capita)
1  Afghanistan  AFG  1950                           0.011266


---------------------

## First Plot - Choose one Year 
- Choose one Year and plot via Bar Chart the Top 10 Entity which use more CO2 emission per capita 


In [3]:
# Choosing the 2012 
selection = (df['Year'] == 2012)
df_year_2012 = df[selection]


print(df_year_2012.head())

          Entity Code  Year  Annual CO₂ emissions (per capita)
63   Afghanistan  AFG  2012                           0.329389
291       Africa  NaN  2012                           1.132187
381      Albania  ALB  2012                           1.676950
488      Algeria  DZA  2012                           3.641230
716      Andorra  AND  2012                           6.860168


-------------

## Choose one decade (average)

- Choose one Decade and plot via Bar Chart the Top 10 Entity which on average use more CO2 emission per capita 

In [15]:
# Create a new coloumn 'Decade' raggrupping the years by decade
df['Decade'] = (df['Year'] // 10) * 10

decade_entity_avg = df.groupby(['Decade', 'Entity'])['Annual CO₂ emissions (per capita)'].mean().reset_index()

decade_entity_avg_sorted = decade_entity_avg.sort_values(by='Annual CO₂ emissions (per capita)', ascending=False)

print(decade_entity_avg_sorted)

      Decade                     Entity  Annual CO₂ emissions (per capita)
1439    1950  Sint Maarten (Dutch part)                         537.207977
1654    1960  Sint Maarten (Dutch part)                         284.038517
1872    1970  Sint Maarten (Dutch part)                         155.856916
1321    1950                    Curacao                          83.080772
1747    1970                    Curacao                          75.321828
...      ...                        ...                                ...
74      1770                    Oceania                           0.000000
75      1770           Papua New Guinea                           0.000000
1189    1940                     Guinea                           0.000000
77      1770                  Singapore                           0.000000
45      1760              North America                           0.000000

[3057 rows x 3 columns]


-------------------

# Data of the European Union member countries selecting only entity, code, year, emission CO2

In [4]:

ue_country = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 
    'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 
    'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 
    'Slovenia', 'Spain', 'Sweden'
]

df_ue = df[df['Entity'].isin(ue_country)]

print(df_ue)

        Entity Code  Year  Annual CO₂ emissions (per capita)
1965   Austria  AUT  1807                           0.053946
1966   Austria  AUT  1819                           0.075490
1967   Austria  AUT  1820                           0.099019
1968   Austria  AUT  1821                           0.106077
1969   Austria  AUT  1822                           0.107694
...        ...  ...   ...                                ...
22858   Sweden  SWE  2018                           4.136523
22859   Sweden  SWE  2019                           3.993607
22860   Sweden  SWE  2020                           3.538104
22861   Sweden  SWE  2021                           3.680562
22862   Sweden  SWE  2022                           3.606909

[4179 rows x 4 columns]


In [6]:
df_eu_ord = df_ue.sort_values(by='Annual CO₂ emissions (per capita)', ascending=False)
print(df_eu_ord)


           Entity Code  Year  Annual CO₂ emissions (per capita)
14508  Luxembourg  LUX  1974                          41.047718
14507  Luxembourg  LUX  1973                          40.593910
14504  Luxembourg  LUX  1970                          40.475006
14506  Luxembourg  LUX  1972                          38.988323
14503  Luxembourg  LUX  1969                          38.967620
...           ...  ...   ...                                ...
13855   Lithuania  LTU  1833                           0.000202
13347      Latvia  LVA  1830                           0.000174
13350      Latvia  LVA  1833                           0.000166
13854   Lithuania  LTU  1832                           0.000164
13349      Latvia  LVA  1832                           0.000135

[4179 rows x 4 columns]


In [8]:
df_eu_ord.to_csv("../datasets/co-emissions-per-capita-ue.csv", index=False)


----------------------

# First select Data from EU contry

In [9]:
print(df_eu_ord)

           Entity Code  Year  Annual CO₂ emissions (per capita)
14508  Luxembourg  LUX  1974                          41.047718
14507  Luxembourg  LUX  1973                          40.593910
14504  Luxembourg  LUX  1970                          40.475006
14506  Luxembourg  LUX  1972                          38.988323
14503  Luxembourg  LUX  1969                          38.967620
...           ...  ...   ...                                ...
13855   Lithuania  LTU  1833                           0.000202
13347      Latvia  LVA  1830                           0.000174
13350      Latvia  LVA  1833                           0.000166
13854   Lithuania  LTU  1832                           0.000164
13349      Latvia  LVA  1832                           0.000135

[4179 rows x 4 columns]


# Select the Year 2022

In [10]:
df_ue_2022 = df_ue[df_ue['Year'] == 2022]

# Visualizza i primi risultati per verifica
print(df_ue_2022)


            Entity Code  Year  Annual CO₂ emissions (per capita)
2169       Austria  AUT  2022                           6.878194
3021       Belgium  BEL  2022                           7.687539
4367      Bulgaria  BGR  2022                           6.804453
6062       Croatia  HRV  2022                           4.348515
6290        Cyprus  CYP  2022                           5.616782
6736       Denmark  DNK  2022                           4.940161
7558       Estonia  EST  2022                           7.776280
9151       Finland  FIN  2022                           6.526740
9366        France  FRA  2022                           4.603891
9974       Germany  DEU  2022                           7.983758
10189       Greece  GRC  2022                           5.745106
11454      Hungary  HUN  2022                           4.449911
12148      Ireland  IRL  2022                           7.721119
12404        Italy  ITA  2022                           5.726825
13517       Latvia  LVA  

# Top 5 emitter and "other"

In [12]:
# Order by emission descrescent order
df_ue_2022_sorted = df_ue_2022.sort_values(by="Annual CO₂ emissions (per capita)", ascending=False)

# Select top 5 emitter 
top_5_emitter = df_ue_2022_sorted.head(5)

# Compute the sum of the "other" country in EU 
others_emission = df_ue_2022_sorted.iloc[5:]["Annual CO₂ emissions (per capita)"].sum()

# Create a new row with the sum  
others_row = pd.DataFrame({
    "Entity": ["Others"],
    "Code": ["OT"],
    "Year": [2022],
    "Annual CO₂ emissions (per capita)": [others_emission]
})

# Combine the Top-5 emitter and the other row 
df_top5_others = pd.concat([top_5_emitter, others_row], ignore_index=True)
# Order by emission
df_top5_others_ord = df_top5_others.sort_values(by="Annual CO₂ emissions (per capita)", ascending=False)


print(df_top5_others_ord)


       Entity Code  Year  Annual CO₂ emissions (per capita)
5      Others   OT  2022                         110.348244
0  Luxembourg  LUX  2022                          11.618432
1      Poland  POL  2022                           8.106886
2     Germany  DEU  2022                           7.983758
3     Estonia  EST  2022                           7.776280
4     Ireland  IRL  2022                           7.721119


In [14]:
df_top5_others_ord.to_csv("../datasets/top5-emitter-and-other-2022-eu.csv", index=False)

--------------