In [1]:
# Packages import
import pandas as pd
import time
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator

In [2]:
import plotly.express as px
import plotly.graph_objects as go   
from plotly.subplots import make_subplots

## DATA COLLECTION ##

### 1.1 Raw Company Data 
***Source:Knoema Environment Data Atlas (free sample loaded from snowflake marketplace)***

In [3]:
#Load data
raw_data = pd.read_csv('data/company.csv')

In [4]:
# Display basic information about the DataFrame
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 36 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   COMPANY NAME                                    304 non-null    object 
 1   REPORTING YEAR                                  304 non-null    int64  
 2   NITROGEN                                        112 non-null    float64
 3   SULPHUR                                         116 non-null    float64
 4   AIR CADMIUM                                     0 non-null      float64
 5   AIR MERCURY                                     0 non-null      float64
 6   ANNUAL REVENUE                                  301 non-null    float64
 7   HEADCOUNT                                       302 non-null    float64
 8   SCOPE 1 EMISSIONS TOTAL                         285 non-null    object 
 9   SCOPE 2 EMISSION TOTAL                     

In [5]:
# Display summary statistics
raw_data.describe()

Unnamed: 0,REPORTING YEAR,NITROGEN,SULPHUR,AIR CADMIUM,AIR MERCURY,ANNUAL REVENUE,HEADCOUNT,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,COAL,...,TOTAL NON HAZARDOUS WASTE GENERATED,WASTE COMPOSTED,RECOVERED HAZARDOUS WASTE,RECOVERED NON HAZARDOUS WASTE,DISPOSED TOTAL WASTE,DISPOSED HAZARDOUS WASTE,WITHDRAWAL TOTAL WATER,RECYCLED WATER OR REUSE WATER OR TREATED WATER,CONSUMPTION TOTAL WATER,WITHDRAWAL TOTAL FRESHWATER
count,304.0,112.0,116.0,0.0,0.0,301.0,302.0,194.0,215.0,29.0,...,145.0,19.0,80.0,93.0,31.0,52.0,81.0,58.0,185.0,64.0
mean,2018.611842,43180.064821,28381.298103,,,148450.840628,252729.9,4345110.0,78791420.0,21337320.0,...,364700.3,4679.705223,56294.858875,347101.5,45223.354839,80587.01,39657690.0,17429700.0,53915210.0,110128000.0
std,1.789405,57252.126219,43435.595689,,,103548.350097,359342.6,6705319.0,309421900.0,42203060.0,...,623617.0,2401.505364,60266.484475,663168.0,52853.559222,204573.5,52276500.0,28779460.0,134432400.0,131216800.0
min,2016.0,0.0,1.0,,,15222.11415,2274.0,2000.0,833.0,0.0,...,272.17,874.0,4.84,0.18272,5000.0,0.5,0.0,0.0,246859.0,0.0
25%,2017.0,221.0,18.75,,,79507.66423,86250.0,622457.0,2337916.0,3708.33,...,25000.0,2856.789297,7231.75,14000.0,9390.0,2589.0,5043000.0,545307.3,3509332.0,10910000.0
50%,2019.0,1296.0,458.5,,,116627.8245,148768.5,2010000.0,9570000.0,37083.4,...,102961.0,4858.09,44025.5,72728.0,14700.0,15450.0,14010000.0,2250000.0,10150000.0,52339000.0
75%,2020.0,100725.0,47250.0,,,197683.9559,298676.0,5267432.0,24143330.0,833333.0,...,335080.0,5901.5,86000.0,240000.0,54296.0,80225.0,51710000.0,16476500.0,44000000.0,184625000.0
max,2022.0,200190.0,204000.0,,,572754.0,2300000.0,41910000.0,4166667000.0,108017800.0,...,3048510.0,10104.0,300000.0,2982922.0,185650.0,1396000.0,224190000.0,103540000.0,800050000.0,470000000.0


In [6]:
# View the first few rows of the DataFrame
raw_data.head()

Unnamed: 0,COMPANY NAME,REPORTING YEAR,NITROGEN,SULPHUR,AIR CADMIUM,AIR MERCURY,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,...,TOTAL NON HAZARDOUS WASTE GENERATED,WASTE COMPOSTED,RECOVERED HAZARDOUS WASTE,RECOVERED NON HAZARDOUS WASTE,DISPOSED TOTAL WASTE,DISPOSED HAZARDOUS WASTE,WITHDRAWAL TOTAL WATER,RECYCLED WATER OR REUSE WATER OR TREATED WATER,CONSUMPTION TOTAL WATER,WITHDRAWAL TOTAL FRESHWATER
0,Samsung Electronics Co,2016,635.0,196.0,,,174211.0009,308745.0,2554000,9046000.0,...,728905.0,,,,,,,48602000.0,104253000.0,
1,Samsung Electronics Co,2017,685.0,69.0,,,212024.2078,320671.0,3668000,9917000.0,...,760463.0,,,,,,,56154000.0,120618000.0,
2,Samsung Electronics Co,2018,616.0,55.0,,,221588.2162,309630.0,4855000,10318000.0,...,813831.0,,,,,,,62371000.0,134230000.0,
3,Samsung Electronics Co,2019,661.0,13.0,,,197683.9559,287439.0,5067000,8733000.0,...,777570.0,,,,,,,68555000.0,134479000.0,
4,Samsung Electronics Co,2020,652.0,6.0,,,200606.179,267937.0,5726000,9079000.0,...,835875.0,,,,,,,70181000.0,142294000.0,


Raw dataset consists mostly of null values which so we will have to drop most of the columns and then look for the data elsewhere

In [7]:
#drop the columns where more than 2/3 of data is unavailable 
data_cleaned = raw_data.dropna(thresh=100, axis=1)
data_cleaned

Unnamed: 0,COMPANY NAME,REPORTING YEAR,NITROGEN,SULPHUR,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,NATURAL GAS,TOTAL WATER DISCHARGE OR WASTEWATER GENERATION,TOTAL WASTE GENERATED,TOTAL HAZARDOUS WASTE GENERATED,TOTAL NON HAZARDOUS WASTE GENERATED,CONSUMPTION TOTAL WATER
0,Samsung Electronics Co,2016,635.0,196.0,174211.0009,308745.0,2554000,9046000.0,21073000.0,,81716000.0,1078310.0,349404.0,728905.0,104253000.0
1,Samsung Electronics Co,2017,685.0,69.0,212024.2078,320671.0,3668000,9917000.0,23419000.0,,95919000.0,1146810.0,386349.0,760463.0,120618000.0
2,Samsung Electronics Co,2018,616.0,55.0,221588.2162,309630.0,4855000,10318000.0,26028000.0,,107699000.0,1210520.0,396690.0,813831.0,134230000.0
3,Samsung Electronics Co,2019,661.0,13.0,197683.9559,287439.0,5067000,8733000.0,26899000.0,,108460000.0,1099197.0,321627.0,777570.0,134479000.0
4,Samsung Electronics Co,2020,652.0,6.0,200606.1790,267937.0,5726000,9079000.0,29024000.0,,109201000.0,1181741.0,345866.0,835875.0,142294000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,Shell Plc,2017,107000.0,81000.0,305179.0000,86000.0,73000000,,269000000.0,,,2020000.0,638000.0,1382000.0,154000000.0
300,Shell Plc,2018,111000.0,74000.0,388379.0000,82000.0,71000000,,268000000.0,,,1999000.0,592000.0,1407000.0,147000000.0
301,Shell Plc,2019,108000.0,65000.0,344877.0000,87000.0,70000000,,264000000.0,,,2113000.0,698000.0,1414000.0,145000000.0
302,Shell Plc,2020,118000.0,36000.0,180543.0000,87000.0,63000000,,241000000.0,,,2049000.0,558000.0,1491000.0,127000000.0


In [8]:
data_cleaned.isna().sum()

COMPANY NAME                                        0
REPORTING YEAR                                      0
NITROGEN                                          192
SULPHUR                                           188
ANNUAL REVENUE                                      3
HEADCOUNT                                           2
SCOPE 1 EMISSIONS TOTAL                            19
SCOPE 2 EMISSION TOTAL                            110
CONSUMPTION TOTAL ENERGY                           89
NATURAL GAS                                       180
TOTAL WATER DISCHARGE OR WASTEWATER GENERATION    201
TOTAL WASTE GENERATED                             192
TOTAL HAZARDOUS WASTE GENERATED                   169
TOTAL NON HAZARDOUS WASTE GENERATED               159
CONSUMPTION TOTAL WATER                           119
dtype: int64

### 1.2 Add manually researched data

Since we are trying to predict company's Scope 1 and Scope 2 emissions, we cannot have any null values in these columns. Since we do not have many rows to begin with, we cannot drop many compamnies. Below is the csv with the emissions values which were researched and inputed manuallty

In [9]:
#impute missing emissions, total energy consumption and total waste with data found manually (added and removed some companies due to data availability)
manual_data = pd.read_csv('data/manual_input.csv', index_col = None)
manual_data

Unnamed: 0,Year,COMPANY_NAME,Country,NUMBER_OF_EMPLOYEES,SCOPE_1,SCOPE_2,SCOPE 1+2,SCOPE 3,ENERGY_CONSUMPTION_TOTAL,TOTAL_WASTE_GENERATED,ISIC Division
0,2016,ABBVIE INC,United States,28939,305000.0,305000.0,610000.0,Not reported,2085000.0,35700.0,Human health and social work activities
1,2017,ABBVIE INC,United States,29777,299000.0,284000.0,584000.0,Not reported,2051000.0,32800.0,Human health and social work activities
2,2018,ABBVIE INC,United States,30612,315000.0,271000.0,585000.0,Not reported,2146000.0,32000.0,Human health and social work activities
3,2019,ABBVIE INC,United States,30776,315000.0,250000.0,564000.0,Not reported,2112000.0,33400.0,Human health and social work activities
4,2020,ABBVIE INC,United States,47000,265000.0,243000.0,508000.0,Not reported,1978000.0,30600.0,Human health and social work activities
...,...,...,...,...,...,...,...,...,...,...,...
305,2017,HP,United States,49000,165138.0,641983.0,807121.0,Reported,807122.0,34910.0,Information and communication
306,2018,HP,United States,55000,164075.0,594823.0,758898.0,Not reported,758898.0,38900.0,Information and communication
307,2019,HP,United States,56000,61900.0,153900.0,215800.0,Reported,663374.0,17466.0,Information and communication
308,2020,HP,United States,53000,50600.0,120400.0,171000.0,Reported,604901.0,20260.0,Information and communication


In [10]:
# Merge data frames
merged_data = pd.merge(data_cleaned,manual_data,
                       right_on=['COMPANY_NAME', 'Year'],
                       left_on=['COMPANY NAME', 'REPORTING YEAR'],
                       how='outer')

In [11]:
merged_data.columns

Index(['COMPANY NAME', 'REPORTING YEAR', 'NITROGEN', 'SULPHUR',
       'ANNUAL REVENUE', 'HEADCOUNT', 'SCOPE 1 EMISSIONS TOTAL',
       'SCOPE 2 EMISSION TOTAL', 'CONSUMPTION TOTAL ENERGY', 'NATURAL GAS',
       'TOTAL WATER DISCHARGE OR WASTEWATER GENERATION',
       'TOTAL WASTE GENERATED', 'TOTAL HAZARDOUS WASTE GENERATED',
       'TOTAL NON HAZARDOUS WASTE GENERATED', 'CONSUMPTION TOTAL WATER',
       'Year', 'COMPANY_NAME', 'Country', 'NUMBER_OF_EMPLOYEES', 'SCOPE_1',
       'SCOPE_2', 'SCOPE 1+2', 'SCOPE 3', 'ENERGY_CONSUMPTION_TOTAL',
       'TOTAL_WASTE_GENERATED', 'ISIC Division'],
      dtype='object')

In [12]:
# Fill the data in duplicated columns

# Fill NaN values in COMPANY_NAME and Year columns with values from respective tables
merged_data['COMPANY NAME'] = merged_data['COMPANY NAME'].combine_first(merged_data['COMPANY_NAME'])
merged_data['REPORTING YEAR'] = merged_data['REPORTING YEAR'].combine_first(merged_data['Year'])

# Fill NaN values for Scopes
merged_data['SCOPE 1 EMISSIONS TOTAL'] =  merged_data['SCOPE_1'].combine_first(merged_data['SCOPE 1 EMISSIONS TOTAL'])
merged_data['SCOPE 2 EMISSION TOTAL'] = merged_data['SCOPE_2'].combine_first(merged_data['SCOPE 2 EMISSION TOTAL'])

# Fill NaN values for Energy and Water consumptions 
merged_data['CONSUMPTION TOTAL ENERGY'] =  merged_data['ENERGY_CONSUMPTION_TOTAL'].combine_first(merged_data['CONSUMPTION TOTAL ENERGY'])

# Fill NaN values for Headcount
merged_data['HEADCOUNT'] =  merged_data['NUMBER_OF_EMPLOYEES'].combine_first(merged_data['HEADCOUNT'])



In [13]:
# Drop duplicate columns
merged_data = merged_data.drop(['COMPANY_NAME', 'Year','SCOPE_1', 'SCOPE_2', 'SCOPE 1+2','ENERGY_CONSUMPTION_TOTAL', 'NUMBER_OF_EMPLOYEES'], axis=1)
merged_data

Unnamed: 0,COMPANY NAME,REPORTING YEAR,NITROGEN,SULPHUR,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,NATURAL GAS,TOTAL WATER DISCHARGE OR WASTEWATER GENERATION,TOTAL WASTE GENERATED,TOTAL HAZARDOUS WASTE GENERATED,TOTAL NON HAZARDOUS WASTE GENERATED,CONSUMPTION TOTAL WATER,Country,SCOPE 3,TOTAL_WASTE_GENERATED,ISIC Division
0,Samsung Electronics Co,2016.0,635.0,196.0,174211.0009,277000.0,2554000.0,9556000.0,21073000.0,,81716000.0,1078310.0,349404.0,728905.0,104253000.0,South Korea,Not reported,1078310.0,Manufacturing
1,Samsung Electronics Co,2017.0,685.0,69.0,212024.2078,282000.0,3704500.0,9405500.0,23419000.0,,95919000.0,1146810.0,386349.0,760463.0,120618000.0,South Korea,Not reported,1146810.0,Manufacturing
2,Samsung Electronics Co,2018.0,616.0,55.0,221588.2162,287000.0,4855000.0,11125000.0,26028000.0,,107699000.0,1210520.0,396690.0,813831.0,134230000.0,South Korea,Not reported,1210520.0,Manufacturing
3,Samsung Electronics Co,2019.0,661.0,13.0,197683.9559,292000.0,5067000.0,10933000.0,26899000.0,,108460000.0,1099197.0,321627.0,777570.0,134479000.0,South Korea,Not reported,1099197.0,Manufacturing
4,Samsung Electronics Co,2020.0,652.0,6.0,200606.1790,297000.0,5726000.0,11504000.0,29024000.0,,109201000.0,1181741.0,345866.0,835875.0,142294000.0,South Korea,Not reported,1181741.0,Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,HP,2017.0,,,,49000.0,165138.0,641983.0,807122.0,,,,,,,United States,Reported,34910.0,Information and communication
337,HP,2018.0,,,,55000.0,164075.0,594823.0,758898.0,,,,,,,United States,Not reported,38900.0,Information and communication
338,HP,2019.0,,,,56000.0,61900.0,153900.0,663374.0,,,,,,,United States,Reported,17466.0,Information and communication
339,HP,2020.0,,,,53000.0,50600.0,120400.0,604901.0,,,,,,,United States,Reported,20260.0,Information and communication


In [14]:
#check the state of new table
merged_data.isna().sum()

COMPANY NAME                                        0
REPORTING YEAR                                      0
NITROGEN                                          229
SULPHUR                                           225
ANNUAL REVENUE                                     40
HEADCOUNT                                           2
SCOPE 1 EMISSIONS TOTAL                            19
SCOPE 2 EMISSION TOTAL                             22
CONSUMPTION TOTAL ENERGY                           88
NATURAL GAS                                       217
TOTAL WATER DISCHARGE OR WASTEWATER GENERATION    238
TOTAL WASTE GENERATED                             229
TOTAL HAZARDOUS WASTE GENERATED                   206
TOTAL NON HAZARDOUS WASTE GENERATED               196
CONSUMPTION TOTAL WATER                           156
Country                                            31
SCOPE 3                                            31
TOTAL_WASTE_GENERATED                             148
ISIC Division               

In [15]:
#drop rows where Scope 1 and Scope 2 were not found in manual input 
comapny_df = merged_data.dropna(subset = ['SCOPE 1 EMISSIONS TOTAL', 'SCOPE 2 EMISSION TOTAL'])

In [16]:
#drop the columns where more than half of rows are unknown
company_clean_df = comapny_df.dropna(thresh=150, axis=1)


In [17]:
#save the resulting table to csv 
company_clean_df.to_csv('company_alldata.csv')

In [18]:
company_clean_df

Unnamed: 0,COMPANY NAME,REPORTING YEAR,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,CONSUMPTION TOTAL WATER,Country,SCOPE 3,TOTAL_WASTE_GENERATED,ISIC Division
0,Samsung Electronics Co,2016.0,174211.0009,277000.0,2554000.0,9556000.0,21073000.0,104253000.0,South Korea,Not reported,1078310.0,Manufacturing
1,Samsung Electronics Co,2017.0,212024.2078,282000.0,3704500.0,9405500.0,23419000.0,120618000.0,South Korea,Not reported,1146810.0,Manufacturing
2,Samsung Electronics Co,2018.0,221588.2162,287000.0,4855000.0,11125000.0,26028000.0,134230000.0,South Korea,Not reported,1210520.0,Manufacturing
3,Samsung Electronics Co,2019.0,197683.9559,292000.0,5067000.0,10933000.0,26899000.0,134479000.0,South Korea,Not reported,1099197.0,Manufacturing
4,Samsung Electronics Co,2020.0,200606.1790,297000.0,5726000.0,11504000.0,29024000.0,142294000.0,South Korea,Not reported,1181741.0,Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...,...
336,HP,2017.0,,49000.0,165138.0,641983.0,807122.0,,United States,Reported,34910.0,Information and communication
337,HP,2018.0,,55000.0,164075.0,594823.0,758898.0,,United States,Not reported,38900.0,Information and communication
338,HP,2019.0,,56000.0,61900.0,153900.0,663374.0,,United States,Reported,17466.0,Information and communication
339,HP,2020.0,,53000.0,50600.0,120400.0,604901.0,,United States,Reported,20260.0,Information and communication


### 2.1 Eurostat Data on country level emissions

In [19]:
world_df = pd.read_csv('data/AIR_GHG_16062023122534338.csv')

In [20]:
emissions_country

Unnamed: 0,COU,Country,POL,Pollutant,VAR,Variable,YEA,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,2016,2016,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,552354.829,,
1,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,2017,2017,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,559581.111,,
2,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,2018,2018,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,560827.414,,
3,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,2019,2019,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,555244.931,,
4,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,2020,2020,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,536739.718,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13713,MLT,Malta,GHG,Greenhouse gases,GHG_GDP_LULU,Total GHG incl. LULUCF per unit of GDP,2017,2017,,,0,Units,,,0.106,,
13714,MLT,Malta,GHG,Greenhouse gases,GHG_GDP_LULU,Total GHG incl. LULUCF per unit of GDP,2018,2018,,,0,Units,,,0.100,,
13715,MLT,Malta,GHG,Greenhouse gases,GHG_GDP_LULU,Total GHG incl. LULUCF per unit of GDP,2019,2019,,,0,Units,,,0.100,,
13716,MLT,Malta,GHG,Greenhouse gases,GHG_GDP_LULU,Total GHG incl. LULUCF per unit of GDP,2020,2020,,,0,Units,,,0.108,,


In [21]:
emissions_country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13718 entries, 0 to 13717
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   COU                    13718 non-null  object 
 1   Country                13718 non-null  object 
 2   POL                    13718 non-null  object 
 3   Pollutant              13718 non-null  object 
 4   VAR                    13718 non-null  object 
 5   Variable               13718 non-null  object 
 6   YEA                    13718 non-null  int64  
 7   Year                   13718 non-null  int64  
 8   Unit Code              13128 non-null  object 
 9   Unit                   13128 non-null  object 
 10  PowerCode Code         13718 non-null  int64  
 11  PowerCode              13718 non-null  object 
 12  Reference Period Code  0 non-null      float64
 13  Reference Period       0 non-null      float64
 14  Value                  13718 non-null  float64
 15  Fl

In [22]:
# Pivot the DataFrame
pivot_emissions = emissions_country.pivot_table(index=['COU', 'Country', 'Year'],
                          columns='Variable',
                          values='Value',
                          aggfunc='first').reset_index()

In [23]:
pivot_emissions.columns

Index(['COU', 'Country', 'Year', '1 - Energy', '1A1 - Energy Industries',
       '1A2 - Manufacturing industries and construction', '1A3 - Transport',
       '1A4 - Residential and other sectors',
       '1A4 - Residential and other sectors\t', '1A5 - Energy - Other',
       '1B - Fugitive Emissions from Fuels',
       '1C - CO2 from Transport and Storage',
       '2- Industrial processes and product use', '3 - Agriculture',
       '5 - Waste', '6 - Other',
       'Land use, land-use change and forestry (LULUCF)',
       'Total  emissions excluding LULUCF',
       'Total  emissions including LULUCF',
       'Total GHG excl. LULUCF per capita',
       'Total GHG excl. LULUCF per unit of GDP',
       'Total GHG excl. LULUCF, Index 1990=100',
       'Total GHG excl. LULUCF, Index 2000=100',
       'Total GHG incl. LULUCF per capita',
       'Total GHG incl. LULUCF per unit of GDP'],
      dtype='object', name='Variable')

In [24]:
pivot_emissions['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Bulgaria', 'Belarus', 'Brazil',
       'Canada', 'Switzerland', 'Chile', 'Colombia', 'Costa Rica',
       'Cyprus', 'Czech Republic', 'Germany', 'Denmark', 'Spain',
       'Estonia', 'European Union – 27 countries (from 01/02/2020)',
       'Finland', 'France', 'United Kingdom', 'Greece', 'Croatia',
       'Hungary', 'Indonesia', 'India', 'Ireland', 'Iceland', 'Israel',
       'Italy', 'Japan', 'Kazakhstan', 'Korea', 'Liechtenstein',
       'Lithuania', 'Luxembourg', 'Latvia', 'Monaco', 'Mexico', 'Malta',
       'Netherlands', 'Norway', 'New Zealand', 'OECD - Total',
       'OECD America', 'OECD Asia Oceania', 'OECD - Europe', 'Peru',
       'Poland', 'Portugal', 'Romania', 'Russia', 'Slovak Republic',
       'Slovenia', 'Sweden', 'Türkiye', 'Ukraine', 'United States'],
      dtype=object)

In [25]:
company_clean_df['Country'].unique()

array(['South Korea', nan, 'Saudi Arabia', 'Japan', 'Taiwan', 'China',
       'United States', 'Germany', 'France', 'Switzerland', 'UK',
       'Russia', 'India', 'United Kingdom', 'Italy'], dtype=object)

In [26]:
#map the countries to avoid mismatch where naming is slightly different 

# country_mapping dictionary
country_mapping = {
    'South Korea': 'Korea',
    'Saudi Arabia': 'Saudi Arabia',
    'Japan': 'Japan',
    'Taiwan': 'Taiwan',
    'China': 'China',
    'United States': 'United States',
    'Germany': 'Germany',
    'France': 'France',
    'Switzerland': 'Switzerland',
    'UK': 'United Kingdom',
    'Russia': 'Russia',
    'India': 'India',
    'United Kingdom': 'United Kingdom',
    'Italy': 'Italy'
}

# Replace values in the 'Country' column of company_clean_df
company_clean_df['Country'] = company_clean_df['Country'].replace(country_mapping)

company_clean_df['Country'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  company_clean_df['Country'] = company_clean_df['Country'].replace(country_mapping)


array(['Korea', nan, 'Saudi Arabia', 'Japan', 'Taiwan', 'China',
       'United States', 'Germany', 'France', 'Switzerland',
       'United Kingdom', 'Russia', 'India', 'Italy'], dtype=object)

In [27]:
pivot_emissions

Variable,COU,Country,Year,1 - Energy,1A1 - Energy Industries,1A2 - Manufacturing industries and construction,1A3 - Transport,1A4 - Residential and other sectors,1A4 - Residential and other sectors\t,1A5 - Energy - Other,...,6 - Other,"Land use, land-use change and forestry (LULUCF)",Total emissions excluding LULUCF,Total emissions including LULUCF,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 1990=100","Total GHG excl. LULUCF, Index 2000=100",Total GHG incl. LULUCF per capita,Total GHG incl. LULUCF per unit of GDP
0,AUS,Australia,2016,430847.154,219696.632,41457.597,96247.370,22803.514,4.128,1107.535,...,,-39871.822,552354.829,512483.007,22.833,0.480,126.092,87.235,21.185,0.445
1,AUS,Australia,2017,433232.808,218630.996,40684.881,97873.732,23572.811,4.213,923.790,...,,-49771.570,559581.111,509809.541,22.745,0.473,127.742,88.843,20.722,0.431
2,AUS,Australia,2018,435570.379,214859.149,41667.747,100147.870,23452.846,4.182,928.399,...,,-46601.058,560827.414,514226.357,22.449,0.464,128.026,89.258,20.583,0.425
3,AUS,Australia,2019,434362.688,213954.254,41604.536,100204.512,21961.994,3.955,791.730,...,,-49387.874,555244.931,505857.057,21.890,0.459,126.752,84.409,19.943,0.419
4,AUS,Australia,2020,418708.570,207918.766,41705.481,93177.677,21377.314,3.983,947.097,...,,-42506.757,536739.718,494232.961,20.890,0.434,122.527,83.014,19.236,0.400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,USA,United States,2017,5418740.173,1907565.706,640424.560,1783274.957,532266.388,8.112,170722.800,...,,-774196.008,6561824.440,5787628.432,20.183,0.347,101.148,94.210,17.801,0.306
305,USA,United States,2018,5589457.285,1932518.915,661188.426,1815592.687,591959.631,8.763,186879.601,...,,-765129.760,6754831.648,5989701.887,20.667,0.347,104.123,95.616,18.326,0.307
306,USA,United States,2019,5460638.447,1782330.918,664058.419,1820657.452,600230.642,9.070,183462.148,...,,-704045.088,6617916.876,5913871.788,20.156,0.332,102.013,94.843,18.012,0.297
307,USA,United States,2020,4893979.965,1607383.150,622490.837,1574166.590,548542.614,9.103,172900.413,...,,-776168.335,6025973.613,5249805.278,18.178,0.311,92.888,91.689,15.836,0.271


In [28]:
#drop the rows for years before 2016
emissions_years = pivot_emissions[pivot_emissions['Year']>=2016]

In [29]:
emissions_years

Variable,COU,Country,Year,1 - Energy,1A1 - Energy Industries,1A2 - Manufacturing industries and construction,1A3 - Transport,1A4 - Residential and other sectors,1A4 - Residential and other sectors\t,1A5 - Energy - Other,...,6 - Other,"Land use, land-use change and forestry (LULUCF)",Total emissions excluding LULUCF,Total emissions including LULUCF,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 1990=100","Total GHG excl. LULUCF, Index 2000=100",Total GHG incl. LULUCF per capita,Total GHG incl. LULUCF per unit of GDP
0,AUS,Australia,2016,430847.154,219696.632,41457.597,96247.370,22803.514,4.128,1107.535,...,,-39871.822,552354.829,512483.007,22.833,0.480,126.092,87.235,21.185,0.445
1,AUS,Australia,2017,433232.808,218630.996,40684.881,97873.732,23572.811,4.213,923.790,...,,-49771.570,559581.111,509809.541,22.745,0.473,127.742,88.843,20.722,0.431
2,AUS,Australia,2018,435570.379,214859.149,41667.747,100147.870,23452.846,4.182,928.399,...,,-46601.058,560827.414,514226.357,22.449,0.464,128.026,89.258,20.583,0.425
3,AUS,Australia,2019,434362.688,213954.254,41604.536,100204.512,21961.994,3.955,791.730,...,,-49387.874,555244.931,505857.057,21.890,0.459,126.752,84.409,19.943,0.419
4,AUS,Australia,2020,418708.570,207918.766,41705.481,93177.677,21377.314,3.983,947.097,...,,-42506.757,536739.718,494232.961,20.890,0.434,122.527,83.014,19.236,0.400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,USA,United States,2017,5418740.173,1907565.706,640424.560,1783274.957,532266.388,8.112,170722.800,...,,-774196.008,6561824.440,5787628.432,20.183,0.347,101.148,94.210,17.801,0.306
305,USA,United States,2018,5589457.285,1932518.915,661188.426,1815592.687,591959.631,8.763,186879.601,...,,-765129.760,6754831.648,5989701.887,20.667,0.347,104.123,95.616,18.326,0.307
306,USA,United States,2019,5460638.447,1782330.918,664058.419,1820657.452,600230.642,9.070,183462.148,...,,-704045.088,6617916.876,5913871.788,20.156,0.332,102.013,94.843,18.012,0.297
307,USA,United States,2020,4893979.965,1607383.150,622490.837,1574166.590,548542.614,9.103,172900.413,...,,-776168.335,6025973.613,5249805.278,18.178,0.311,92.888,91.689,15.836,0.271


In [30]:
total_country_emissions = emissions_years[['Country', 'Year', 'Total GHG incl. LULUCF per capita',
       'Total GHG incl. LULUCF per unit of GDP', 'Total  emissions including LULUCF',
       'Total GHG excl. LULUCF per capita',
       'Total GHG excl. LULUCF per unit of GDP','Total GHG excl. LULUCF, Index 2000=100']]

In [31]:
total_country_emissions

Variable,Country,Year,Total GHG incl. LULUCF per capita,Total GHG incl. LULUCF per unit of GDP,Total emissions including LULUCF,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 2000=100"
0,Australia,2016,21.185,0.445,512483.007,22.833,0.480,87.235
1,Australia,2017,20.722,0.431,509809.541,22.745,0.473,88.843
2,Australia,2018,20.583,0.425,514226.357,22.449,0.464,89.258
3,Australia,2019,19.943,0.419,505857.057,21.890,0.459,84.409
4,Australia,2020,19.236,0.400,494232.961,20.890,0.434,83.014
...,...,...,...,...,...,...,...,...
304,United States,2017,17.801,0.306,5787628.432,20.183,0.347,94.210
305,United States,2018,18.326,0.307,5989701.887,20.667,0.347,95.616
306,United States,2019,18.012,0.297,5913871.788,20.156,0.332,94.843
307,United States,2020,15.836,0.271,5249805.278,18.178,0.311,91.689


In [32]:
#change the columns type for merging
company_clean_df['Year'] = company_clean_df['REPORTING YEAR'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  company_clean_df['Year'] = company_clean_df['REPORTING YEAR'].astype(int)


In [33]:
#get rid of old columns
company_clean_df.drop(columns ='REPORTING YEAR', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  company_clean_df.drop(columns ='REPORTING YEAR', inplace=True)


In [34]:
#merge the country level emissions data with companies on the company HQ 
country_company = pd.merge(company_clean_df, total_country_emissions, 
                        on=['Country', 'Year'],
                        how='left') 

In [35]:
country_company

Unnamed: 0,COMPANY NAME,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,CONSUMPTION TOTAL WATER,Country,SCOPE 3,TOTAL_WASTE_GENERATED,ISIC Division,Year,Total GHG incl. LULUCF per capita,Total GHG incl. LULUCF per unit of GDP,Total emissions including LULUCF,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 2000=100"
0,Samsung Electronics Co,174211.0009,277000.0,2554000.0,9556000.0,21073000.0,104253000.0,Korea,Not reported,1078310.0,Manufacturing,2016,12.634,0.325,647071.270,13.542,0.348,97.841
1,Samsung Electronics Co,212024.2078,282000.0,3704500.0,9405500.0,23419000.0,120618000.0,Korea,Not reported,1146810.0,Manufacturing,2017,13.009,0.325,668185.970,13.838,0.346,100.079
2,Samsung Electronics Co,221588.2162,287000.0,4855000.0,11125000.0,26028000.0,134230000.0,Korea,Not reported,1210520.0,Manufacturing,2018,13.278,0.324,684958.960,14.094,0.344,100.567
3,Samsung Electronics Co,197683.9559,292000.0,5067000.0,10933000.0,26899000.0,134479000.0,Korea,Not reported,1099197.0,Manufacturing,2019,12.785,0.306,661820.070,13.549,0.325,98.780
4,Samsung Electronics Co,200606.1790,297000.0,5726000.0,11504000.0,29024000.0,142294000.0,Korea,Not reported,1181741.0,Manufacturing,2020,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,HP,,49000.0,165138.0,641983.0,807122.0,,United States,Reported,34910.0,Information and communication,2017,17.801,0.306,5787628.432,20.183,0.347,94.210
315,HP,,55000.0,164075.0,594823.0,758898.0,,United States,Not reported,38900.0,Information and communication,2018,18.326,0.307,5989701.887,20.667,0.347,95.616
316,HP,,56000.0,61900.0,153900.0,663374.0,,United States,Reported,17466.0,Information and communication,2019,18.012,0.297,5913871.788,20.156,0.332,94.843
317,HP,,53000.0,50600.0,120400.0,604901.0,,United States,Reported,20260.0,Information and communication,2020,15.836,0.271,5249805.278,18.178,0.311,91.689


In [36]:
country_company.isna().sum()

COMPANY NAME                                0
ANNUAL REVENUE                             38
HEADCOUNT                                   1
SCOPE 1 EMISSIONS TOTAL                     0
SCOPE 2 EMISSION TOTAL                      0
CONSUMPTION TOTAL ENERGY                   73
CONSUMPTION TOTAL WATER                   140
Country                                     9
SCOPE 3                                     9
TOTAL_WASTE_GENERATED                     126
ISIC Division                               9
Year                                        0
Total GHG incl. LULUCF per capita          57
Total GHG incl. LULUCF per unit of GDP     57
Total  emissions including LULUCF          57
Total GHG excl. LULUCF per capita          57
Total GHG excl. LULUCF per unit of GDP     57
Total GHG excl. LULUCF, Index 2000=100     57
dtype: int64

In [37]:
# manually input missing ISIC division
#print which companies are missing the data 
country_company[country_company['ISIC Division'].isna()]


fill_dictionary = {
    'SAUDI ARAMCO': 'Mining and quarrying',
    'JOHNSON&JOHNSON': 'Manufacturing',
    'Petrobras': 'Mining and quarrying',
    'ALIBABA GRP-ADR': 'Wholesale and retail trade; repair of motor vehicles and motorcycles',
    'ICBC H': 'Financial and insurance activities'
}

country_company['ISIC Division'] = country_company.apply(
    lambda row: fill_dictionary.get(row['COMPANY NAME']) if pd.isna(row['ISIC Division']) else row['ISIC Division'],
    axis=1)

In [38]:
country_company.isna().sum()

COMPANY NAME                                0
ANNUAL REVENUE                             38
HEADCOUNT                                   1
SCOPE 1 EMISSIONS TOTAL                     0
SCOPE 2 EMISSION TOTAL                      0
CONSUMPTION TOTAL ENERGY                   73
CONSUMPTION TOTAL WATER                   140
Country                                     9
SCOPE 3                                     9
TOTAL_WASTE_GENERATED                     126
ISIC Division                               0
Year                                        0
Total GHG incl. LULUCF per capita          57
Total GHG incl. LULUCF per unit of GDP     57
Total  emissions including LULUCF          57
Total GHG excl. LULUCF per capita          57
Total GHG excl. LULUCF per unit of GDP     57
Total GHG excl. LULUCF, Index 2000=100     57
dtype: int64

In [39]:
#check 57 rows where ghg data was not found 
na_rows = country_company[country_company['Total GHG incl. LULUCF per capita'].isna()]
na_rows

Unnamed: 0,COMPANY NAME,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,CONSUMPTION TOTAL WATER,Country,SCOPE 3,TOTAL_WASTE_GENERATED,ISIC Division,Year,Total GHG incl. LULUCF per capita,Total GHG incl. LULUCF per unit of GDP,Total emissions including LULUCF,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 2000=100"
4,Samsung Electronics Co,200606.179,297000.0,5726000.0,11504000.0,29024000.0,142294000.0,Korea,Not reported,1181741.0,Manufacturing,2020,,,,,,
5,Samsung Electronics Co,244427.2226,302000.0,7604000.0,12566000.0,32322000.0,163660000.0,Korea,Not reported,1324972.0,Manufacturing,2021,,,,,,
6,SAUDI ARAMCO,264176.0,70762.0,42037852.0,14267101.0,,,,,,Mining and quarrying,2017,,,,,,
7,SAUDI ARAMCO,359204.0,67947.0,46600000.0,14700000.0,,,,,,Mining and quarrying,2018,,,,,,
8,SAUDI ARAMCO,329809.0,82000.0,52000000.0,19000000.0,,,Saudi Arabia,Not reported,232000.0,Mining and quarrying,2019,,,,,,
9,SAUDI ARAMCO,229891.0,83900.0,50200000.0,18100000.0,,,Saudi Arabia,Not reported,231000.0,Mining and quarrying,2020,,,,,,
10,SAUDI ARAMCO,400468.0,85800.0,55300000.0,15500000.0,,,Saudi Arabia,Not reported,158000.0,Mining and quarrying,2021,,,,,,
17,SOFTBANK GROUP CORP,50017.22938,59721.0,12052.0,708450.0,2262843.0,,Japan,Not reported,,Information and communication,2022,,,,,,
24,TAIWAN SEMICONDUCTOR,28091.61873,46968.0,2035510.0,5378443.0,9848000.0,42000000.0,Taiwan,Not reported,298761.0,Information and communication,2016,,,,,,
25,TAIWAN SEMICONDUCTOR,29074.08327,48602.0,2075928.0,6080212.0,12016000.0,48900000.0,Taiwan,Not reported,369745.0,Information and communication,2017,,,,,,


### 2.2 Country Income level 

In [40]:
df_income = pd.read_csv('/Users/elizavetabugaeva/Documents/Spiced/final_project/data-XHzgJ.csv')
df_income = df_income.drop(columns ='Lending category' )

In [41]:
# Replace value 'Korea, Rep.' with 'South Korea' in the 'Country' column
country_company['Country']= country_company['Country'].replace('Korea', 'Korea, Rep.')


In [42]:
# Convert the 'df_income' dataframe to a long format
df_income_long = pd.melt(df_income, 
                         id_vars=['Country', 'Income group', 'Region'], 
                         var_name='Year', 
                         value_name='Income')

# Convert 'Year' to integer for merging
df_income_long['Year'] = df_income_long['Year'].astype(int)

In [43]:
df_income_long

Unnamed: 0,Country,Income group,Region,Year,Income
0,Aruba,High income,Latin America & Caribbean,1987,
1,Afghanistan,Low income,South Asia,1987,
2,Angola,Lower middle income,Sub-Saharan Africa,1987,670.0
3,Albania,Upper middle income,Europe & Central Asia,1987,730.0
4,Andorra,High income,Europe & Central Asia,1987,
...,...,...,...,...,...
7373,Kosovo,Upper middle income,Europe & Central Asia,2020,4440.0
7374,"Yemen, Rep.",Low income,Middle East & North Africa,2020,
7375,South Africa,Upper middle income,Sub-Saharan Africa,2020,5410.0
7376,Zambia,Lower middle income,Sub-Saharan Africa,2020,1190.0


In [44]:
#merge to the country_company
company_country_inc = pd.merge(country_company,df_income_long, 
                               on = ['Country', 'Year'], how = 'left')

In [45]:
#print the total dataframe
company_country_inc

Unnamed: 0,COMPANY NAME,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,CONSUMPTION TOTAL WATER,Country,SCOPE 3,TOTAL_WASTE_GENERATED,...,Year,Total GHG incl. LULUCF per capita,Total GHG incl. LULUCF per unit of GDP,Total emissions including LULUCF,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 2000=100",Income group,Region,Income
0,Samsung Electronics Co,174211.0009,277000.0,2554000.0,9556000.0,21073000.0,104253000.0,"Korea, Rep.",Not reported,1078310.0,...,2016,12.634,0.325,647071.270,13.542,0.348,97.841,High income,East Asia & Pacific,29330.0
1,Samsung Electronics Co,212024.2078,282000.0,3704500.0,9405500.0,23419000.0,120618000.0,"Korea, Rep.",Not reported,1146810.0,...,2017,13.009,0.325,668185.970,13.838,0.346,100.079,High income,East Asia & Pacific,30300.0
2,Samsung Electronics Co,221588.2162,287000.0,4855000.0,11125000.0,26028000.0,134230000.0,"Korea, Rep.",Not reported,1210520.0,...,2018,13.278,0.324,684958.960,14.094,0.344,100.567,High income,East Asia & Pacific,32730.0
3,Samsung Electronics Co,197683.9559,292000.0,5067000.0,10933000.0,26899000.0,134479000.0,"Korea, Rep.",Not reported,1099197.0,...,2019,12.785,0.306,661820.070,13.549,0.325,98.780,High income,East Asia & Pacific,33790.0
4,Samsung Electronics Co,200606.1790,297000.0,5726000.0,11504000.0,29024000.0,142294000.0,"Korea, Rep.",Not reported,1181741.0,...,2020,,,,,,,High income,East Asia & Pacific,32860.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,HP,,49000.0,165138.0,641983.0,807122.0,,United States,Reported,34910.0,...,2017,17.801,0.306,5787628.432,20.183,0.347,94.210,High income,North America,59460.0
315,HP,,55000.0,164075.0,594823.0,758898.0,,United States,Not reported,38900.0,...,2018,18.326,0.307,5989701.887,20.667,0.347,95.616,High income,North America,63510.0
316,HP,,56000.0,61900.0,153900.0,663374.0,,United States,Reported,17466.0,...,2019,18.012,0.297,5913871.788,20.156,0.332,94.843,High income,North America,65910.0
317,HP,,53000.0,50600.0,120400.0,604901.0,,United States,Reported,20260.0,...,2020,15.836,0.271,5249805.278,18.178,0.311,91.689,High income,North America,


In [46]:
company_country_inc.isna().sum()

COMPANY NAME                                0
ANNUAL REVENUE                             38
HEADCOUNT                                   1
SCOPE 1 EMISSIONS TOTAL                     0
SCOPE 2 EMISSION TOTAL                      0
CONSUMPTION TOTAL ENERGY                   73
CONSUMPTION TOTAL WATER                   140
Country                                     9
SCOPE 3                                     9
TOTAL_WASTE_GENERATED                     126
ISIC Division                               0
Year                                        0
Total GHG incl. LULUCF per capita          57
Total GHG incl. LULUCF per unit of GDP     57
Total  emissions including LULUCF          57
Total GHG excl. LULUCF per capita          57
Total GHG excl. LULUCF per unit of GDP     57
Total GHG excl. LULUCF, Index 2000=100     57
Income group                               73
Region                                     73
Income                                    107
dtype: int64

In [47]:
company_country_inc['Income group'].unique()

array(['High income', nan, 'Upper middle income', 'Lower middle income'],
      dtype=object)

In [48]:
na_income = company_country_inc[company_country_inc['Income group'].isna()]

In [49]:
na_income['Country'].unique()

array(['Korea, Rep.', nan, 'Saudi Arabia', 'Japan', 'Taiwan', 'China',
       'United States', 'Germany', 'France', 'Switzerland',
       'United Kingdom', 'Russia', 'India', 'Italy'], dtype=object)

In [50]:
#fill unmatched values 
# Define the conditions for filling with 'Upper middle income'
conditions = company_country_inc['Country'].isin(['Russia', 'India', 'Taiwan', 'China'])

# Fill missing 'Income group' values based on conditions
company_country_inc.loc[conditions, 'Income group'] = 'Upper middle income'
company_country_inc.loc[~conditions, 'Income group'] = 'High Income'

### 3. Scrape financials

In [51]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [52]:
import requests
from bs4 import BeautifulSoup

In [53]:
head = {'User_Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0'}


In [54]:
companies = company_country_inc['COMPANY NAME']

In [56]:
def get_market_cap(soup):
    table_rows = soup.select('tr')
    data = []
    for row in table_rows:
        tds = row.select('td')
        if len(tds) >= 2:
            year = tds[0].text.strip()
            market_cap = tds[1].text.strip()
            data.append([year, market_cap])
    return data

def get_revenue(soup):
    table_rows = soup.select('tr')
    data = []
    for row in table_rows:
        tds = row.select('td')
        if len(tds) >= 2:
            year = tds[0].text.strip()
            revenue = tds[1].text.strip()
            data.append([year, revenue])
    return data

def get_total_assets(soup):
    table_rows = soup.select('tr')
    data = []
    for row in table_rows:
        tds = row.select('td')
        if len(tds) >= 2:
            year = tds[0].text.strip()
            assets = tds[1].text.strip()
            data.append([year, assets])
    return data

all_data = []

for company in companies:
    base_url = f'https://companiesmarketcap.com/{company.lower().replace(" ", "-")}'

    # Market Cap
    response = requests.get(base_url + '/marketcap/', headers = head)
    soup = BeautifulSoup(response.text, 'html.parser')
    market_cap_data = get_market_cap(soup)

    # Revenue
    response = requests.get(base_url + '/revenue/',headers = head)
    soup = BeautifulSoup(response.text, 'html.parser')
    revenue_data = get_revenue(soup)

    # Total Assets
    response = requests.get(base_url + '/total-assets/',headers = head)
    soup = BeautifulSoup(response.text, 'html.parser')
    assets_data = get_total_assets(soup)

    # Combine data
    for mc, rev, assets in zip(market_cap_data, revenue_data, assets_data):
        all_data.append([company] + mc + rev[1:] + assets[1:])

df = pd.DataFrame(all_data, columns=['Company', 'Year', 'Market Cap', 'Revenue', 'Total Assets'])


KeyboardInterrupt: 

In [None]:
df.to_csv('company_data.csv', index=False)

In [57]:
df = pd.read_csv('company_data.csv')

In [58]:
#drop all the rows with irrelevant year data 
fins = df[df['Year'].isin(['2016', '2017', '2018', '2019', '2020', '2021', '2022'])]

In [59]:
fins

Unnamed: 0,Company,Year,Market Cap,Revenue,Total Assets
1,SAUDI ARAMCO,2022,$1.881 T,$604.17 B,$576.72 B
2,SAUDI ARAMCO,2021,$1.908 T,$400.74 B,$510.78 B
3,SAUDI ARAMCO,2020,$2.053 T,$229.97 B,$398.60 B
4,SAUDI ARAMCO,2019,$1.880 T,$331.19 B,$358.83 B
7,SAUDI ARAMCO,2022,$1.881 T,$604.17 B,$576.72 B
...,...,...,...,...,...
2024,HP,2020,$31.71 B,$56.63 B,$33.46 B
2025,HP,2019,$29.86 B,$58.75 B,$34.62 B
2026,HP,2018,$31.78 B,$58.47 B,$32.91 B
2027,HP,2017,$34.56 B,$52.05 B,$29.01 B


In [60]:
#convert years to int
fins['Year'] = fins['Year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fins['Year'] = fins['Year'].astype(int)


In [61]:
fins['Company'].unique()

array(['SAUDI ARAMCO', 'TENCENT', 'China Construction Bank',
       'CHINA MOBILE', 'CVS HEALTH', 'DEUTSCHE TELEKOM', 'Petrobras',
       'WALMART', 'ALLIANZ', 'APPLE', 'AXA', 'BP', 'PFIZER',
       'Reliance Industries', 'ENI', 'Puma', 'Unilever', 'Adobe', 'HP'],
      dtype=object)

In [63]:
company_country_inc['COMPANY NAME'].unique()

array(['Samsung Electronics Co', 'SAUDI ARAMCO', 'SOFTBANK GROUP CORP',
       'SONY CORP', 'TAIWAN SEMICONDUCTOR', 'TENCENT', 'CHEVRON CORP',
       'China Construction Bank', 'CHINA MOBILE', 'COMCAST CORP A (NEW)',
       'CVS HEALTH', 'DAIMLER', 'DEUTSCHE TELEKOM', 'JOHNSON&JOHNSON',
       'JPMORGAN CHASE & CO', 'LVMH MOET HENNE', 'MICROSOFT CORP',
       'NESTLE SA-REG', 'Petrobras', 'Petrochina', 'TOTAL',
       'TOYOTA MOTOR CORP', 'UNITEDHEALTH GRP', 'VERIZON COMMUNICATIONS',
       'VOLKSWAGEN VORZUG', 'WALMART', 'ABBVIE INC', 'ALIBABA GRP-ADR',
       'ALLIANZ', 'ALPHABET INC-A', 'AMAZON.COM INC', 'APPLE', 'AT&T',
       'AXA', 'BANK OF AMERICA CORP', 'BERKSHIRE HATHAWAY B', 'BMW STAMM',
       'BP', 'EXXON MOBIL CORP', 'FACEBOOK INC-A', 'FORD MOTOR CO',
       'HSBC HOLDINGS (GB)', 'ICBC H', 'INTEL CORP', 'PFIZER',
       'PING AN INS A (HK-C)', 'PJSC Gazprom', 'PROCTER & GAMBLE',
       'Reliance Industries', 'Shell Plc', 'ENI', 'Puma', 'Zalando SE',
       'Unilever', 'Ado

In [None]:
#merge to our data 
fin_merged = pd.merge(company_country_inc,fins, 
                      left_on =['COMPANY NAME', 'Year'], right_on =['Company', 'Year'], how = 'left')

In [None]:
fin_merged

Unnamed: 0,COMPANY NAME,ANNUAL REVENUE,HEADCOUNT,SCOPE 1 EMISSIONS TOTAL,SCOPE 2 EMISSION TOTAL,CONSUMPTION TOTAL ENERGY,CONSUMPTION TOTAL WATER,Country,SCOPE 3,TOTAL_WASTE_GENERATED,...,Total GHG excl. LULUCF per capita,Total GHG excl. LULUCF per unit of GDP,"Total GHG excl. LULUCF, Index 2000=100",Income group,Region,Income,Company,Market Cap,Revenue,Total Assets
0,Samsung Electronics Co,174211.0009,277000.0,2554000.0,9556000.0,21073000.0,104253000.0,"Korea, Rep.",Not reported,1078310.0,...,13.542,0.348,97.841,High Income,East Asia & Pacific,29330.0,,,,
1,Samsung Electronics Co,212024.2078,282000.0,3704500.0,9405500.0,23419000.0,120618000.0,"Korea, Rep.",Not reported,1146810.0,...,13.838,0.346,100.079,High Income,East Asia & Pacific,30300.0,,,,
2,Samsung Electronics Co,221588.2162,287000.0,4855000.0,11125000.0,26028000.0,134230000.0,"Korea, Rep.",Not reported,1210520.0,...,14.094,0.344,100.567,High Income,East Asia & Pacific,32730.0,,,,
3,Samsung Electronics Co,197683.9559,292000.0,5067000.0,10933000.0,26899000.0,134479000.0,"Korea, Rep.",Not reported,1099197.0,...,13.549,0.325,98.780,High Income,East Asia & Pacific,33790.0,,,,
4,Samsung Electronics Co,200606.1790,297000.0,5726000.0,11504000.0,29024000.0,142294000.0,"Korea, Rep.",Not reported,1181741.0,...,,,,High Income,East Asia & Pacific,32860.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,HP,,51000.0,48700.0,110800.0,697058.0,,United States,Reported,20760.0,...,19.103,0.309,89.861,High Income,,,HP,$40.78 B,$63.48 B,$34.68 B
855,HP,,51000.0,48700.0,110800.0,697058.0,,United States,Reported,20760.0,...,19.103,0.309,89.861,High Income,,,HP,$40.78 B,$63.48 B,$34.68 B
856,HP,,51000.0,48700.0,110800.0,697058.0,,United States,Reported,20760.0,...,19.103,0.309,89.861,High Income,,,HP,$40.78 B,$63.48 B,$34.68 B
857,HP,,51000.0,48700.0,110800.0,697058.0,,United States,Reported,20760.0,...,19.103,0.309,89.861,High Income,,,HP,$40.78 B,$63.48 B,$34.68 B


In [None]:
fin_merged.isna().sum()

COMPANY NAME                                0
ANNUAL REVENUE                            188
HEADCOUNT                                   1
SCOPE 1 EMISSIONS TOTAL                     0
SCOPE 2 EMISSION TOTAL                      0
CONSUMPTION TOTAL ENERGY                  206
CONSUMPTION TOTAL WATER                   440
Country                                    15
SCOPE 3                                    15
TOTAL_WASTE_GENERATED                     340
ISIC Division                               0
Year                                        0
Total GHG incl. LULUCF per capita         201
Total GHG incl. LULUCF per unit of GDP    201
Total  emissions including LULUCF         201
Total GHG excl. LULUCF per capita         201
Total GHG excl. LULUCF per unit of GDP    201
Total GHG excl. LULUCF, Index 2000=100    201
Income group                                0
Region                                    176
Income                                    257
Company                           