In [222]:
import pandas as pd
import datetime as dt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')
sns.set_theme(style= 'darkgrid')

In [223]:
df = pd.read_csv('cleaned_biggest_companies.csv')
df.head()

Unnamed: 0,Company Name,Country Founded,Year Founded,Revenue 2018,Revenue 2019,Revenue 2020,Net Income 2018,Net Income 2019,Net Income 2020,Industry
0,Amazon,United States,1994,232.9,280.5,386.1,21.3,21.3,21.3,Retail
1,Apple,United States,1976,265.6,260.2,274.5,59.5,55.2,111.4,Technology
2,Google,United States,1998,136.8,161.9,182.3,30.7,34.3,46.2,Technology
3,Microsoft,United States,1975,110.4,130.9,143.0,16.6,44.3,53.0,Technology
4,Alibaba,China,1999,39.9,56.1,70.2,12.9,14.9,11.9,Retail


In [224]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Company Name     274 non-null    object 
 1   Country Founded  274 non-null    object 
 2   Year Founded     274 non-null    int64  
 3   Revenue 2018     274 non-null    float64
 4   Revenue 2019     274 non-null    float64
 5   Revenue 2020     274 non-null    float64
 6   Net Income 2018  274 non-null    float64
 7   Net Income 2019  274 non-null    float64
 8   Net Income 2020  274 non-null    float64
 9   Industry         274 non-null    object 
dtypes: float64(6), int64(1), object(3)
memory usage: 21.5+ KB


In [225]:
# Top 5 industries containing some of the worlds biggest companies
df['Industry'].value_counts().head()

Banking               38
Technology            20
Telecommunications    20
Automotive            15
Oil and gas           14
Name: Industry, dtype: int64

In [255]:
revenue_df = df.melt(id_vars= 'Company Name', value_vars= ['Revenue 2018','Revenue 2019','Revenue 2020'])
income_df = df.melt(id_vars= 'Company Name', value_vars= ['Net Income 2018','Net Income 2019','Net Income 2020'])


In [227]:
income_df.head()

Unnamed: 0,Company Name,variable,value
0,Amazon,Net Income 2018,21.3
1,Apple,Net Income 2018,59.5
2,Google,Net Income 2018,30.7
3,Microsoft,Net Income 2018,16.6
4,Alibaba,Net Income 2018,12.9


In [256]:
revenue_df['variable'] = revenue_df['variable'].str.replace('Revenue ','',)
income_df['variable'] = income_df['variable'].str.replace('Net Income ','',)

In [257]:
revenue_df.sort_values(by='Company Name').head(6)

Unnamed: 0,Company Name,variable,value
726,ACS,2020,57.4
452,ACS,2019,56.8
178,ACS,2018,56.2
151,AMP,2018,16.3
699,AMP,2020,16.7
425,AMP,2019,16.5


In [258]:
income_df.sort_values(by='Company Name').head(6)

Unnamed: 0,Company Name,variable,value
726,ACS,2020,7.6
452,ACS,2019,6.6
178,ACS,2018,5.6
151,AMP,2018,1.3
699,AMP,2020,3.3
425,AMP,2019,2.3


In [259]:
revenue_df['variable'] = pd.to_datetime(revenue_df['variable'])
income_df['variable'] = pd.to_datetime(income_df['variable'])

In [260]:
df_new = pd.merge(revenue_df, income_df, left_on=['Company Name','variable'], right_on=['Company Name','variable'],)
df_new.sort_values(by=['Company Name','variable']).head(9)

Unnamed: 0,Company Name,variable,value_x,value_y
196,ACS,2018-01-01,56.2,5.6
490,ACS,2019-01-01,56.8,6.6
784,ACS,2020-01-01,57.4,7.6
168,AMP,2018-01-01,16.3,1.3
462,AMP,2019-01-01,16.5,2.3
756,AMP,2020-01-01,16.7,3.3
159,ANZ,2018-01-01,27.6,6.6
453,ANZ,2019-01-01,28.2,5.6
747,ANZ,2020-01-01,28.8,4.6


In [261]:
# Quick verification with above output
df.sort_values(by='Company Name').head(3)

Unnamed: 0,Company Name,Country Founded,Year Founded,Revenue 2018,Revenue 2019,Revenue 2020,Net Income 2018,Net Income 2019,Net Income 2020,Industry
178,ACS,Spain,1997,56.2,56.8,57.4,5.6,6.6,7.6,Construction and services
151,AMP,Australia,1849,16.3,16.5,16.7,1.3,2.3,3.3,Financial services
142,ANZ,Australia,1951,27.6,28.2,28.8,6.6,5.6,4.6,Banking


In [262]:
df_final = pd.merge(df_new, df[['Company Name','Country Founded','Year Founded', 'Industry']].copy(), on='Company Name')
df_final.sort_values(by=['Company Name','variable']).head(6)

Unnamed: 0,Company Name,variable,value_x,value_y,Country Founded,Year Founded,Industry
696,ACS,2018-01-01,56.2,5.6,Spain,1997,Construction and services
697,ACS,2019-01-01,56.8,6.6,Spain,1997,Construction and services
698,ACS,2020-01-01,57.4,7.6,Spain,1997,Construction and services
600,AMP,2018-01-01,16.3,1.3,Australia,1849,Financial services
601,AMP,2019-01-01,16.5,2.3,Australia,1849,Financial services
602,AMP,2020-01-01,16.7,3.3,Australia,1849,Financial services


In [263]:
df_final.rename(columns= {'variable': 'Year', 'value_x': 'Revenue', 'value_y': 'Net Income', 
    'Country Founded_x': 'Country Founded', 'Year Founded_x': 'Year Founded', 'Industry_x': 'Industry'}, inplace=True)
#df_final.drop(['Country Founded_y', 'Year Founded_y', 'Industry_y'], axis= 1, inplace=True)
df_final.head()

Unnamed: 0,Company Name,Year,Revenue,Net Income,Country Founded,Year Founded,Industry
0,Amazon,2018-01-01,232.9,21.3,United States,1994,Retail
1,Amazon,2019-01-01,280.5,21.3,United States,1994,Retail
2,Amazon,2020-01-01,386.1,21.3,United States,1994,Retail
3,Apple,2018-01-01,265.6,59.5,United States,1976,Technology
4,Apple,2019-01-01,260.2,55.2,United States,1976,Technology


In [264]:
df_final[['Company Name','Year', 'Revenue', 'Net Income']].drop_duplicates(keep= 'first',inplace=True)
df_final[['Company Name','Year','Revenue', 'Net Income']].duplicated().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final[['Company Name','Year', 'Revenue', 'Net Income']].drop_duplicates(keep= 'first',inplace=True)


161

In [265]:
df_final['Company Name'].value_counts().head(15)

Danske Bank             24
Johnson & Johnson       24
Unilever                24
BP                      24
Goldman Sachs           24
Chevron                 24
Nordea Bank             24
Royal Dutch Shell       24
MTN Group               24
Anheuser-Busch InBev    24
Itaú Unibanco            3
Banco Bradesco           3
Vale                     3
Samsung                  3
Banco do Brasil          3
Name: Company Name, dtype: int64

In [266]:
df_final[df_final['Company Name']== 'MTN Group']

Unnamed: 0,Company Name,Year,Revenue,Net Income,Country Founded,Year Founded,Industry
756,MTN Group,2018-01-01,11.6,1.5,South Africa,1994,Telecommunications
757,MTN Group,2018-01-01,11.6,1.5,Nigeria,1994,Telecommunications
758,MTN Group,2018-01-01,11.6,0.5,South Africa,1994,Telecommunications
759,MTN Group,2018-01-01,11.6,0.5,Nigeria,1994,Telecommunications
760,MTN Group,2018-01-01,3.3,1.5,South Africa,1994,Telecommunications
761,MTN Group,2018-01-01,3.3,1.5,Nigeria,1994,Telecommunications
762,MTN Group,2018-01-01,3.3,0.5,South Africa,1994,Telecommunications
763,MTN Group,2018-01-01,3.3,0.5,Nigeria,1994,Telecommunications
764,MTN Group,2019-01-01,11.9,1.6,South Africa,1994,Telecommunications
765,MTN Group,2019-01-01,11.9,1.6,Nigeria,1994,Telecommunications
