In [1]:
#Author: @michaelbrink
#Org: BalloonBox Inc.

In [2]:
from bs4 import BeautifulSoup
import requests
import json
from html.parser import HTMLParser
import urllib.request
import string
import random
import re
import pandas as pd
import numpy as np

In [3]:
#Need a JS enabled web service
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import selenium.webdriver.chrome.service as service
from selenium.webdriver.chrome.options import Options
import time

# Import the data files

In [4]:
# Load the three Forbes 2017-2020 exports; the first spreadsheet column is
# used as the row index of each frame.
df_founder, df_female, df_all = (
    pd.read_excel(path, index_col=0)
    for path in (
        'forbes2017-2020_ceofounder.xlsx',
        'forbes2017-2020_femaleceo.xlsx',
        'forbes2017-2020_nofilter.xlsx',
    )
)

## Data Cleaning

In [5]:
# Preview the first five rows of the unfiltered list.
df_all.head(n=5)

Unnamed: 0,Rank,Name,Revenue ($M),Revenue % change,Profit ($M),Profit % change,Assets ($M),Market Value ($M),Change in rank (1000),Employees,Change in rank (500),Year
0,1,Walmart,"$485,873",0.8%,"$13,643",-7.2%,"$198,825","$218,619",2300000,-,-,2017
1,2,Berkshire Hathaway,"$223,604",6.1%,"$24,074",0.0%,"$620,854","$411,035",367700,2,2,2017
2,3,Apple,"$215,639",-7.7%,"$45,687",-14.4%,"$321,686","$753,718",116000,-,-,2017
3,4,Exxon Mobil,"$205,004",-16.7%,"$7,840",-51.5%,"$330,314","$340,056",72700,-2,-2,2017
4,5,McKesson,"$192,487",6.2%,"$2,258",53%,"$56,563","$31,439",68000,-,-,2017


In [6]:
# In the 2017 rows the values of 'Employees' and 'Change in rank (1000)' sit
# in each other's column (the column order differs from the other years), so
# exchange the two columns for that year only.
# NOTE: the swap is unconditional -- running this cell a second time undoes it.
rows_2017 = df_all['Year'] == 2017
temp_rank_1000 = df_all.loc[rows_2017, 'Employees'].copy()
temp_employees = df_all.loc[rows_2017, 'Change in rank (1000)'].copy()
df_all.loc[rows_2017, 'Employees'] = temp_employees
df_all.loc[rows_2017, 'Change in rank (1000)'] = temp_rank_1000

In [7]:
# Tag every row with the source list it came from, stored in a new 'Cat'
# column, so the origin is still known after the frames are combined.
for frame, category in (
    (df_all, 'No filter'),
    (df_female, 'Female CEO'),
    (df_founder, 'Founder CEO'),
):
    frame['Cat'] = category

In [8]:
# Stack the three labelled frames row-wise into a single DataFrame.
# Each frame keeps its own index labels, so labels may repeat in the result.
df = pd.concat(objs=[df_all, df_female, df_founder], axis='index')
df.head()

Unnamed: 0,Rank,Name,Revenue ($M),Revenue % change,Profit ($M),Profit % change,Assets ($M),Market Value ($M),Change in rank (1000),Employees,Change in rank (500),Year,Cat
0,1,Walmart,"$485,873",0.8%,"$13,643",-7.2%,"$198,825","$218,619",-,2300000,-,2017,No filter
1,2,Berkshire Hathaway,"$223,604",6.1%,"$24,074",0.0%,"$620,854","$411,035",2,367700,2,2017,No filter
2,3,Apple,"$215,639",-7.7%,"$45,687",-14.4%,"$321,686","$753,718",-,116000,-,2017,No filter
3,4,Exxon Mobil,"$205,004",-16.7%,"$7,840",-51.5%,"$330,314","$340,056",-2,72700,-2,2017,No filter
4,5,McKesson,"$192,487",6.2%,"$2,258",53%,"$56,563","$31,439",-,68000,-,2017,No filter


In [9]:
# Rank: drop thousands separators, turn a lone '-' into 0, then parse as
# numbers (anything still unparsable becomes NaN via errors='coerce').
df['Rank'] = pd.to_numeric(
    df['Rank']
    .replace(to_replace='[,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [10]:
# Revenue ($M): strip '$' and thousands separators, turn a lone '-' into 0,
# then parse as numbers (anything still unparsable becomes NaN via
# errors='coerce').
# The character-class pattern is a raw string: '\$' is not a valid escape in a
# normal string literal and triggers an invalid-escape warning on recent
# Python versions. The regex itself is unchanged.
df['Revenue ($M)'] = pd.to_numeric(
    df['Revenue ($M)']
    .replace(to_replace=r'[\$,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [11]:
# Revenue % change: remove the '%' sign, turn a lone '-' into 0, then parse
# as numbers (anything still unparsable becomes NaN via errors='coerce').
df['Revenue % change'] = pd.to_numeric(
    df['Revenue % change']
    .replace(to_replace='%', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [12]:
# Profit ($M): strip '$' and thousands separators, turn a lone '-' into 0,
# then parse as numbers (anything still unparsable becomes NaN via
# errors='coerce').
# The character-class pattern is a raw string: '\$' is not a valid escape in a
# normal string literal and triggers an invalid-escape warning on recent
# Python versions. The regex itself is unchanged.
df['Profit ($M)'] = pd.to_numeric(
    df['Profit ($M)']
    .replace(to_replace=r'[\$,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [13]:
# Profit % change: remove the '%' sign, turn a lone '-' into 0, then parse
# as numbers (anything still unparsable becomes NaN via errors='coerce').
df['Profit % change'] = pd.to_numeric(
    df['Profit % change']
    .replace(to_replace='%', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [14]:
# Assets ($M): strip '$' and thousands separators, turn a lone '-' into 0,
# then parse as numbers (anything still unparsable becomes NaN via
# errors='coerce').
# The character-class pattern is a raw string: '\$' is not a valid escape in a
# normal string literal and triggers an invalid-escape warning on recent
# Python versions. The regex itself is unchanged.
df['Assets ($M)'] = pd.to_numeric(
    df['Assets ($M)']
    .replace(to_replace=r'[\$,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [15]:
# Market Value ($M): strip '$' and thousands separators, turn a lone '-' into
# 0, then parse as numbers (anything still unparsable becomes NaN via
# errors='coerce').
# The character-class pattern is a raw string: '\$' is not a valid escape in a
# normal string literal and triggers an invalid-escape warning on recent
# Python versions. The regex itself is unchanged.
df['Market Value ($M)'] = pd.to_numeric(
    df['Market Value ($M)']
    .replace(to_replace=r'[\$,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [16]:
# Change in rank (1000): drop thousands separators, turn a lone '-' into 0,
# then parse as numbers (anything still unparsable becomes NaN via
# errors='coerce').
df['Change in rank (1000)'] = pd.to_numeric(
    df['Change in rank (1000)']
    .replace(to_replace='[,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [17]:
# Employees: drop thousands separators, then parse as numbers. No '-' to 0
# substitution is applied here; entries that fail to parse become NaN
# (errors='coerce').
df['Employees'] = pd.to_numeric(
    df['Employees'].replace(to_replace='[,]', value='', regex=True),
    errors='coerce',
)

In [18]:
# Change in rank (500): drop thousands separators, turn a lone '-' into 0,
# then parse as numbers (anything still unparsable becomes NaN via
# errors='coerce').
df['Change in rank (500)'] = pd.to_numeric(
    df['Change in rank (500)']
    .replace(to_replace='[,]', value='', regex=True)
    .replace(to_replace='^-$', value='0', regex=True),
    errors='coerce',
)

In [19]:
# Sanity check after cleaning: print each column's dtype and non-null count
# to confirm the numeric conversions above took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4287 entries, 0 to 106
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Rank                   4287 non-null   int64  
 1   Name                   4287 non-null   object 
 2   Revenue ($M)           4287 non-null   float64
 3   Revenue % change       4287 non-null   float64
 4   Profit ($M)            4287 non-null   float64
 5   Profit % change        4287 non-null   float64
 6   Assets ($M)            4287 non-null   float64
 7   Market Value ($M)      4287 non-null   float64
 8   Change in rank (1000)  4287 non-null   int64  
 9   Employees              4287 non-null   int64  
 10  Change in rank (500)   4287 non-null   int64  
 11  Year                   4287 non-null   int64  
 12  Cat                    4287 non-null   object 
dtypes: float64(6), int64(5), object(2)
memory usage: 468.9+ KB


In [20]:
# Persist the cleaned, combined dataset to an Excel workbook in the working
# directory. The DataFrame index is written as the first column (to_excel
# default).
df.to_excel(excel_writer='cleaned_overall_dataset.xlsx')