In [2]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import numpy as np

# Scrape the S&P 500 constituents table from Wikipedia and download each
# company's article. The result is `df`, one row per company, with columns
# 'Symbol', 'Name', 'Sector' and 'Wikipedia Entry' (the parsed article).

WIKI_BASE = 'https://en.wikipedia.org'
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks forever
REQUEST_PAUSE = 0.5    # seconds between company requests
# Wikipedia asks automated clients to identify themselves with a User-Agent.
REQUEST_HEADERS = {'User-Agent': 'SP500-infobox-notebook/0.1 (educational project; python-requests)'}

# Make the request to the Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
session = requests.Session()
session.headers.update(REQUEST_HEADERS)
response = session.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly instead of parsing an error page

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the constituents table. Prefer its id (presumably 'constituents' on
# the live page -- verify), then fall back to the original exact class match,
# which stops working as soon as the table carries any additional class.
table = soup.find('table', id='constituents')
if table is None:
    table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise RuntimeError('Could not find the S&P 500 constituents table on the page')
rows = table.find_all('tr')

# Find the sector column from the header row when possible; fall back to the
# original hard-coded position. Assumes every data cell in a row is a <td>,
# so header positions line up with `cells` -- verify against the live page.
header_texts = [th.text.strip() for th in rows[0].find_all('th')] if rows else []
sector_col = header_texts.index('GICS Sector') if 'GICS Sector' in header_texts else 3

data = []
for row in rows:
    cells = row.find_all('td')
    # Skip the header row and any row too short to hold the fields we read.
    if len(cells) <= max(1, sector_col):
        continue

    link = cells[1].find('a')
    if link is None or not link.get('title'):
        # No linked article for this row; nothing to download.
        continue

    symbol = cells[0].text.strip()
    name = link.get('title').replace(' ', '_')
    sector = cells[sector_col].text.strip()

    # Get the Wikipedia page for each company. The link's own href is already
    # percent-encoded, so prefer it over a URL rebuilt from the title.
    href = link.get('href') or ''
    company_url = f"{WIKI_BASE}{href}" if href.startswith('/wiki/') else f"{WIKI_BASE}/wiki/{name}"
    try:
        company_response = session.get(company_url, timeout=REQUEST_TIMEOUT)
        company_response.raise_for_status()
        company_soup = BeautifulSoup(company_response.content, 'html.parser')
    except requests.RequestException:
        # Keep the row but store an empty document, so that one failing page
        # does not abort the whole scrape; it simply has no infobox to parse.
        company_soup = BeautifulSoup('', 'html.parser')

    data.append({'Symbol': symbol, 'Name': name, 'Sector': sector, 'Wikipedia Entry': company_soup})
    # Pause between requests to avoid overloading the server.
    time.sleep(REQUEST_PAUSE)

# Create a Dataframe
df = pd.DataFrame(data)


In [2]:

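# Optional on-disk cache of the scrape (disabled). Note: a CSV round trip turns the
# parsed pages in 'Wikipedia Entry' into plain strings, so they would have to be
# re-parsed with BeautifulSoup before the infobox cell below can use them.
# The read path is machine-specific; adjust it before enabling.
#df.to_csv('SP500_Wikipedia.csv')
#df =pd.read_csv('C:/Users/gabri/OneDrive/Desktop/FH_Stuff/DSI/SP500_Wikipedia.csv')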

In [3]:
# Turn every company's parsed Wikipedia page into a cleaned label/data table
# taken from its infobox, serialised as JSON. `Dataframes` ends up with one
# JSON string per row of `df`, in the same order.

# Infobox rows whose trailing " (year)" style annotation is cut off.
FINANCIAL_STATS = ["Revenue", "Operating income", "Net income", "Total assets", "Total equity"]
# Placeholder stored for pages whose infobox is missing or cannot be parsed.
NO_INFO_JSON = pd.DataFrame({'label': ['no info'], 'data': ['missing']}).to_json()


def _infobox_to_frame(page):
    """Read the page's infobox into a DataFrame with columns 'label' and 'data'.

    Labels and values are paired row by row, so a table row lacking either
    cell is skipped instead of shifting every later label against the wrong
    value. A value cell containing a 'plainlist' div becomes a list of its
    <li> texts; any other value is kept as the cell's raw text.

    Raises:
        ValueError: if the page has no table with class 'infobox'.
    """
    infobox = page.find('table', {'class': 'infobox'})
    if infobox is None:
        raise ValueError('page has no infobox table')

    labels, values = [], []
    for row in infobox.find_all('tr'):
        # recursive=False: only this row's own cells, not those of nested tables
        # (rows of nested tables are visited separately by find_all above).
        label_cell = row.find('th', {'class': 'infobox-label'}, recursive=False)
        data_cell = row.find('td', {'class': 'infobox-data'}, recursive=False)
        if label_cell is None or data_cell is None:
            continue
        plainlists = data_cell.find_all('div', {'class': 'plainlist'})
        if len(plainlists) > 0:
            values.append([item.text for item in plainlists[0].find_all('li')])
        else:
            values.append(data_cell.text)
        labels.append(label_cell.text)

    # Force object dtype so individual cells can later hold Python lists.
    return pd.DataFrame({'label': labels, 'data': pd.Series(values, dtype=object)})


def _first_text_row(frame, label):
    """Return the position of the first row with this label whose data is still text.

    Returns None when the label is absent or its data is already a list.
    Taking the first hit keeps a repeated label from raising.
    """
    hits = np.flatnonzero((frame['label'] == label).to_numpy())
    if hits.size == 0:
        return None
    position = int(hits[0])
    if isinstance(frame['data'].iat[position], list):
        return None
    return position


def _employee_count(text):
    """Return the first run of digits (thousands separators removed), or the text unchanged if none."""
    found = re.findall(r'\d+', text.replace(',', ''))
    return found[0] if found else text


def _clean_infobox(frame):
    """Normalise selected infobox rows in place and return the frame."""
    cleaners = {
        # keep only the part before the first ';'
        'Founded': lambda text: text.split(';')[0],
        # comma separated values -> list (pieces keep their surrounding whitespace)
        'Headquarters': lambda text: text.split(','),
        'Divisions': lambda text: text.split(','),
        'Subsidiaries': lambda text: text.split(','),
        # Concatenated product names are split at every capital letter.
        # NOTE(review): this also breaks acronyms apart and drops any leading
        # lower-case text -- confirm this is acceptable for downstream use.
        'Products': lambda text: re.findall(r'[A-Z][^A-Z]*', text),
        'Number of employees': _employee_count,
    }
    for stat in FINANCIAL_STATS:
        # drop everything from the first " (" onwards
        cleaners[stat] = lambda text: text.split(' (')[0]

    data_col = frame.columns.get_loc('data')
    for label, clean in cleaners.items():
        position = _first_text_row(frame, label)
        if position is not None:
            frame.iat[position, data_col] = clean(frame.iat[position, data_col])
    return frame


Dataframes = []
failed_rows = []  # positions in df whose page could not be parsed
for position, page in enumerate(df['Wikipedia Entry']):
    try:
        Dataframes.append(_clean_infobox(_infobox_to_frame(page)).to_json())
    except Exception:
        # Best effort: one unparseable page must not stop the run. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        failed_rows.append(position)
        Dataframes.append(NO_INFO_JSON)

In [5]:
#import json as js
#save Dataframes[100] to a json file
#with open('Dataframes.json', 'w') as outfile:
#    js.dump(Dataframes[100], outfile)

#Dataframes[0]

'{"label":{"0":"Formerly","1":"Type","2":"Traded as","3":"ISIN","4":"Industry","5":"Founded","6":"Founders","7":"Headquarters","8":"Area served","9":"Key people","10":"Revenue","11":"Operating income","12":"Net income","13":"Total assets","14":"Total equity","15":"Number of employees","16":"Website"},"data":{"0":"Minnesota Mining and Manufacturing Company (1902\\u20132002)","1":"Public","2":["NYSE:\\u00a0MMM","DJIA component","S&P 100 component","S&P 500 component"],"3":"US88579Y1010","4":"Conglomerate","5":"June\\u00a013, 1902","6":["J. Danley Budd","Henry. S. Bryan","William A. McGonagle","John Dwan","Hermon W. Cable[2]"],"7":["Maplewood"," Minnesota"," U.S."],"8":"Worldwide","9":"Mike Roman (Chairman, President, & CEO)[3]","10":" US$35.36 billion","11":" US$7.37 billion","12":" US$5.92 billion","13":" US$47.07 billion","14":" US$15.05 billion","15":"95000","16":"3m.com"}}'

In [6]:
# Store each company's cleaned infobox (a JSON string, one per row, in the
# same order as df) in the 'Wikipedia Entry' column.
# Note: this overwrites the parsed pages in place, so the infobox-parsing cell
# above cannot be re-run afterwards without rebuilding df first.
df['Wikipedia Entry'] = Dataframes

In [7]:
# Show the resulting frame; as the cell's last expression it is rendered as a table.
df

Unnamed: 0,Symbol,Name,Sector,Wikipedia Entry
0,MMM,3M,Industrials,"{""label"":{""0"":""Formerly"",""1"":""Type"",""2"":""Trade..."
1,AOS,A._O._Smith,Industrials,"{""label"":{""0"":""Type"",""1"":""Traded as"",""2"":""Indu..."
2,ABT,Abbott_Laboratories,Health Care,"{""label"":{""0"":""Type"",""1"":""Traded as"",""2"":""Indu..."
3,ABBV,AbbVie,Health Care,"{""label"":{""0"":""Type"",""1"":""Traded as"",""2"":""Indu..."
4,ACN,Accenture,Information Technology,"{""label"":{""0"":""Formerly"",""1"":""Type"",""2"":""Trade..."
...,...,...,...,...
498,YUM,Yum!_Brands,Consumer Discretionary,"{""label"":{""0"":""Formerly"",""1"":""Type"",""2"":""Trade..."
499,ZBRA,Zebra_Technologies,Information Technology,"{""label"":{""0"":""Type"",""1"":""Traded as"",""2"":""Indu..."
500,ZBH,Zimmer_Biomet,Health Care,"{""label"":{""0"":""Formerly"",""1"":""Type"",""2"":""Trade..."
501,ZION,Zions_Bancorporation,Financials,"{""label"":{""0"":""Type"",""1"":""Traded as"",""2"":""Indu..."
