In [4]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import numpy as np

# Make the request to the Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the relevant information
table = soup.find('table', {'class': 'wikitable sortable'})
rows = table.find_all('tr')
data = []
for row in rows:
    cells = row.find_all('td')
    if len(cells) > 0:
        symbol = cells[0].text.strip()
        name = cells[1].find('a').get('title').replace(' ','_')
        sector = cells[3].text.strip()
        # Get the Wikipedia page for each company
        company_url = f"https://en.wikipedia.org/wiki/{name}"
        company_response = requests.get(company_url)
        company_soup = BeautifulSoup(company_response.content, 'html.parser')
        #company_text = company_soup.get_text()
        data.append({'Symbol': symbol, 'Name': name, 'Sector': sector, 'Wikipedia Entry': company_soup})
        #set system sleep to 1 second to avoid overloading the server
        time.sleep(1)

# Create a Dataframe
df = pd.DataFrame(data)


In [5]:
#safe df to csv
df.to_csv('SP500_Wikipedia.csv')

In [None]:
Dataframes = []
for i in range(0, len(df)):   
    try:
        ############################################### Read Table from Wikipedia Page ###############################################
        comp = df.iloc[i]['Wikipedia Entry']

        #find all tables in the wikipedia page
        tables = comp.find_all('table')[0]
        #find the table with the class 'infobox'
        infobox = comp.find('table', {'class': 'infobox'})

        #find  class 'infobox-label' in the infobox
        infobox_label = infobox.find_all('th', {'class': 'infobox-label'})

        #find  class 'infobox-data' in the infobox
        infobox_data = infobox.find_all('td', {'class': 'infobox-data'})

        #label text
        infobox_label_text = [i.text for i in infobox_label]

        data_vec =[]
        #loop through all entries in the infobox
        for i in range(len(infobox_data)):
            if len(infobox_data[i].find_all('div', {'class': 'plainlist'})) > 0:
                l = infobox_data[i].find_all('div', {'class': 'plainlist'})
                l=[j.text for j in l[0].find_all('li')]
                data_vec.append(l)
            else:
                data_vec.append(infobox_data[i].text)

        DF_n = pd.DataFrame({'label': infobox_label_text, 'data': data_vec})

        ############################################### Clean Data Table ###############################################
        
        #founded
        if ((DF_n['label']=='Founded').any()) and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Founded')[0])]['data'],list):
            string = DF_n[DF_n['label']=='Founded']['data'].values[0]
            DF_n.loc[DF_n['label']=='Founded','data'] = string.split(';')[0]

        #headquarters
        if(DF_n['label']=='Headquarters').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Headquarters')[0])]['data'],list):
            string = DF_n[DF_n['label']=='Headquarters']['data'].values[0]
            str_split = string.split(',')
            #insert list in the dataframe
            DF_n.iat[int(np.where(DF_n['label']=='Headquarters')[0]),DF_n.columns.get_loc('data')] = str_split

        #products
        if(DF_n['label']=='Products').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Products')[0])]['data'],list):
            string = DF_n[DF_n['label']=='Products']['data'].values[0]
            #split st by a upper case letter
            str_split = re.findall(r'[A-Z][^A-Z]*', string)
            DF_n.iat[int(np.where(DF_n['label']=='Products')[0]),DF_n.columns.get_loc('data')] = str_split

        #division
        if(DF_n['label']=='Divisions').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Divisions')[0])]['data'],list):
            string = DF_n[DF_n['label']=='Divisions']['data'].values[0]
            #split st by a upper case letter
            str_split = string.split(',')
            DF_n.iat[int(np.where(DF_n['label']=='Divisions')[0]),DF_n.columns.get_loc('data')] = str_split

        #number of employees
        if(DF_n['label']=='Number of employees').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Number of employees')[0])]['data'],list):
            string = DF_n[DF_n['label']=='Number of employees']['data'].values[0]
            string = string.replace(',','')
            #find all numbers in the string
            numbers = re.findall(r'\d+', string)[0]
            DF_n.iat[int(np.where(DF_n['label']=='Number of employees')[0]),DF_n.columns.get_loc('data')] = numbers

        #networth
        stats = ["Revenue", "Operating income", "Net income", "Total assets", "Total equity"]

        for stat in stats:
            if(DF_n['label']==stat).any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']==stat)[0])]['data'],list):
                string = DF_n[DF_n['label']==stat]['data'].values[0]
                string = string.split(' (')[0]
                DF_n.iat[int(np.where(DF_n['label']==stat)[0]),DF_n.columns.get_loc('data')] = string
                
        #subsidaries
        if(DF_n['label']=='Subsidiaries').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Subsidiaries')[0])]['data'],list):
            string = DF_n[DF_n['label']=='Subsidiaries']['data'].values[0]
            #split st by a upper case letter
            str_split = string.split(',')
            #insert list in the dataframe
            DF_n.iat[int(np.where(DF_n['label']=='Subsidiaries')[0]),DF_n.columns.get_loc('data')] = str_split

            Dataframes.append(DF_n)
    except:
        print('error')
        Dataframes.append('error')
    finally:
        print("The 'try except' is finished")

In [272]:
    comp = df.iloc[400]['Wikipedia Entry']

    #find all tables in the wikipedia page
    tables = comp.find_all('table')[0]
    #find the table with the class 'infobox'
    infobox = comp.find('table', {'class': 'infobox'})

    #find  class 'infobox-label' in the infobox
    infobox_label = infobox.find_all('th', {'class': 'infobox-label'})

    #find  class 'infobox-data' in the infobox
    infobox_data = infobox.find_all('td', {'class': 'infobox-data'})

    #label text
    infobox_label_text = [i.text for i in infobox_label]

    data_vec =[]
    #loop through all entries in the infobox
    for i in range(len(infobox_data)):
        if len(infobox_data[i].find_all('div', {'class': 'plainlist'})) > 0:
            l = infobox_data[i].find_all('div', {'class': 'plainlist'})
            l=[j.text for j in l[0].find_all('li')]
            data_vec.append(l)
        else:
            data_vec.append(infobox_data[i].text)

    DF_n = pd.DataFrame({'label': infobox_label_text, 'data': data_vec})

In [270]:
Dataframes

[                  label                                               data
 0                  Type                                             Public
 1             Traded as                     [NYSE: AAP, S&P 500 component]
 2              Industry                                  Auto Parts Retail
 3               Founded                                     April 29, 1932
 4               Founder                                     Arthur Taubman
 5          Headquarters                  [Raleigh,  North Carolina,  U.S.]
 6   Number of locations  4,912 Advance Stores, 150 Worldpac branches an...
 7           Area served                                United StatesCanada
 8            Key people  Jeffrey C. SmithChair, Board of DirectorsTom G...
 9              Products       [Replacement automotive parts & accessories]
 10              Revenue                                   US$10.11 Billion
 11     Operating income                                   US$749.9 Million
 12         

In [248]:

import numpy as np

#founded
if ((DF_n['label']=='Founded').any()) and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Founded')[0])]['data'],list):
    string = DF_n[DF_n['label']=='Founded']['data'].values[0]
    DF_n.loc[DF_n['label']=='Founded','data'] = string.split(';')[0]

#headquarters
if(DF_n['label']=='Headquarters').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Headquarters')[0])]['data'],list):
    string = DF_n[DF_n['label']=='Headquarters']['data'].values[0]
    str_split = string.split(',')
    #insert list in the dataframe
    DF_n.iat[int(np.where(DF_n['label']=='Headquarters')[0]),DF_n.columns.get_loc('data')] = str_split

#products
if(DF_n['label']=='Products').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Products')[0])]['data'],list):
    string = DF_n[DF_n['label']=='Products']['data'].values[0]
    #split st by a upper case letter
    str_split = re.findall(r'[A-Z][^A-Z]*', string)
    DF_n.iat[int(np.where(DF_n['label']=='Products')[0]),DF_n.columns.get_loc('data')] = str_split

#division
if(DF_n['label']=='Divisions').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Divisions')[0])]['data'],list):
    string = DF_n[DF_n['label']=='Divisions']['data'].values[0]
    #split st by a upper case letter
    str_split = string.split(',')
    DF_n.iat[int(np.where(DF_n['label']=='Divisions')[0]),DF_n.columns.get_loc('data')] = str_split

#number of employees
if(DF_n['label']=='Number of employees').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Number of employees')[0])]['data'],list):
    string = DF_n[DF_n['label']=='Number of employees']['data'].values[0]
    string = string.replace(',','')
    #find all numbers in the string
    numbers = re.findall(r'\d+', string)[0]
    DF_n.iat[int(np.where(DF_n['label']=='Number of employees')[0]),DF_n.columns.get_loc('data')] = numbers

#networth
stats = ["Revenue", "Operating income", "Net income", "Total assets", "Total equity"]

for stat in stats:
    if(DF_n['label']==stat).any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']==stat)[0])]['data'],list):
        string = DF_n[DF_n['label']==stat]['data'].values[0]
        string = string.split(' (')[0]
        DF_n.iat[int(np.where(DF_n['label']==stat)[0]),DF_n.columns.get_loc('data')] = string
        
#subsidaries
if(DF_n['label']=='Subsidiaries').any() and not isinstance(DF_n.loc[int(np.where(DF_n['label']=='Subsidiaries')[0])]['data'],list):
    string = DF_n[DF_n['label']=='Subsidiaries']['data'].values[0]
    #split st by a upper case letter
    str_split = string.split(',')
    #insert list in the dataframe
    DF_n.iat[int(np.where(DF_n['label']=='Subsidiaries')[0]),DF_n.columns.get_loc('data')] = str_split

DF_n


Unnamed: 0,label,data
0,Type,Public
1,Traded as,"[NYSE: IP, S&P 500 component]"
2,Industry,Pulp and paper
3,Founded,1898
4,Headquarters,"[Memphis, Tennessee, United States]"
5,Area served,Worldwide
6,Key people,Mark Sutton(Chairman & CEO)
7,Revenue,US$20.580 billion
8,Operating income,US$1.849 billion
9,Net income,US$482 million


In [246]:
string = DF_n[DF_n['label']=='Subsidiaries']['data'].values[0]
    #split st by a upper case letter
str_split = string.split(',')
str_split
    #insert list in the dataframe
DF_n.iat[int(np.where(DF_n['label']=='Headquarters')[0]),DF_n.columns.get_loc('data')] = str_split
DF_n

Unnamed: 0,label,data
0,Type,Public
1,Traded as,"[NYSE: IP, S&P 500 component]"
2,Industry,Pulp and paper
3,Founded,1898
4,Headquarters,"[Temple-Inland, Hammermill Paper Company, Ol..."
5,Area served,Worldwide
6,Key people,Mark Sutton(Chairman & CEO)
7,Revenue,US$20.580 billion
8,Operating income,US$1.849 billion
9,Net income,US$482 million
