# WEB SCRAPING

#### *Luis Magalhaes*

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### TABLES

#### FROM WIKIPEDIA-LIKE SOURCES

In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
url = "https://en.wikipedia.org/wiki/List_of_national_capitals"
r = requests.get(url, headers=headers)

In [4]:
soup = BeautifulSoup(r.content, "html.parser")
table = soup.find_all('table')[1]
rows = table.find_all('tr')
row_list = list()

In [5]:
for tr in rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    row_list.append(row)

In [6]:
df_bs = pd.DataFrame(row_list,columns=['City','Country','Notes'])
df_bs.set_index('Country',inplace=True)
df_bs.to_csv('beautifulsoup.csv')

In [9]:
print(df_bs.shape)
df_bs.tail()

(263, 2)


Unnamed: 0_level_0,City,Notes
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Namibia,Windhoek,\n
Cameroon,Yaoundé,\n
Nauru,Yaren (de facto),Country does not have an official capital; how...
Armenia,Yerevan,\n
Croatia,Zagreb,\n


____

### ANYTHING (in this example, forum comments)

#### WITH SELENIUM

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time

In [11]:
url = 'https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p400'
driver = webdriver.Chrome()  # in the parenthesis we can specify the chromedriver path: (executable_path='chromedriver')
driver.get(url)

In [12]:
comments = pd.DataFrame(columns = ['Date','user_id','comments']) 
#ids = driver.find_elements_by_xpath("//*[contains(@id,'Comment_')]")
comment_ids = []

In [13]:
def retrieve_comments_ids():
    #cleaning the list for the next iteration
    comment_ids[:] = []
    ids = driver.find_elements_by_xpath("//*[contains(@id,'Comment_')]")
    for i in ids:
        comment_ids.append(i.get_attribute('id'))

def populate_data():
    ids = driver.find_elements_by_xpath("//*[contains(@id,'Comment_')]")
    for x in comment_ids:
        #Extract dates from for each user on a page
        user_date = driver.find_elements_by_xpath('//*[@id="' + x +'"]/div/div[2]/div[2]/span[1]/a/time')[0]
        date = user_date.get_attribute('title')

        #Extract user ids from each user on a page
        userid_element = driver.find_elements_by_xpath('//*[@id="' + x +'"]/div/div[2]/div[1]/span[1]/a[2]')[0]
        userid = userid_element.text

        #Extract Message for each user on a page
        user_message = driver.find_elements_by_xpath('//*[@id="' + x +'"]/div/div[3]/div/div[1]')[0]
        comment = user_message.text
                                   
        #Adding date, userid and comment for each user in a dataframe    
        comments.loc[len(comments)] = [date,userid,comment]

In [14]:
pageCounter = 400
while pageCounter < 405:
    retrieve_comments_ids()
    populate_data()
    driver.find_element_by_xpath('//body/div[1]/main/div/div[3]/div[3]/span/span/a[11]').click()
    
    time.sleep(10)
    pageCounter +=1
    print(pageCounter)

401
402
403
404
405


In [15]:
print(comments.shape)
comments.tail()

(250, 3)


Unnamed: 0,Date,user_id,comments
245,"May 4, 2016 8:39AM",qbrozen,Michaell said:\nshow previous quotes\nI read a...
246,"May 4, 2016 10:54AM",kyfdx,Spotted an STS-V on Monday. That has to be pre...
247,"May 4, 2016 11:44AM",roadburner,I don't know what Cadillac is going to do; the...
248,"May 4, 2016 2:19PM",laurasdada,"Perhaps review their pricing model, study Lexu..."
249,"May 4, 2016 3:16PM",roadburner,laurasdada said:\nPerhaps review their pricing...


### Now we can analyse the data collected. In this example, let's extract the frequency that each relevant word is used in these comments.

In [16]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [18]:
comments_text = comments['comments'].str.cat(sep=' ')

In [19]:
tokens = word_tokenize(comments_text)

In [20]:
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in tokens]

In [21]:
print(stemmed[:10])

['well', '.', 'could', 'be', 'someth', 'interest', 'to', 'consid', 'in', '2']


In [22]:
words = [word for word in stemmed if word.isalpha()]
stop_words = set(stopwords.words('english'))

In [23]:
words = [w for w in words if not w in stop_words]

In [24]:
freq = []
for w in words:
    freq.append(words.count(w))

In [25]:
word_freq = dict(zip(words, freq))

In [27]:
frequent_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

In [28]:
print('The most frequent relevant words that appear in these comments are:\n' + str(list(frequent_words)))

The most frequent relevant words that appear in these comments are:
[('I', 503), ('car', 202), ('wa', 116), ('said', 110), ('bmw', 89), ('would', 85), ('one', 83), ('get', 83), ('ha', 78), ('like', 74), ('thi', 74), ('mile', 64), ('good', 60), ('new', 57), ('engin', 57), ('time', 54), ('know', 54), ('It', 53), ('go', 53), ('think', 52), ('dealer', 51), ('also', 48), ('audi', 47), ('year', 45), ('want', 45), ('brake', 44), ('make', 43), ('realli', 43), ('seem', 43), ('ani', 43), ('point', 43), ('look', 41), ('well', 40), ('doe', 40), ('servic', 40), ('onli', 39), ('back', 38), ('use', 38), ('say', 37), ('could', 34), ('drive', 34), ('much', 34), ('replac', 34), ('price', 33), ('thing', 33), ('My', 32), ('veri', 32), ('cost', 32), ('shop', 32), ('number', 32), ('right', 30), ('part', 30), ('come', 30), ('leas', 30), ('front', 30), ('issu', 29), ('So', 29), ('vehicl', 28), ('even', 28), ('chip', 28), ('reliabl', 28), ('rear', 28), ('peopl', 27), ('If', 27), ('differ', 26), ('warranti', 26

### From such list, we can extract the most commented features related to the forum's theme.

#### By cross-referencing the words with their respective sentences (as groups of words) using ML algorithms, it's possible even to extract possible meanings and contexts for these mentions.