# Spanish Verb Scraper

## Objectives
- Extract Verbs from webpage
- Create a list that combines all verbs from the different URLs
- Reshape the list to create arrays
- Store data in DataFrame 

In [1]:
# Import necessary libraries

from bs4 import BeautifulSoup
import requests
import random
import numpy as np
import pandas as pd


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common import exceptions

## Extract Verbs from webpage

In [3]:
def scraper(url):
    """Return the HTML of ``url`` as seen by a Selenium-driven Chrome browser.

    A new Chrome session is started for every call and is always shut down
    before the function returns, so repeated calls do not pile up browser
    windows and chromedriver processes.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    str
        ``driver.page_source`` of the loaded page.
    """
    # The chromedriver executable is resolved through webdriver_manager
    service = Service(ChromeDriverManager().install())

    # The page is loaded with Selenium's Chrome webdriver (not with requests)
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Close the browser even when navigation fails; otherwise every call
        # would leave a Chrome instance running
        driver.quit()



In [4]:
# Fetch the Verbling discussion page and parse its HTML into a soup
url = 'https://www.verbling.com/fr/discussion/2000-verbos-populares-en-espanol-en-conjugador-reverso-dale-'
page_html = scraper(url)
soup = BeautifulSoup(page_html, 'html.parser')

# Show the page title as a quick check that the page loaded
title = soup.title.text.strip()
title

'2000 verbos populares en español en conjugador reverso ( dale clic a los vínculos)  | Verbling'

In [5]:
# Collect the href of every matching link; these are the pages that hold the verbs
a = soup.find_all('a', attrs={'rel':'nofollow noopener noreferrer'})
urls = [anchor.get('href') for anchor in a]

# Print the collected addresses, one per line
for href in urls:
    print(href)

https://conjugador.reverso.net/index-espanol-1-250.html
https://conjugador.reverso.net/index-espanol-251-500.html
https://conjugador.reverso.net/index-espanol-501-750.html
https://conjugador.reverso.net/index-espanol-751-1000.html
https://conjugador.reverso.net/index-espanol-1001-1250.html
https://conjugador.reverso.net/index-espanol-1251-1500.html
https://conjugador.reverso.net/index-espanol-1501-1750.html
https://conjugador.reverso.net/index-espanol-1751-2000.html


In [6]:
# Load every collected page and print its title (pages without a <title> are skipped)
for page_url in urls:
    soup = BeautifulSoup(scraper(page_url), 'html.parser')
    title = soup.title
    if title is None:
        continue
    print(title.text.strip())


Verbos más usados en Español desde ser hasta cortar | Conjugador Reverso
Verbos más usados en Español desde vigilar hasta reafirmar | Conjugador Reverso
Verbos más usados en Español desde dañar hasta liderar | Conjugador Reverso
Verbos más usados en Español desde rehabilitar hasta acuñar | Conjugador Reverso
Verbos más usados en Español desde transportar hasta contabilizar | Conjugador Reverso
Verbos más usados en Español desde saquear hasta aludir | Conjugador Reverso
Verbos más usados en Español desde embalar hasta atemorizar | Conjugador Reverso
Verbos más usados en Español desde atrasar hasta precautelar | Conjugador Reverso


In [7]:
# All the pages that hold the verbs were reachable, so the verbs can now be gathered into a single list.
# The first page is handled on its own here; the loop over all pages comes afterwards.
first_index_url = 'https://conjugador.reverso.net/index-espanol-1-250.html'
soup_2 = BeautifulSoup(scraper(first_index_url), 'html.parser')

## Create a list that combines all verbs from the different URLs

In [8]:
# Trial run on the first page: take the 'index-content' container and, for every child
# whose text is not a lone newline, split its stripped text on triple newlines
verbs = soup_2.find('div', attrs={'class': 'index-content'})
all_verbs = [
    child.text.strip().split('\n\n\n')
    for child in verbs
    if child.text != '\n'
]

In [9]:
# Collect the verbs of every page; each kept child contributes one nested list
all_verbs = []

for url in urls:

    # Load and parse the current page
    soup_2 = BeautifulSoup(scraper(url), 'html.parser')

    verbs = soup_2.find('div', attrs = {'class': 'index-content'})
    if verbs is None:
        # find() returns None when nothing matches; report the offending URL instead of
        # letting the loop below fail with "'NoneType' object is not iterable"
        raise ValueError("no <div class='index-content'> found at " + str(url))

    for verb in verbs:

        # Children whose text is exactly a newline carry no verbs and are skipped
        if verb.text != '\n':

            # The stripped text is split on triple newlines to get the individual entries
            all_verbs.append(verb.text.strip().split('\n\n\n'))

In [10]:
# Show how many nested lists were collected
len(all_verbs) # 8 means the container of every page was included

8

## Reshape the list to create arrays

In [11]:
# Flatten the nested lists into one flat list of verbs
verbs = [vrb for nestedlist in all_verbs for vrb in nestedlist]

# The list is later reshaped into rows of 20, so its length must be a multiple of 20.
# Pad with ' ' only as much as needed (one entry for 1999 verbs, none for 2000)
row_width = 20
verbs.extend([' '] * (-len(verbs) % row_width))
    

In [12]:
# Turn the flat Python list into a NumPy array
arr = np.asarray(verbs)

In [13]:
# Reshape into rows of 20 columns; -1 lets NumPy derive the row count
# (100 rows for 2000 entries) instead of hard-coding it
array = arr.reshape(-1, 20)
array

array([['ser', 'ver', 'decir', ..., 'seguir', 'promover', 'mantener'],
       ['dejar', 'evitar', 'dar', ..., 'llevar', 'aplicar',
        'desarrollar'],
       ['facilitar', 'apoyar', 'determinar', ..., 'esperar', 'salir',
        'matar'],
       ...,
       ['desatascar', 'fruncir', 'corroer', ..., 'vivificar', 'cebar',
        'enaltecer'],
       ['extraviar', 'hartar', 'hilvanar', ..., 'aparear', 'espabilar',
        'graznar'],
       ['readquirir', 'zarandear', 'capitanear', ..., 'fustigar',
        'precautelar', ' ']], dtype='<U17')

## Store data in DataFrame 

In [14]:
# Convert array to DataFrame
df = pd.DataFrame(array)

# Rename rows and columns; the label counts come from the actual shape so they
# always match the data instead of assuming exactly 100 x 20
n_rows, n_cols = df.shape
df.columns = ['Verbo  ' + str(i) for i in range(1, n_cols + 1)]
df.index = ['Dia ' + str(i) for i in range(1, n_rows + 1)]

In [15]:
# Write the DataFrame to a CSV file in the working directory
# NOTE(review): to_csv is documented to return None when given a path, so import_csv holds no data
csv_path = '2000_verbos.csv'
import_csv = df.to_csv(csv_path)