### Scraping actor data from Wikipedia using BeautifulSoup
#### Dependency - installation of BeautifulSoup Package

In [18]:
# Install the beautifulsoup4 package, which provides the `bs4` module imported below.
# NOTE(review): the saved output under this cell reports a failure for a
# requirement named "install", which does not match this command -- the output
# looks stale; re-run the cell to confirm the install succeeds.
! pip install beautifulsoup4

Collecting install


  ERROR: Could not find a version that satisfies the requirement install (from versions: none)
ERROR: No matching distribution found for install


In [35]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re

import nltk
import urllib3
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Wikipedia category: American male film actors

In [36]:
# Root of the site being scraped; relative article links are appended to it.
BASE_URL = 'https://en.wikipedia.org'
# Category page whose listing is used as the starting point of the scrape.
WIKI_ACTORS_URL = ''.join((BASE_URL, '/wiki/Category:American_male_film_actors'))
# Counter initialised to zero; no code visible in this section reads or updates it.
total_added = 0

# Suppress urllib3's InsecureRequestWarning messages for the rest of the session.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

In [37]:
# One shared pool so that successive page fetches can reuse open connections
# instead of constructing a fresh PoolManager for every single request.
_HTTP_POOL = urllib3.PoolManager()


def get_soup(url):
    """Download ``url`` with an HTTP GET and parse the body as HTML.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        urllib3.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status. Without this check an error page would be parsed and
            handed to the caller as if it were real content.
    """
    response = _HTTP_POOL.request("GET", url)
    if response.status >= 400:
        raise urllib3.exceptions.HTTPError(
            "GET " + url + " returned HTTP status " + str(response.status)
        )
    return BeautifulSoup(response.data, 'html.parser')

#### Text preprocessing - citation removal

In [51]:
# Remove citations and punctuations
def preprocessingText(text):
    """Remove bracketed spans and most punctuation from ``text``.

    Steps:
      1. Delete every span that opens with ``(`` or ``[`` and runs to the
         nearest ``)`` or ``]`` on the same line (citation markers such as
         ``[1]`` and parenthetical asides).
      2. Split the remainder into sentences and then words with NLTK.
      3. Strip the characters listed in ``punctuations`` from each word and
         drop words that end up empty.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when nothing is left.
    """
    # Raw string: the same pattern written as a normal string literal relies
    # on invalid escape sequences, which newer Python versions warn about.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Characters removed from every token. '.', '?' and '!' (and the square
    # brackets) are not in this set, so they survive the cleaning step.
    punctuations = '"#$%&\'()*+,-/:;<=>@\\^_`{|}~'
    # Build the translation table once instead of once per word.
    strip_table = str.maketrans('', '', punctuations)

    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            cleaned = word.translate(strip_table)
            # A token made only of stripped characters becomes empty; skip it.
            if cleaned:
                tokens.append(cleaned)

    # Join once at the end rather than rebuilding the string per sentence.
    return ' '.join(tokens)

#### Parsing each actor webpage and writing into a text file

In [47]:
# write the given content into text file with name <title>.txt
def write_text_into_file(title, data):
    """Write ``data`` to ``ActorsDataset/<title>.txt`` below the working directory.

    The ``ActorsDataset`` directory is created when it does not exist yet.
    Characters in ``title`` that are path separators or are not allowed in
    file names on common platforms (``< > : " / \\ | ? *``) are replaced by
    ``_`` so a scraped title cannot break or redirect the output path.

    Args:
        title: Name used for the output file (without extension).
        data: Text content to store, written as UTF-8.

    Returns:
        None. A confirmation line containing the file path is printed.
    """
    safe_title = re.sub(r'[<>:"/\\|?*]', "_", title)

    # os.path.join picks the platform's separator; on Windows this yields the
    # same ".\ActorsDataset\<title>.txt" path the hard-coded literal produced.
    directory = os.path.join(".", "ActorsDataset")
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, safe_title + ".txt")

    # The context manager closes the handle even if the write fails.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data)
    print("Text file " + filename + " created")
        
# Parse each actor webpage content
def parse_actor_content(link, max_paragraphs=3, max_sentences=2):
    """Fetch an actor page and return a short, cleaned text extract.

    The first ``max_paragraphs`` non-empty ``<p>`` elements of the page's
    first ``mw-parser-output`` div are concatenated, the text is cut after
    ``max_sentences`` full stops, and the result is passed through
    ``preprocessingText``.

    Args:
        link: Absolute URL of the actor's page.
        max_paragraphs: Number of paragraphs to collect (default 3).
        max_sentences: Number of '.'-delimited pieces to keep (default 2).

    Returns:
        The cleaned extract as returned by ``preprocessingText``.

    Raises:
        IndexError: If the page contains no ``mw-parser-output`` div.
    """
    soup = get_soup(link)
    containers = soup.find_all("div", {"class": "mw-parser-output"})
    if not containers:
        raise IndexError("no 'mw-parser-output' content block found at " + link)

    collected = []
    for para in containers[0].find_all('p'):
        if len(collected) >= max_paragraphs:
            break
        # "mw-empty-elt" is a CSS class, so test the class attribute.
        # Attribute access such as `para.id` looks for a child <id> tag and
        # is always None, which let empty paragraphs use up the quota.
        if "mw-empty-elt" in (para.get("class") or []):
            continue
        collected.append(para.text.strip())

    data = "".join(chunk + "\n" for chunk in collected)

    # Keep only the leading sentences. Splitting on '.' is a naive boundary:
    # initials and abbreviations (e.g. "F. Murray Abraham") count as breaks.
    data = ".".join(data.split(".")[:max_sentences])
    # The split dropped the final full stop; restore it before cleaning.
    return preprocessingText(data + ".")

#### Parsing all actors' content

In [48]:
# iterate through every group
def parse_all_actors_from_wiki(url, max_actors=25):
    """Scrape actors listed on a category page and save one text file each.

    Every ``mw-category-group`` div on the page is visited in order. For each
    ``<li>`` of the ``<ul>`` that follows the div, the linked actor page is
    parsed with ``parse_actor_content`` and the result is stored through
    ``write_text_into_file`` under the link text.

    Args:
        url: URL of the category listing page.
        max_actors: Upper bound on the number of actors processed across all
            groups (default 25).

    Returns:
        None. The number of actors processed is printed at the end.
    """
    soup = get_soup(url)
    groups = soup.find_all("div", {"class": "mw-category-group"})
    no_of_actors = 0

    for group in groups:
        # Stop visiting further groups once the limit is reached. A `break`
        # in the inner loop alone does not end this outer loop, so later
        # groups would otherwise be scraped in full.
        if no_of_actors >= max_actors:
            break

        listing = group.find_next('ul')
        if listing is None:
            continue

        for li in listing.find_all('li'):
            anchor = li.a
            href = anchor.get('href') if anchor is not None else None
            # Skip list items that carry no usable link.
            if not href:
                continue

            name = anchor.text.strip()
            link = href.strip()
            data = parse_actor_content(BASE_URL + link)
            write_text_into_file(name, data)
            no_of_actors += 1
            if no_of_actors >= max_actors:
                break

    print(no_of_actors)

In [49]:
def main():
    """Entry point: run ``parse_all_actors_from_wiki`` on ``WIKI_ACTORS_URL``."""
    parse_all_actors_from_wiki(WIKI_ACTORS_URL)

In [50]:
# Start the scrape only when this code runs as the main module, not when it
# is imported from elsewhere.
if __name__ == "__main__":
    main()

Text file .\ActorsDataset\50 Cent.txt created
Text file .\ActorsDataset\Lee Aaker.txt created
Text file .\ActorsDataset\Willie Aames.txt created
Text file .\ActorsDataset\Quinton Aaron.txt created
Text file .\ActorsDataset\Victor Aaron.txt created
Text file .\ActorsDataset\Abbott and Costello.txt created
Text file .\ActorsDataset\Bruce Abbott.txt created
Text file .\ActorsDataset\Bud Abbott.txt created
Text file .\ActorsDataset\Christopher Abbott.txt created
Text file .\ActorsDataset\Philip Abbott.txt created
Text file .\ActorsDataset\Richard Abbott (actor).txt created
Text file .\ActorsDataset\Jake Abel.txt created
Text file .\ActorsDataset\Walter Abel.txt created
Text file .\ActorsDataset\Zachary Abel.txt created
Text file .\ActorsDataset\F. Murray Abraham.txt created
Text file .\ActorsDataset\Jon Abrahams.txt created
Text file .\ActorsDataset\Omid Abtahi.txt created
Text file .\ActorsDataset\Yousef Abu-Taleb.txt created
Text file .\ActorsDataset\Kirk Acevedo.txt created
Text file .\