In [600]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [601]:
def get_url(name):
    """Search IMDb for a person and return the URL of the first hit.

    Parameters
    ----------
    name : str
        Person name to search for. Runs of whitespace are collapsed to a
        single space before the search.

    Returns
    -------
    str or None
        ``"https://www.imdb.com"`` joined with the ``href`` of the first link
        inside the first ``div`` whose class is ``"lister-item mode-detail"``.
        ``None`` if the request fails or the page has no such element.
    """
    try:
        # Let requests build the query string: characters such as "&", "#" or
        # non-ASCII letters are percent-encoded instead of corrupting the URL,
        # and spaces are still transmitted as "+".
        response = requests.get(
            "https://www.imdb.com/search/name/",
            params={"name": " ".join(name.split())},
            timeout=30,  # a stalled connection must not hang the whole run
        )
        soup = BeautifulSoup(response.text, "html.parser")
        href = soup.find("div", {"class": "lister-item mode-detail"}).find("a").get("href")
        return "https://www.imdb.com" + href
    except Exception:
        # Best-effort lookup: network errors and missing elements yield None.
        # KeyboardInterrupt / SystemExit are deliberately not caught so the
        # user can still stop a long scrape.
        return None

In [602]:
def get_roles(soup):
    """Collect up to three released roles from the first filmography section.

    Looks at the direct ``div`` children of the first
    ``div.filmo-category-section``. Entries containing an ``a.in_production``
    link are skipped. Each kept entry becomes ``"<label> in <title>"`` where
    ``<label>`` is the last non-blank text node sitting directly in the entry
    (empty string if there is none) and ``<title>`` is the text of the entry's
    first link.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` / ``find_all`` API.

    Returns
    -------
    list of str or None
        At most three role strings, or ``None`` if the section or a link is
        missing.
    """
    try:
        section = soup.find("div", {"class": "filmo-category-section"})
        roles_arr = []
        for entry in section.find_all("div", recursive=False):
            if entry.find_all("a", {"class": "in_production"}):
                continue  # not released yet
            label = ""
            for text in entry.find_all(text=True, recursive=False):
                if text.strip():
                    label = text.strip()  # keep overwriting: the last one wins
            roles_arr.append(label + " in " + str(entry.find("a").text))
            if len(roles_arr) > 2:
                break
        return roles_arr
    except Exception:
        # Best-effort: parsing problems give None, but KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return None

In [603]:
def get_video_url(soup):
    """Return the absolute URL of the first link inside ``div.heroWidget``.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        ``"https://www.imdb.com"`` joined with the link's ``href``, or ``None``
        when the widget, the link or the ``href`` attribute is missing.
    """
    try:
        href = soup.find("div", {"class": "heroWidget"}).find("a").get("href")
        return "https://www.imdb.com" + href
    except Exception:
        # Best-effort: a missing element gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [604]:
def get_bio(soup, url):
    """Download ``url + "/bio"`` and return the first biography paragraph.

    Parameters
    ----------
    soup : unused
        Kept only so existing callers keep working; the biography lives on a
        separate page that is fetched and parsed here.
    url : str
        Profile URL; ``"/bio"`` is appended to it.

    Returns
    -------
    str or None
        Stripped text of the first ``<p>`` inside the first ``div`` whose class
        is ``"soda odd"``, or ``None`` if the request fails or the element is
        missing.
    """
    try:
        # The timeout keeps one stalled request from hanging the whole run.
        response = requests.get(url + "/bio", timeout=30)
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed, so results can differ per machine.
        bio_soup = BeautifulSoup(response.text, "html.parser")
        return bio_soup.find("div", {"class": "soda odd"}).find("p").text.strip()
    except Exception:
        # Best-effort: network/parse failures give None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [605]:
def get_other_works(soup):
    """Return the first non-blank text sitting directly in ``div#details-other-works``.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        The stripped text, ``""`` if the section holds only blank text nodes,
        or ``None`` if the section is missing.
    """
    try:
        texts = soup.find("div", {"id": "details-other-works"}).find_all(text=True, recursive=False)
        return next((text.strip() for text in texts if text.strip() != ""), "")
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [606]:
def get_spouse(soup):
    """Return the text of the first link inside ``div#details-spouses``.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        The link text exactly as found (not stripped), or ``None`` if the
        section or the link is missing.
    """
    try:
        return soup.find("div", {"id": "details-spouses"}).find("a").text
    except Exception:
        # Best-effort: a missing element gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [607]:
def get_alternate_names(soup):
    """Join the non-blank texts sitting directly in ``div#details-akas``.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        The stripped texts joined with ``", "`` (``""`` when there are none),
        or ``None`` if the section is missing.
    """
    try:
        texts = soup.find("div", {"id": "details-akas"}).find_all(text=True, recursive=False)
        return ", ".join(text.strip() for text in texts if text.strip() != "")
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [608]:
def get_children(soup):
    """Join every meaningful text found anywhere inside ``div#details-children``.

    Blank texts, the ``"Children:"`` label and ``"|"`` separators are dropped
    after stripping.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        Remaining stripped texts joined with ``" | "`` (``""`` when nothing is
        left), or ``None`` if the section is missing.
    """
    ignored = {"", "Children:", "|"}
    try:
        texts = soup.find("div", {"id": "details-children"}).find_all(text=True, recursive=True)
        cleaned = (text.strip() for text in texts)
        return " | ".join(text for text in cleaned if text not in ignored)
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [609]:
def get_parents(soup):
    """Join every meaningful text found anywhere inside ``div#details-parents``.

    Blank texts, the ``"Parents:"`` label and ``"|"`` separators are dropped
    after stripping.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        Remaining stripped texts joined with ``" | "`` (``""`` when nothing is
        left), or ``None`` if the section is missing.
    """
    ignored = {"", "Parents:", "|"}
    try:
        texts = soup.find("div", {"id": "details-parents"}).find_all(text=True, recursive=True)
        cleaned = (text.strip() for text in texts)
        return " | ".join(text for text in cleaned if text not in ignored)
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [610]:
def get_quotes(soup):
    """Join every meaningful text found anywhere inside ``div#dyk-personal-quote``.

    Blank texts, the ``"Personal Quote:"`` label, the ``"»"`` marker and the
    ``"See more"`` link text are dropped. All comparisons are made on the
    stripped text, so a ``"See more"`` surrounded by whitespace is removed too.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        Remaining stripped texts joined with ``" | "`` (``""`` when nothing is
        left), or ``None`` if the section is missing.
    """
    ignored = {"", "Personal Quote:", "»", "See more"}
    try:
        texts = soup.find("div", {"id": "dyk-personal-quote"}).find_all(text=True, recursive=True)
        cleaned = (text.strip() for text in texts)
        return " | ".join(text for text in cleaned if text not in ignored)
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [611]:
def get_trivia(soup):
    """Join every meaningful text found anywhere inside ``div#dyk-trivia``.

    Blank texts, the ``"Trivia:"`` label, the ``"»"`` marker and the
    ``"See more"`` link text are dropped. All comparisons are made on the
    stripped text, so a ``"See more"`` surrounded by whitespace is removed too.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        Remaining stripped texts joined with ``" | "`` (``""`` when nothing is
        left), or ``None`` if the section is missing.
    """
    ignored = {"", "Trivia:", "»", "See more"}
    try:
        texts = soup.find("div", {"id": "dyk-trivia"}).find_all(text=True, recursive=True)
        cleaned = (text.strip() for text in texts)
        return " | ".join(text for text in cleaned if text not in ignored)
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [612]:
def get_trademark(soup):
    """Join every meaningful text found anywhere inside ``div#dyk-trademark``.

    Blank texts, the ``"Trademark:"`` label, the ``"»"`` marker and the
    ``"See more"`` link text are dropped. All comparisons are made on the
    stripped text, so a ``"See more"`` surrounded by whitespace is removed too.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        Remaining stripped texts joined with ``" | "`` (``""`` when nothing is
        left), or ``None`` if the section is missing.
    """
    ignored = {"", "Trademark:", "»", "See more"}
    try:
        texts = soup.find("div", {"id": "dyk-trademark"}).find_all(text=True, recursive=True)
        cleaned = (text.strip() for text in texts)
        return " | ".join(text for text in cleaned if text not in ignored)
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [613]:
def get_nickname(soup):
    """Join every meaningful text found anywhere inside ``div#dyk-nickname``.

    Blank texts, the ``"Nickname:"`` label, the ``"»"`` marker and the
    ``"See more"`` link text are dropped. All comparisons are made on the
    stripped text, so a ``"See more"`` surrounded by whitespace is removed too.

    Parameters
    ----------
    soup : parsed page exposing BeautifulSoup's ``find`` API.

    Returns
    -------
    str or None
        Remaining stripped texts joined with ``" | "`` (``""`` when nothing is
        left), or ``None`` if the section is missing.
    """
    ignored = {"", "Nickname:", "»", "See more"}
    try:
        texts = soup.find("div", {"id": "dyk-nickname"}).find_all(text=True, recursive=True)
        cleaned = (text.strip() for text in texts)
        return " | ".join(text for text in cleaned if text not in ignored)
    except Exception:
        # Best-effort: a missing section gives None; KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None

In [622]:
def main(names=None, output_path="output.xlsx"):
    """Scrape IMDb details for each person and write them to an Excel file.

    For every name the profile URL is looked up with ``get_url``, the profile
    page is downloaded, and one row is built from the ``get_*`` helpers.

    Parameters
    ----------
    names : iterable of str, optional
        People to look up. Defaults to the five sample names that were
        previously hard-coded.
    output_path : str, optional
        Where the Excel file is written. Defaults to ``"output.xlsx"``.

    Returns
    -------
    pandas.DataFrame
        One row per name, in input order. A person whose URL cannot be found,
        or whose page cannot be downloaded, still gets a row; the columns that
        could not be scraped are left as ``None``. Missing roles are ``None``.
    """
    if names is None:
        names = ["Ian Somerhalder", "Selena Gomez", "David Henry", "Adam Driver", "Adam Sandler"]
    columns = ["Person name", "URL", "Role 1", "Role 2", "Role 3", "Video", "Actor description",
               "Other works", "Alternate names", "Spouse", "Children", "Parents", "Personal quotes",
               "Trivia", "Trademark", "Nickname"]
    # Ask for the English version of the profile page.
    headers = {"Accept-Language": "en,en-gb;q=0.5"}

    records = []
    for name in tqdm(names):
        record = dict.fromkeys(columns)  # every column starts out as None
        record["Person name"] = name
        url = get_url(name)
        record["URL"] = url

        response = None
        if url is not None:  # get_url returns None when nothing was found
            try:
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException:
                # One unreachable page must not discard the rows already
                # collected; this person's detail columns simply stay empty.
                response = None

        if response is not None:
            # Explicit parser so results do not depend on which optional
            # parsers happen to be installed.
            soup = BeautifulSoup(response.text, "html.parser")

            # Parse roles once; get_roles may return None or fewer than three
            # entries, so pad to exactly three slots.
            roles = (get_roles(soup) or [])[:3]
            roles += [None] * (3 - len(roles))

            record.update({
                "Role 1": roles[0],
                "Role 2": roles[1],
                "Role 3": roles[2],
                "Video": get_video_url(soup),
                "Actor description": get_bio(soup, url),
                "Other works": get_other_works(soup),
                "Alternate names": get_alternate_names(soup),
                "Spouse": get_spouse(soup),
                "Children": get_children(soup),
                "Parents": get_parents(soup),
                "Personal quotes": get_quotes(soup),
                "Trivia": get_trivia(soup),
                "Trademark": get_trademark(soup),
                "Nickname": get_nickname(soup),
            })
        records.append(record)

    # Build the frame once instead of concatenating inside the loop.
    df_main = pd.DataFrame(records, columns=columns)
    df_main.to_excel(output_path)
    return df_main
    

In [623]:
# Run the scrape when this cell / script is executed directly.
if __name__ == "__main__":
    main()

100%|█████████████████████████████████████████████| 5/5 [00:13<00:00,  2.73s/it]


In [6]:
import pandas as pd 
# Load the spreadsheet; its "Directors" column is read further down.
# NOTE(review): the execution count restarts here (In [6] after In [623]), so
# this cell was presumably run in a different kernel session — confirm the
# notebook still works with Restart & Run All.
df = pd.read_excel("Directors_3022.xlsx")

In [9]:
# Gather every entry of the "Directors" column, in row order, into a plain list.
names = list(df["Directors"])

In [10]:
# Display the collected entries as the cell's output.
names

['Alan Taylor',
 'Christopher McQuarrie',
 'Gregory Jacobs',
 'Jeremy Saulnier',
 'Michael Showalter',
 'Rob Cohen',
 'Ingo Vollkammer',
 'Riccardo Spinotti',
 'A.L. Vijay',
 'A.T. White',
 'Aaron Barsky, Andre Young Snell',
 'Aaron Harvey',
 'Aaron Horvath, Peter Rida Michail',
 'Aaron Jay Rome',
 'Aaron K. Carter',
 'Aaron Katz',
 'Aaron Moorhead',
 'Aaron Nee',
 'Aaron Sorkin',
 'Aaron Woodley',
 'Abby Kohn, Marc Silverstein',
 'Abdellatif Kechiche',
 'Abe Forsythe',
 'Abel Ferrara',
 'Abhay Chopra',
 'Abhiraj Minawala',
 'Abhishek Varman',
 'Abner Pastoll',
 'Abolhassan Davoodi',
 'Adam Collins, Luke Radford',
 'Adam Egypt Mortimer',
 'Adam Green',
 'Adam Lipsius',
 'Adam MacDonald',
 'Adam McKay',
 'Adam Nee',
 'Adam Randall',
 'Adam Rifkin',
 'Adam Robitel',
 'Adam Shankman',
 'Adam Sigal',
 'Adam Wingard',
 'Adil El Arbi',
 'Adil El Arbi, Bilall Fallah',
 'Adina Pintilie',
 'Aditya Dhar',
 'Adrian Grunberg',
 'Adrian Lyne',
 'Adrian Molina',
 'Adrian Shergold',
 'Adrian Teh',
 '