In [1]:
from selenium import webdriver
import time
import re
from bs4 import BeautifulSoup
import pandas as pd
from twitter import *
from typing import List
import shutil
import json
from tqdm import tqdm

import os
from dotenv import load_dotenv
# Twitter API credentials are kept out of the notebook: load_dotenv() is called
# before they are read from the process environment. Any variable that is not
# set ends up as None here.
load_dotenv()
TOKEN = os.environ.get("TOKEN")
TOKEN_SECRET = os.environ.get("TOKEN_SECRET")
CONSUMER_KEY = os.environ.get("CONSUMER_KEY")
CONSUMER_SECRET = os.environ.get("CONSUMER_SECRET")

In [2]:
def get_driver():
    """Create and return a headless Firefox WebDriver.

    The caller owns the returned driver and is responsible for closing it.
    """
    firefox_options = webdriver.FirefoxOptions()
    # Run without a visible browser window.
    firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)


def get_coalition_links():
    """Scrape the Sejm clubs index page for links to each club's member list.

    Returns:
        list: the ``href`` value of every anchor on the page whose ``href``
        contains ``"klubposlowie"``, in document order. The values are
        returned exactly as they appear in the page markup.
    """
    wd = get_driver()
    try:
        wd.get("https://www.sejm.gov.pl/sejm9.nsf/kluby.xsp")
        page_source = wd.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Firefox process running in the background.
        wd.quit()
    links_soup = BeautifulSoup(page_source, 'lxml')
    # href=True restricts the search to anchors that actually carry an href
    # attribute. Indexing a tag without one (l['href']) raises KeyError
    # instead of returning None, so an "is not None" check cannot guard it.
    coalition_links = [
        l['href']
        for l in links_soup.find_all('a', href=True)
        if "klubposlowie" in l['href']
    ]
    return coalition_links

def get_coalition_politicians(coalition_link):
    """Return the deputy names listed on one club's member page.

    Args:
        coalition_link: path (and query string) of the club page; it is
            appended to ``https://www.sejm.gov.pl`` to form the full URL.

    Returns:
        list: the text content of every ``<div class="deputyName">`` element
        on the page, in document order.
    """
    whole_link = "https://www.sejm.gov.pl" + coalition_link
    wd = get_driver()
    try:
        wd.get(whole_link)
        page_source = wd.page_source
    finally:
        # This function is called once per club; without quit() each call
        # would leave another headless Firefox process behind.
        wd.quit()
    soup = BeautifulSoup(page_source, 'lxml')
    people = [d.text for d in soup.find_all("div", class_="deputyName")]
    return people

In [2]:
# Fetch the club member-list links once; the collection loop further down
# iterates over this list. The bare name on the last line displays the result.
coalition_links = get_coalition_links()
coalition_links

['/sejm9.nsf/klubposlowie.xsp?klub=PiS',
 '/sejm9.nsf/klubposlowie.xsp?klub=KO',
 '/sejm9.nsf/klubposlowie.xsp?klub=Lewica',
 '/sejm9.nsf/klubposlowie.xsp?klub=PSL-Kukiz15',
 '/sejm9.nsf/klubposlowie.xsp?klub=Konfederacja',
 '/sejm9.nsf/klubposlowie.xsp?klub=niez.']

In [32]:
def get_possible_accounts(person_name: str, coalition_name: str, n_first=3) -> List:
    """Search Twitter users by name and record the first few candidates.

    The directory ``users/<coalition_name>/<name>`` (``name`` being
    ``person_name`` with spaces replaced by underscores) is wiped and
    recreated on every call, and each kept search result is dumped there as
    ``<name>_<i>.json``.

    Args:
        person_name: query string passed to the Twitter user search.
        coalition_name: name of the sub-directory of ``users/`` to write into.
        n_first: maximum number of search results to keep.

    Returns:
        A list of ``(screen_name, verified)`` tuples, at most ``n_first`` long,
        in the order the search returned them.
    """
    twitter = Twitter(auth=OAuth(TOKEN, TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET))
    results = twitter.users.search(q=person_name)
    name = person_name.replace(' ', '_')
    path = f"users/{coalition_name}/{name}"
    # Start from a clean directory so stale files from an earlier run do not
    # mix with the current results.
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs (not mkdir) so the call also works when the parent
    # users/<coalition_name> directory has not been created yet.
    os.makedirs(path)
    possible_accounts = []
    for i, user in enumerate(results):
        if i >= n_first:
            break
        possible_accounts.append((user['screen_name'], user['verified']))
        # Keep the full user record on disk for later manual inspection.
        with open(os.path.join(path, f"{name}_{i}.json"), "w", encoding='utf-8') as f:
            json.dump(user, f, ensure_ascii=False)
    return possible_accounts

In [34]:
# Number of candidate Twitter accounts to keep per politician.
n_first = 3

results = []
base_href = "https://twitter.com/"

for link in coalition_links:
    # The club name is the value of the link's last "=..." query parameter.
    coalition_name = link.split("=")[-1]
    print(coalition_name)
    coalition_path = f"users/{coalition_name}"
    # Recreate the club directory from scratch on every run.
    if os.path.exists(coalition_path):
        shutil.rmtree(coalition_path)
    # makedirs (not mkdir) so a fresh checkout without a users/ directory works.
    os.makedirs(coalition_path)
    politicians = get_coalition_politicians(link)
    for politician_name in tqdm(politicians):
        possible_accounts = get_possible_accounts(politician_name, coalition_name, n_first)
        possible_links = [(base_href + name, verified) for name, verified in possible_accounts]
        # Flatten [(url, verified), ...] into [url, verified, url, verified, ...].
        data_flat = [item for t in possible_links for item in t]
        # Pad to a fixed width: a politician may have fewer than n_first hits,
        # and the DataFrame constructor rejects the data outright if no row
        # reaches the full column count.
        data_flat += [None] * (2 * n_first - len(data_flat))
        res = (coalition_name, politician_name, *data_flat)
        results.append(res)


# Two columns per candidate account: profile URL and verified flag.
columns=["coalition", "name"]
for i in range(n_first):
    columns.append(f"acc_{i}")
    columns.append(f"ver_{i}")
result_df = pd.DataFrame(results, columns=columns)
result_df.to_csv("res.csv", index=False)
result_df

PiS
KO
Lewica
PSL-Kukiz15
Konfederacja
niez.


100%|██████████| 234/234 [01:07<00:00,  3.47it/s]
100%|██████████| 134/134 [00:38<00:00,  3.49it/s]
100%|██████████| 48/48 [00:14<00:00,  3.35it/s]
100%|██████████| 30/30 [00:08<00:00,  3.44it/s]
100%|██████████| 11/11 [00:03<00:00,  2.97it/s]
100%|██████████| 2/2 [00:00<00:00,  3.30it/s]


Unnamed: 0,coalition,name,acc_0,ver_0,acc_1,ver_1,acc_2,ver_2
0,PiS,Adamczyk Andrzej,https://twitter.com/AMAdamczyk,False,https://twitter.com/adamczyk89a,False,https://twitter.com/Adamczyk___,False
1,PiS,Andruszkiewicz Adam,https://twitter.com/Andruszkiewicz1,False,https://twitter.com/PopieramyAA,False,https://twitter.com/MrDiamondGames4,False
2,PiS,Andzel Waldemar,https://twitter.com/AndzelWaldemar,False,,,,
3,PiS,Ardanowski Jan Krzysztof,https://twitter.com/jkardanowski,False,,,,
4,PiS,Arent Iwona,https://twitter.com/IwonaArent,False,https://twitter.com/arent_iwona,False,https://twitter.com/iwona_arent,False
...,...,...,...,...,...,...,...,...
454,Konfederacja,Tuduj Krzysztof,https://twitter.com/TudujKrzysztof,False,https://twitter.com/KTuduj,False,,
455,Konfederacja,Urbaniak Michał,,,,,,
456,Konfederacja,Winnicki Robert,https://twitter.com/RobertWinnicki,False,https://twitter.com/BobWinnicki,False,https://twitter.com/winnicki_robert,False
457,niez.,Galla Ryszard,https://twitter.com/Ryszard_Galla,False,https://twitter.com/GallaRyszard,False,https://twitter.com/RyszardGalla,False
