# Data Scraping

In [None]:
import pandas as pd
import numpy as np
import time

### Scraping Following List using Selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException

In [None]:
# Twitter accounts whose "following" lists are scraped below
twitter_usernames = [
    'CreativeScots',
    'BordersBookFest',
    'TradDanceScot',
    'scottishmusic',
    'GlasgowTramway',
    'makemanifesto',
    'summerhallery',
]

In [None]:
# Function to scrape followed accounts

# XPath of the per-account container inside each cell of the "following" list,
# and of the span inside it whose text is the "@handle".
# NOTE(review): these are generated CSS class names and presumably change whenever
# Twitter updates its front end - confirm against the live page before a run.
_FOLLOWING_CELL_XPATH = (
    '//div[@data-testid="cellInnerDiv"]'
    '//div[@class="css-146c3p1 r-dnmrzs r-1udh08x r-3s2u2q r-bcqeeo r-1ttztb7 '
    'r-qvutc0 r-37j5jr r-a023e6 r-rjixqe r-16dba41 r-18u37iz r-1wvb978"]'
)
_FOLLOWING_HANDLE_XPATH = (
    _FOLLOWING_CELL_XPATH
    + '//span[@class="css-1jxf684 r-bcqeeo r-1ttztb7 r-qvutc0 r-poiln3"]'
)

# Give up scrolling a list once this many seconds pass without a new handle.
_SCROLL_STALL_TIMEOUT = 30


def _submit_login_field(driver, field_name, value):
    """Type ``value`` into the input named ``field_name``, press RETURN and wait 2 s."""
    field = driver.find_element(By.NAME, field_name)
    field.send_keys(value)
    field.send_keys(Keys.RETURN)
    # Fixed pause so the next step of the login flow has time to render
    time.sleep(2)


def _log_in(driver, login_username, login_password, login_verification):
    """Walk through the Twitter login form: username, optional email/phone, password."""
    driver.get("https://twitter.com/login")
    time.sleep(2)

    _submit_login_field(driver, "text", login_username)
    # Twitter sometimes asks additionally for the email/phone number; that prompt
    # reuses the input named "text", so it is only filled in when a value is given.
    if login_verification is not None:
        _submit_login_field(driver, "text", login_verification)
    _submit_login_field(driver, "password", login_password)


def _collect_following(driver, user):
    """Scroll through ``user``'s following page and return the handles found, in order."""
    driver.get(f"https://twitter.com/{user}/following")
    time.sleep(2)

    # Scroll one viewport at a time
    height_viewport = driver.execute_script("return window.innerHeight;")

    try:
        # Number of account cells visible before any scrolling
        initial_count = len(
            WebDriverWait(driver, 20).until(
                EC.visibility_of_all_elements_located((By.XPATH, _FOLLOWING_CELL_XPATH))
            )
        )
    except TimeoutException:
        # No account cell became visible within 20 s: return an empty list for this
        # user instead of aborting the scrape of all remaining users.
        return []

    following_accounts = []
    seen = set()  # O(1) duplicate check; the list keeps first-seen order
    last_progress = time.monotonic()

    # The cell-count check below compares against the count taken before scrolling,
    # so on its own it can keep succeeding forever once more cells are in the DOM.
    # The stall timeout guarantees the loop ends when no new handle shows up.
    while time.monotonic() - last_progress <= _SCROLL_STALL_TIMEOUT:
        try:
            driver.execute_script(f"window.scrollBy(0, {height_viewport});")
            wait = WebDriverWait(driver, 10)
            wait.until(
                EC.visibility_of_all_elements_located((By.XPATH, _FOLLOWING_CELL_XPATH))
            )
            wait.until(
                lambda d: len(d.find_elements(By.XPATH, _FOLLOWING_CELL_XPATH)) > initial_count
            )
            handles = [
                element.text
                for element in driver.find_elements(By.XPATH, _FOLLOWING_HANDLE_XPATH)
            ]
        except StaleElementReferenceException:
            # The page re-rendered while elements were being read; retry shortly
            time.sleep(1)
            continue
        except TimeoutException:
            # The cell-count condition was not met within 10 s: treat as end of list
            break

        found_new = False
        for handle in handles:
            # Skip empty texts and handles already recorded
            if handle and handle not in seen:
                seen.add(handle)
                following_accounts.append(handle)
                found_new = True

        if found_new:
            last_progress = time.monotonic()
        else:
            # Nothing new yet: pause briefly instead of hammering the browser
            time.sleep(0.5)

    return following_accounts


def scrape_following(usernames, login_username="USERNAME", login_password="PASSWORD",
                     login_verification=None):
    """Log in to Twitter with Selenium and scrape the accounts each user follows.

    Args:
        usernames: iterable of Twitter usernames whose following pages are visited.
        login_username: username typed into the login form
            (replace the placeholder with your Twitter username).
        login_password: password typed into the login form
            (replace the placeholder with your Twitter password).
        login_verification: email/phone number to enter if Twitter asks for it
            after the username step; ``None`` skips that step.

    Returns:
        dict mapping every entry of ``usernames`` to the list of handle texts
        collected from that user's following page (possibly empty).

    Selenium errors raised during login or navigation propagate to the caller;
    the browser is closed in every case.
    """
    # Launch Chrome browser
    driver = webdriver.Chrome()
    try:
        _log_in(driver, login_username, login_password, login_verification)

        following_accounts_dict = {}
        for user in usernames:
            following_accounts_dict[user] = _collect_following(driver, user)
        return following_accounts_dict
    finally:
        # Always release the browser, even when scraping fails part-way
        driver.quit()

# Call the function to scrape followed accounts for every entry of twitter_usernames.
# This launches a Chrome window and logs in to Twitter, so it needs valid credentials.
following_accounts_dict = scrape_following(twitter_usernames)

In [None]:
# Show the scraped {username: [followed accounts]} mapping as the cell output
following_accounts_dict

In [None]:
# Drop every '@' from the scraped handles, keeping them grouped per scraped user
following_accounts_dict_ = {}
for owner, handles in following_accounts_dict.items():
    following_accounts_dict_[owner] = [handle.replace('@', '') for handle in handles]
following_accounts_dict_

### Scraping Number of Followers using Snscrape

In [None]:
# Scraping the number of followers for each user
import snscrape.modules.twitter as sntwitter

def get_follower_count(screen_name):
    """Return the follower count of ``screen_name`` looked up through snscrape.

    Any exception raised during the lookup is printed as ``Error: ...`` and
    turned into a ``None`` result, so one failing account never stops a batch.
    """
    try:
        profile = sntwitter.TwitterUserScraper(screen_name).entity
        followers = profile.followersCount
    except Exception as err:
        print(f"Error: {err}")
        followers = None
    return followers

# One {account: follower count} dict per scraped user, in the same order as
# following_accounts_dict_; accounts whose lookup returned None are left out
no_of_followers_list_ = []
for followed_accounts in following_accounts_dict_.values():
    counts_by_account = {}
    for handle in followed_accounts:
        n_followers = get_follower_count(handle)
        if n_followers is None:
            continue
        counts_by_account[handle] = n_followers
    no_of_followers_list_.append(counts_by_account)

In [None]:
# Show the per-user {account: follower count} dicts as the cell output
no_of_followers_list_

In [None]:
# For every scraped user, keep the ten followed accounts with the most followers
# NOTE(review): the per-user top tens are simply appended, so an account followed
# by several users can appear more than once in accounts_to_scrape_.
accounts_to_scrape_ = []
for follower_counts in no_of_followers_list_:
    ranked = pd.DataFrame({
        'username': list(follower_counts.keys()),
        'followers_count': list(follower_counts.values()),
    })
    ranked = ranked.sort_values(by='followers_count', ascending=False)
    ranked = ranked.reset_index(drop=True)
    accounts_to_scrape_.extend(ranked['username'].head(10).tolist())

print(accounts_to_scrape_)

### Scraping Tweets using Ntscraper

In [None]:
from ntscraper import Nitter
from IPython.display import FileLink
# Module-level Nitter scraper instance; get_tweets() below reads it as a global
scraper = Nitter(log_level=1, skip_instance_check = False)

In [None]:
# Function to scrape the tweets
def get_tweets(name, modes, no):
    """Fetch tweets through the global ``scraper`` and flatten them into rows.

    Args:
        name: first argument handed to ``scraper.get_tweets``.
        modes: forwarded as the ``mode`` keyword.
        no: forwarded as the ``number`` keyword.

    Returns:
        A list with one 9-item list per tweet, in this order: link, text,
        user name, user profile id, date, comments, retweets, quotes, likes.
    """
    result = scraper.get_tweets(name, mode=modes, number=no)
    return [
        [
            item['link'],
            item['text'],
            item['user']['name'],
            item['user']['profile_id'],
            item['date'],
            item['stats']['comments'],
            item['stats']['retweets'],
            item['stats']['quotes'],
            item['stats']['likes'],
        ]
        for item in result['tweets']
    ]

# Function to extract Twitter account name from URL
import re

# Captures the path segment right after "twitter.com/" or "x.com/" when it is
# followed by another "/". The lookbehind keeps "x.com" from matching inside a
# longer host name such as "dropbox.com".
_TWITTER_ACCOUNT_RE = re.compile(r'(?:twitter|(?<![\w-])x)\.com/([^/]+)/')


def twitter_account_name(url):
    """Return the account name embedded in a tweet URL, or ``None``.

    Args:
        url: a link such as ``https://twitter.com/<account>/status/<id>``.
            Non-string values (``None``, NaN from a DataFrame column) are
            accepted and yield ``None`` instead of raising ``TypeError``.

    Returns:
        The path segment following ``twitter.com/`` (or ``x.com/``), or ``None``
        when the URL does not contain that pattern.
    """
    if not isinstance(url, str):
        return None
    match = _TWITTER_ACCOUNT_RE.search(url)
    return match.group(1) if match else None

# Accounts whose tweets are scraped below
all_twitter_accounts = accounts_to_scrape_

# Empty dataframe with one column per tweet field; the scraped tweets are added to it
complete_df = pd.DataFrame(
    columns=[
        'link', 'text', 'user_name', 'profile_id', 'date',
        'no_comments', 'no_retweets', 'no_quotes', 'no_likes',
    ]
)

# Scrape up to 1000 tweets per account, pausing 5 s between requests.
# An IndexError from the scraper is retried a bounded number of times; if the
# account still fails it is skipped, so one bad account cannot abort the whole
# run and discard the tweets already collected.
max_attempts = 3
all_accounts = []
for accounts in all_twitter_accounts:
    for attempt in range(1, max_attempts + 1):
        try:
            data = get_tweets(accounts, 'user', 1000)
        except IndexError:
            if attempt < max_attempts:
                print("IndexError occured. Retrying......")
            else:
                print(f"IndexError persisted after {max_attempts} attempts. Skipping {accounts}.")
            time.sleep(5)
        else:
            all_accounts.append(data)
            time.sleep(5)
            break

# Turn each account's rows into a DataFrame and concatenate everything in a single
# call: calling pd.concat inside the loop copies the accumulated frame on every
# iteration, which grows quadratically with the number of accounts.
tweet_columns = ['link', 'text', 'user_name', 'profile_id', 'date', 'no_comments', 'no_retweets', 'no_quotes', 'no_likes']
account_frames = [pd.DataFrame(account_rows, columns=tweet_columns) for account_rows in all_accounts]
complete_df = pd.concat([complete_df, *account_frames], ignore_index=True)

# Derive the account name of every tweet from its link
complete_df['account_name'] = complete_df['link'].apply(twitter_account_name)

# Look up the follower count once per distinct account name
unique_accounts = complete_df['account_name'].unique()
accounts_followers_dict_ = {handle: get_follower_count(handle) for handle in unique_accounts}

# Attach the follower count to each tweet, then drop every row that has a missing
# value in any column (this includes rows whose follower lookup returned None)
complete_df = complete_df.assign(
    followers=complete_df['account_name'].map(accounts_followers_dict_)
).dropna()

# Write the result to CSV and show a download link as the cell output
output_path = 'twitter_data.csv'
complete_df.to_csv(output_path, index=False)
FileLink(output_path)