In [2]:
import os
import gdown

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
# Step 1: Download CSV from Google Drive
def download_csv_from_google_drive(url, output_path):
    """Fetch the file behind a Google Drive ``url`` and store it at ``output_path``.

    The work is delegated to ``gdown.download`` with ``quiet=False``.
    This function itself returns nothing.
    """
    gdown.download(url=url, output=output_path, quiet=False)

In [5]:
# Step 2: Read the CSV to get Twitter profile URLs
def load_profile_urls(csv_path, column='twitter_profile_url'):
    """Return the Twitter profile URLs stored in a CSV file as a list.

    Args:
        csv_path: Path of the CSV file to read.
        column: Name of the header column expected to hold the URLs.

    Returns:
        list: The values of ``column`` when the file has that header.
        When the header is absent the file is treated as headerless and the
        values of its first column are returned, so the first row is kept as
        data instead of being consumed as column names.
    """
    df = pd.read_csv(csv_path)
    if column in df.columns:
        return df[column].tolist()
    # The expected header is missing: re-read without a header row so the
    # first line of the file is returned as a URL rather than lost.
    return pd.read_csv(csv_path, header=None)[0].tolist()

In [17]:

# Step 3: Scrape Twitter profile details
def scrape_twitter_profile(url, timeout=10):
    """Fetch a Twitter profile page and extract the fields found in its HTML.

    Args:
        url: Profile URL, e.g. ``https://twitter.com/<handle>``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        dict: Keys ``Bio``, ``Following Count``, ``Followers Count``,
        ``Location`` and ``Website``. A value is ``None`` when the matching
        element is not present in the fetched HTML.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    from urllib.parse import urlsplit

    # Without a timeout a single unresponsive host would block the whole run.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    def text_of(tag):
        """Return the tag's text, or None when the tag was not found."""
        return tag.text if tag else None

    # Normalise a trailing slash so "<url>/" does not produce "//following".
    base_url = url.rstrip('/')
    profile_path = urlsplit(base_url).path

    def count_for(section):
        """Return the text of the <span> inside the link to ``section``."""
        # Accept the absolute form the page was requested with as well as the
        # site-relative form of the same link.
        candidates = {f"{url}/{section}", f"{base_url}/{section}"}
        if profile_path:
            candidates.add(f"{profile_path}/{section}")
        link = soup.find('a', href=lambda value: value in candidates)
        # The link may exist without a nested <span>; treat that as "no count".
        span = link.find('span') if link else None
        return text_of(span)

    website_tag = soup.find('a', {'data-testid': 'UserUrl'})

    return {
        "Bio": text_of(soup.find('div', {'data-testid': 'UserDescription'})),
        "Following Count": count_for('following'),
        "Followers Count": count_for('followers'),
        "Location": text_of(soup.find('span', {'data-testid': 'UserLocation'})),
        # .get avoids a KeyError when the anchor carries no href attribute.
        "Website": website_tag.get('href') if website_tag else None,
    }

In [18]:
# Step 4: Save data to a CSV
def save_data_to_csv(data, output_path):
    """Write ``data`` to ``output_path`` as CSV and print a confirmation.

    ``data`` is handed to ``pd.DataFrame``; the frame's index is not written.
    """
    pd.DataFrame(data).to_csv(output_path, index=False)
    confirmation = f"Data saved to {output_path}"
    print(confirmation)

In [25]:
# Main script
if __name__ == "__main__":
    # Local file names: the downloaded profile list and the CSV that the
    # scraped results are later saved to.
    input_csv = 'profiles.csv'
    output_csv = 'twitter_profiles_data.csv'
    # Google Drive direct-download link for the list of profile URLs.
    csv_url = 'https://drive.google.com/uc?id=1PLYwrGn5YApyWU2QpjbdhM6tea0HuGq7'
    # Fetch the profile list into the local input file.
    download_csv_from_google_drive(url=csv_url, output_path=input_csv)
    

Downloading...
From: https://drive.google.com/uc?id=1PLYwrGn5YApyWU2QpjbdhM6tea0HuGq7
To: C:\Users\user\profiles.csv
100%|██████████████████████████████████████████████████████████████████████████████████| 492/492 [00:00<00:00, 449kB/s]


In [29]:
import pandas as pd

# Step 2 (revised): Read the CSV without headers to get Twitter profile URLs
def load_profile_urls(csv_path):
    """Load profile URLs from the first column of a headerless CSV file.

    The first rows of the parsed file are printed as a sanity check. Any
    failure is reported with ``print`` and results in an empty list instead
    of an exception.
    """
    try:
        # header=None keeps the first row as data rather than column names.
        frame = pd.read_csv(csv_path, header=None)
        print("First few rows of the CSV file:", frame.head())
        # Column 0 is assumed to hold the URLs.
        return frame[0].tolist()
    except (FileNotFoundError, pd.errors.EmptyDataError) as known_error:
        if isinstance(known_error, FileNotFoundError):
            print(f"Error: The file '{csv_path}' was not found.")
        else:
            print("Error: The file is empty.")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Reached only after one of the handlers above has reported a problem.
    return []

# Load every profile URL from the locally downloaded CSV and show the result.
input_csv = 'profiles.csv'  # Path to the local CSV file
profile_urls = load_profile_urls(csv_path=input_csv)
print("Loaded profile URLs:", profile_urls)


First few rows of the CSV file:                                       0
0            https://twitter.com/GTNUK1
1          https://twitter.com/whatsapp
2     https://twitter.com/aacb_CBPTrade
3        https://twitter.com/aacbdotcom
4  https://twitter.com/@AAWindowPRODUCT
Loaded profile URLs: ['https://twitter.com/GTNUK1', 'https://twitter.com/whatsapp', 'https://twitter.com/aacb_CBPTrade', 'https://twitter.com/aacbdotcom', 'https://twitter.com/@AAWindowPRODUCT', 'https://www.twitter.com/aandb_kia', 'https://twitter.com/ABHomeInc', 'https://twitter.com/Abrepro', 'http://www.twitter.com', 'https://twitter.com/ACChristofiLtd', 'https://twitter.com/aeclothing1', 'http://www.twitter.com/', 'https://twitter.com/AETechnologies1', 'http://www.twitter.com/wix', 'https://twitter.com/AGInsuranceLLC']


In [31]:
 # Scrape each profile and collect data
# Scrape every profile, keeping the rows collected so far even when an
# individual request fails.
scraped_data = []
failed_urls = []  # URLs whose request raised, kept for later inspection
for url in profile_urls:
    print(f"Scraping {url}")
    try:
        data = scrape_twitter_profile(url)
    except requests.RequestException as exc:
        # One unreachable or malformed URL must not abort the whole batch:
        # results are only written to disk after this loop has finished.
        print(f"  Skipped {url}: {exc}")
        failed_urls.append(url)
        continue
    data['URL'] = url
    scraped_data.append(data)

Scraping https://twitter.com/GTNUK1
Scraping https://twitter.com/whatsapp
Scraping https://twitter.com/aacb_CBPTrade
Scraping https://twitter.com/aacbdotcom
Scraping https://twitter.com/@AAWindowPRODUCT
Scraping https://www.twitter.com/aandb_kia
Scraping https://twitter.com/ABHomeInc
Scraping https://twitter.com/Abrepro
Scraping http://www.twitter.com
Scraping https://twitter.com/ACChristofiLtd
Scraping https://twitter.com/aeclothing1
Scraping http://www.twitter.com/
Scraping https://twitter.com/AETechnologies1
Scraping http://www.twitter.com/wix
Scraping https://twitter.com/AGInsuranceLLC


In [33]:
# Persist every scraped profile row to the output CSV.
save_data_to_csv(data=scraped_data, output_path=output_csv)

Data saved to twitter_profiles_data.csv
