In [13]:
#Python Project - Give list of URLs - Return URL, Response Code, and Page Title

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Specify the path to your Excel file
excel_file_path = r"D:\Ahrefs\Test_for_SF.xlsx"

# Load the Excel file into a pandas DataFrame
df = pd.read_excel(excel_file_path, sheet_name="Sheet1")

In [4]:
df.head()

Unnamed: 0,Harvested Landing Page
0,http://336realty.com/
1,http://400realestate.com/
2,http://alexhayeshomes.com/
3,http://amyjamros-realtor.com
4,http://a-statusrealty.com/


In [6]:
df2 = df["Harvested Landing Page"]
df3 = df2.iloc[0:100] #getting the first 100 rows or edit as how many rows needed

In [7]:
import concurrent.futures
def get_page_info(url):
    try:
        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            title = soup.title.string if soup.title else "No title"
            return url, response.status_code, title
        else:
            return url, response.status_code, f"Error: {response.status_code}"

    except Exception as e:
        return url, -1, str(e)

def get_page_info_parallel(urls, num_workers=20):
    page_info_list = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Use the map method to parallelize fetching page info for multiple URLs
        page_info_list = list(executor.map(get_page_info, urls))

    return page_info_list

if __name__ == "__main__":
    # list of URLs you want to fetch information for
    urls = df3

    # Call the function to fetch page information (URL, response code, title) for the list of URLs in parallel
    page_info_list = get_page_info_parallel(urls)

    # Print the page information for each URL
    for url, status_code, title in page_info_list:
        print(f"URL: {url}\nResponse Code: {status_code}\nTitle: {title}\n")

URL: http://336realty.com/
Response Code: 200
Title: None

URL: http://400realestate.com/
Response Code: 200
Title: ** Silver City Group **

URL: http://alexhayeshomes.com/
Response Code: 200
Title: ** Alex Hayes Homes **

URL: http://amyjamros-realtor.com
Response Code: -1
Title: HTTPConnectionPool(host='amyjamros-realtor.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000206E8DF22D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

URL: http://a-statusrealty.com/
Response Code: 200
Title: None

URL: http://austin101realty.com/
Response Code: 200
Title: Friendly, Patient Austin Real Estate Agents | Austin 101 Realty

URL: http://azrealtorkatie.com/
Response Code: -1
Title: HTTPConnectionPool(host='azrealtorkatie.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000206E8D98210>: Failed to establish a n

In [12]:
import csv

# Save the page_info_list as a CSV file
with open("page_info.csv", "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write header row
    csv_writer.writerow(["URL", "Response Code", "Title"])
    # Write data rows
    csv_writer.writerows(page_info_list)

print("Data saved to page_info.csv")

Data saved to page_info.csv
