### Webscrapping code for apartments.com

The first code block is the packages and supporting functions for regex parsing.

The second code block is the web scrapping that goes through all the colleges, and pulls out the html, then sends it to be web scraped.

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
import pandas as pd


# Custom function to extract value after dollar sign
def extract_value_after_dollar(row):
    # Find the index of the dollar sign
    dollar_index = row.find('$')
    # Extract everything after the dollar sign
    value_after_dollar = row[dollar_index + 1:]
    return value_after_dollar

def extract_average(html): 
    if re.search("404 Page Not Found", html):
        return None
    else: 
        pattern = r'aria-label="[^"]*">[^<]*<p class="property-pricing">[^<]*</p>'
        prices = re.findall(pattern, html)

        df = pd.DataFrame(prices, columns=['Text'])
        
        if len(df) < 5:
            return None

        # Separate the 'Text' column into 'AptName' and 'Price' columns
        df[['AptName', 'Price']] = df['Text'].str.split('>[^<]*<p', expand=True)

        # Clean 'AptName' and 'Price' columns
        df['AptName'] = df['AptName'].str.replace('aria-label="', '')
        df['AptName'] = df['AptName'].str.replace(r',[^,]*$', '')
        df['Price'] = df['Price'].str.replace('class="property-pricing">\\$', '').str.replace('</p>', '')

        # Separate 'Price' column into 'Price1' and 'Price2' columns
        df[['Price1', 'Price2']] = df['Price'].str.split(' - ', expand=True)

        # Drop the 'Text' and 'Price' columns
        df.drop(columns=['Text', 'Price'], inplace=True)
        df['Price2'] = pd.to_numeric(df['Price2'].str.replace(',', ''), errors='coerce')
        df['Price1'] = df['Price1'].apply(extract_value_after_dollar)
        df['Price1'] = pd.to_numeric(df['Price1'].str.replace(',', ''), errors='coerce')

        return((df['Price1'].mean() + df['Price2'].mean())/2)
#     return df

In [2]:
## https://www.youtube.com/watch?v=SPM1tm2ZdK4&t=855s

college_file_list = []
# college_file_list = ['jacksonville-al', 'auburn-alabama', 'ames-ia', 'college-station-tx']

with open('Collegetowns2.csv', 'r') as file:
    # Create a CSV reader object using DictReader
    for line in file:
        city = line[:-1].replace(',', '-').replace(' ', '-')
        college_file_list.append(city)
college_file_list = college_file_list[1:]

# Going to append this so that all the data stays in order
college_prices = pd.DataFrame(college_file_list, columns=["College"])
college_prices['Mean'] = None

options=Options()
options.add_experimental_option("detach", True)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=options )

i = 1
print("Firing up for loop")
for city in college_file_list:
    url = "https://www.apartments.com/" + city + "/max-1-bedrooms/"
#     print(url)
    driver.get(url)

    html = driver.page_source
    time.sleep(2)
    town_avg = extract_average(html)
    college_prices.loc[college_prices['College'] == city, 'Mean'] = town_avg
    i= i+1
    time.sleep(1)
    if i > 215:
        break

driver.close()

Firing up for loop


In [4]:
college_prices
# college_prices.to_csv('collegePrices.csv', index=False)

Unnamed: 0,College,Mean
0,Auburn-AL,1275.409091
1,Jacksonville-AL,
2,Livingston-AL,
3,Montevallo-AL,1291.97412
4,Troy-AL,
...,...,...
980,-,3075.755255
981,-,3075.755255
982,-,3075.755255
983,-,3075.755255


In [6]:
temp = college_prices.iloc[:213]

In [8]:
temp.to_csv('collegePrices.csv', index=False)

In [14]:
temp.count()

College    213
Mean        85
dtype: int64