In [31]:
from sqlalchemy import create_engine, MetaData, Table, select, Column, Integer, String, Text, DECIMAL, DATE, DateTime
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import re
import json

# --- Database wiring --------------------------------------------------------
# NOTE(review): the connection URL embeds a user name and password in source;
# consider loading it from configuration or the environment instead.
_DB_URL = 'mysql+mysqlconnector://kurt712:Gwyn-072022!@localhost/ncgov'
engine = create_engine(_DB_URL)
metadata = MetaData()

# The live 'greatschools' table is reflected from the database, so its columns
# are whatever the database currently defines.
greatschools_table = Table('greatschools', metadata, autoload_with=engine)

# Column names of the history table, grouped by the SQL type they share.
_HISTORY_COUNT_COLUMNS = (
    'students',
    'preschools',
    'elementary_schools',
    'middle_schools',
    'high_schools',
    'total_schools',
)
_HISTORY_RATING_COLUMNS = (
    'below_average',
    'below_average_state',
    'average',
    'average_state',
    'above_average',
    'above_average_state',
)

# The 'greatschools_history' table is declared explicitly; it receives rows
# moved out of 'greatschools' together with the time of the move.
greatschools_history_table = Table(
    'greatschools_history',
    metadata,
    Column('history_id', Integer, primary_key=True, autoincrement=True),
    Column('county_ID', String(4)),
    Column('school_url', Text),
    *(Column(column_name, Integer) for column_name in _HISTORY_COUNT_COLUMNS),
    *(Column(column_name, DECIMAL(5, 2)) for column_name in _HISTORY_RATING_COLUMNS),
    Column('greatschools_update', DATE),
    Column('history_move_timestamp', DateTime),
)

# Emit CREATE TABLE for any table declared above that does not exist yet.
metadata.create_all(engine)

# Fetch every county's ID together with its GreatSchools URL from the database
with engine.connect() as connection:
    # `autoload_with` alone triggers reflection (as for the 'greatschools'
    # table above); the separate `autoload=True` flag is deprecated in
    # SQLAlchemy 1.4 and no longer accepted in 2.0.
    counties = Table('counties', metadata, autoload_with=engine)
    query = select([counties.columns.county_ID, counties.columns.greatschools_org])
    ResultProxy = connection.execute(query)
    # List of (county_ID, url) tuples consumed by the scraping loop below
    URL_list = [(county_ID, url) for county_ID, url in ResultProxy.fetchall()]

# Define the table-update helpers
def _values_differ(new, old):
    """Return True when a scraped value genuinely differs from the stored one.

    DECIMAL columns come back from the database as ``Decimal`` while scraped
    ratios are floats, and ``0.13 != Decimal('0.13')`` is True in Python.  The
    scraped number is therefore rounded to the stored value's precision before
    the two are compared.
    """
    from decimal import Decimal, InvalidOperation, ROUND_HALF_UP

    is_plain_number = isinstance(new, (int, float)) and not isinstance(new, bool)
    if isinstance(old, Decimal) and is_plain_number:
        try:
            new = Decimal(str(new)).quantize(old, rounding=ROUND_HALF_UP)
        except InvalidOperation:
            # Not representable at the stored precision (e.g. infinity).
            return True
    return new != old


def update_tables(engine, new_data):
    """Upsert scraped rows into ``greatschools``, archiving replaced rows.

    For each dict in *new_data*:

    * no row with that ``county_ID`` exists -> the dict is inserted;
    * a row exists and at least one scraped value differs -> the stored row is
      copied to ``greatschools_history`` (stamped with
      ``history_move_timestamp``) and then overwritten;
    * a row exists and nothing differs -> nothing is written.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        new_data: Iterable of dicts as returned by ``scrape_data``; each must
            contain ``'county_ID'``.

    Raises:
        KeyError: If a dict has no ``'county_ID'`` entry.
    """
    # 'timestamp' is set to "now" on every scrape, so comparing it would make
    # every row look changed and flood the history table.
    volatile_keys = {'timestamp'}
    current_columns = set(greatschools_table.columns.keys())
    history_columns = set(greatschools_history_table.columns.keys())
    county_column = greatschools_table.columns.county_ID

    # engine.begin() commits on success and rolls back on error, so a history
    # insert can never be persisted without its matching update.
    with engine.begin() as conn:
        for data in new_data:
            county_id = data['county_ID']
            # Only write keys the reflected table actually has; unknown keys
            # would otherwise raise "Unconsumed column names".
            row = {key: value for key, value in data.items() if key in current_columns}

            existing = conn.execute(
                select([greatschools_table]).where(county_column == county_id)
            ).fetchone()

            if existing is None:
                conn.execute(greatschools_table.insert(), row)
                continue

            changed = any(
                _values_differ(value, existing[key])
                for key, value in row.items()
                if key not in volatile_keys
            )
            if not changed:
                continue

            # Archive the stored row, keeping only columns the history table
            # defines, then overwrite it with the fresh values.
            history_data = {
                key: existing[key] for key in existing.keys() if key in history_columns
            }
            history_data['history_move_timestamp'] = datetime.now()
            conn.execute(greatschools_history_table.insert(), history_data)
            conn.execute(
                greatschools_table.update().where(county_column == county_id),
                row,
            )

def _parse_count(text):
    """Return the leading integer of *text* (thousands commas allowed), or None."""
    parts = text.split()
    if not parts:
        return None
    try:
        return int(parts[0].replace(',', ''))
    except ValueError:
        return None


def _percent_to_fraction(raw):
    """Convert a percentage such as ``13`` to ``0.13``; None if *raw* is not numeric."""
    try:
        return float(raw) / 100.0
    except (TypeError, ValueError):
        return None


def _parse_school_counts(soup):
    """Return the per-level school counts found in the 'school-level-item' divs."""
    counts = {
        'preschools': 0,
        'elementary_schools': 0,
        'middle_schools': 0,
        'high_schools': 0,
        'total_schools': 0,
    }
    # Link labels are matched on their first five characters; "High " includes
    # the trailing space that a label's first five characters would carry.
    field_by_prefix = {
        'Presc': 'preschools',
        'Eleme': 'elementary_schools',
        'Middl': 'middle_schools',
        'High ': 'high_schools',
        'Total': 'total_schools',
    }
    for div in soup.find_all('div', class_='school-level-item'):
        school_number = div.find('span', class_='school-number')
        if school_number is None or not school_number.get_text():
            continue
        number_text = school_number.get_text().replace(',', '')  # Remove commas
        try:
            number = int(number_text)
        except ValueError:
            print(f"Error converting '{number_text}' to an integer.")
            continue

        link = div.find('a', class_='school-browse-link')
        if link is None:
            continue
        field = field_by_prefix.get(link.get_text().strip()[:5])
        if field is not None:
            counts[field] = number
    return counts


def _parse_rating_shares(soup):
    """Return the rating shares from the 'BarGraphUnified' JSON script tag."""
    shares = {
        'below_average': None,
        'below_average_state': None,
        'average': None,
        'average_state': None,
        'above_average': None,
        'above_average_state': None,
    }
    script_tag = soup.find(
        'script',
        {'class': 'js-react-on-rails-component', 'data-component-name': 'BarGraphUnified'},
    )
    # .string is None when the tag is empty or has several children.
    if script_tag is None or not script_tag.string:
        return shares
    try:
        payload = json.loads(script_tag.string)
    except json.JSONDecodeError:
        print("Error decoding JSON from script tag")
        return shares

    items = (payload.get('data') or []) if isinstance(payload, dict) else []
    for item in items:
        if not isinstance(item, dict):
            continue
        key = item.get('key')
        if key in ('below_average', 'average', 'above_average'):
            # Missing values default to 0, i.e. a share of 0.0.
            shares[key] = _percent_to_fraction(item.get('value', 0))
            shares[f'{key}_state'] = _percent_to_fraction(item.get('state_value', 0))
    return shares


def _parse_update_date(html):
    """Return the 'last updated on' date found in *html*, or None if absent/unparseable."""
    match = re.search(
        r'Summary Ratings for schools in this district were last updated on (.+?)\.',
        html,
    )
    if match is None:
        return None
    date_str = match.group(1)
    try:
        return datetime.strptime(date_str, '%B %d, %Y').date()
    except ValueError:
        print(f"Error parsing update date '{date_str}'.")
        return None


def scrape_data(url, county_ID):
    """Scrape one county's GreatSchools page into a flat dict.

    Args:
        url: Address of the county page to download.
        county_ID: Identifier copied unchanged into the result.

    Returns:
        A dict with the keys ``county_ID``, ``school_url``, ``students``,
        ``preschools``, ``elementary_schools``, ``middle_schools``,
        ``high_schools``, ``total_schools``, ``below_average``,
        ``below_average_state``, ``average``, ``average_state``,
        ``above_average``, ``above_average_state``, ``greatschools_update``
        and ``timestamp``.  Counts default to 0; the URL, the rating shares
        and the update date default to None when the page does not provide
        them.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one unresponsive server from hanging the whole run, and
    # the status check stops an error page from being stored as empty stats.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # County overview: the 2nd 'school-stats-item' div carries the student
    # count, the 4th one the link stored as school_url.
    divs_overview = soup.find_all('div', class_='school-stats-item')

    students_number = 0
    if len(divs_overview) > 1:
        students_text = divs_overview[1].get_text()
        parsed_students = _parse_count(students_text)
        if parsed_students is None:
            print(f"Error converting '{students_text.strip()}' to an integer.")
        else:
            students_number = parsed_students

    school_url = None
    if len(divs_overview) > 3:
        a_tag = divs_overview[3].find('a')
        if a_tag is not None:
            school_url = a_tag.get('href')  # None when the link has no href

    counts = _parse_school_counts(soup)
    shares = _parse_rating_shares(soup)
    greatschools_update = _parse_update_date(str(soup))

    # Return a dictionary of scraped data
    return {
        'county_ID': county_ID,
        'school_url': school_url,
        'students': students_number,
        'preschools': counts['preschools'],
        'elementary_schools': counts['elementary_schools'],
        'middle_schools': counts['middle_schools'],
        'high_schools': counts['high_schools'],
        'total_schools': counts['total_schools'],
        'below_average': shares['below_average'],
        'below_average_state': shares['below_average_state'],
        'average': shares['average'],
        'average_state': shares['average_state'],
        'above_average': shares['above_average'],
        'above_average_state': shares['above_average_state'],
        'greatschools_update': greatschools_update,
        'timestamp': datetime.now()
    }

# Scrape every county, reporting progress after each one
n = 0  # Progress counter; stays 0 when URL_list is empty
scraped_data_list = []
for n, (county_ID, url) in enumerate(URL_list, start=1):
    try:
        scraped_data = scrape_data(url, county_ID)  # Pass county_ID to the function
    except requests.RequestException as exc:
        # One unreachable or missing page must not discard the other counties.
        print(f"Skipping county {county_ID}: {exc}")
    else:
        scraped_data_list.append(scraped_data)
    print(f"Progress: {n}/{len(URL_list)}")

df = pd.DataFrame(scraped_data_list)

# update_tables() inserts new counties, archives changed rows to
# 'greatschools_history' and updates 'greatschools' in place.  A following
# df.to_sql('greatschools', if_exists='replace') would drop and recreate that
# table, discarding its primary key and column types, so the table is
# written through update_tables() only.
update_tables(engine, scraped_data_list)

Extracted: Presc, Number: 14
Extracted: Eleme, Number: 23
Extracted: Middl, Number: 11
Extracted: High , Number: 12
Extracted: Total, Number: 39
Progress: 1/100
Extracted: Presc, Number: 5
Extracted: Eleme, Number: 7
Extracted: Middl, Number: 2
Extracted: High , Number: 2
Extracted: Total, Number: 11
Progress: 2/100
Extracted: Presc, Number: 3
Extracted: Eleme, Number: 3
Extracted: Middl, Number: 3
Extracted: High , Number: 1
Extracted: Total, Number: 4
Progress: 3/100
Extracted: Eleme, Number: 6
Extracted: Middl, Number: 2
Extracted: High , Number: 4
Extracted: Total, Number: 11
Progress: 4/100
Extracted: Presc, Number: 1
Extracted: Eleme, Number: 3
Extracted: Middl, Number: 1
Extracted: High , Number: 2
Extracted: Total, Number: 6
Progress: 5/100
Extracted: Presc, Number: 5
Extracted: Eleme, Number: 5
Extracted: Middl, Number: 4
Extracted: High , Number: 4
Extracted: Total, Number: 11
Progress: 6/100
Extracted: Presc, Number: 4
Extracted: Eleme, Number: 8
Extracted: Middl, Number: 6


-1