In [194]:
import pdfplumber
import pandas as pd
import os
import re
# Upper-case state names, listed alphabetically and grouped by initial letter.
state_names = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS",
    "CALIFORNIA", "COLORADO", "CONNECTICUT",
    "DELAWARE", "FLORIDA", "GEORGIA", "HAWAII",
    "IDAHO", "ILLINOIS", "INDIANA", "IOWA",
    "KANSAS", "KENTUCKY", "LOUISIANA",
    "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN",
    "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA",
    "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY",
    "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA",
    "OHIO", "OKLAHOMA", "OREGON", "PENNSYLVANIA",
    "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA",
    "TENNESSEE", "TEXAS", "UTAH",
    "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING",
]

# Title-case names used for the 2020 and 2021 PDFs, whose layout differs a lot
# from the other years. This list also contains 'District of Columbia'.

sm_state_names = [
    'Alaska', 'Alabama', 'Arkansas', 'Arizona',
    'California', 'Colorado', 'Connecticut',
    'District of Columbia', 'Delaware',
    'Florida', 'Georgia', 'Hawaii',
    'Iowa', 'Idaho', 'Illinois', 'Indiana',
    'Kansas', 'Kentucky', 'Louisiana',
    'Massachusetts', 'Maryland', 'Maine', 'Michigan',
    'Minnesota', 'Missouri', 'Mississippi', 'Montana',
    'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
    'New Jersey', 'New Mexico', 'Nevada', 'New York',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
    'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah',
    'Virginia', 'Vermont', 'Washington', 'Wisconsin', 'West Virginia', 'Wyoming',
]
                     
# Hand-corrected rows for lines of the 2020 PDF that are misread on extraction.
# Each entry is (state name in upper case, substring identifying the misread
# line, corrected line). Entries are tried in order and the first match wins.
_LINE_FIXES_2020 = (
    ('DELAWARE', 'AL D', 'AL Blunt Rochester D + + + + + + + + + + + + - + + + + - + + 90'),
    ('SOUTH CAROLINA', '3 R', '3 Duncan Jeff R - - - - - - - - - - - - + - X - - - - - 5'),
    ('OHIO', '14 R', '14 Joyce David R - + - - + - - - - - - + - - - - - - - - 15'),
    ('FLORIDA', '23 D', '23 Wasserman Schultz D + + + + + + + + + + + + - + + + - - + + 85'),
    ('CALIFORNIA', '5 D', '5 Thompson D + + + + + + + + + + + + - + + + + - + + 90'),
    ('CALIFORNIA', '40 D', '40 Roybal-Allard D + + + + + + + + + + + + - X + + - - + + 80'),
    ('CALIFORNIA', '43 D', '43 Waters Maxine D + + + + + + + + + + + X + + + + + + + + 95'),
    ('FLORIDA', '26 D', '26 Mucarsel Powell D + + + + + + + + + + + + - + + + - - + + 85'),
    ('MASSACHUSETTS', '4 D', '4 Kennedy Joseph P. D + + + + + + + + + + + + + + + + + + + + 100'),
    ('PENNSYLVANIA', '15 R', '15 Thompson G. R - - - - + - - - - - - - - - - - - - - - 5'),
    ('NEBRASKA', '3 R', '3 Smith Adrian R - - - - + - - - - - - - - - - - - - - - 5'),
    ('NEW JERSEY', '12 D', '12 Watson Coleman D + + + + + + + + + + + + + + + + + + + + 100'),
    ('NEW MEXICO', '2 D', '2 Torres Small D + + + + + + + + + + + + - - + + - - + + 80'),
    ('NEW YORK', '14 D', "14 Ocasio Cortez D + + + + + + + + + + + + + + + + + + + + 100"),
    ('NEW YORK', '18 D', '18 Maloney S.P. D + + + + + + + + + + + + - + + + - - + + 85'),
    ('WASHINGTON', '3 R', '3 Herrera Beutler R - + - - + - - - - - - X + - - - - - + - 20'),
    ('TEXAS', '30 D', '30 Johnson E.B. D + + + + + + + + + + + + - + + + + + + + 95'),
    ('MISSISSIPPI', '2 D', '2 Thompson B. D + + + + + + + + + + - + - + + + + + + + 90'),
)


def extract_table_from_pdf_plumber(pdf_path):
    """Parse the per-state score table of one PDF into a DataFrame.

    Pages are scanned in order. The table is taken to start on the last page
    that mentions Alabama at or before the first page that mentions Wyoming,
    and to end on that Wyoming page. The text of those pages is then parsed
    line by line with three layout-specific heuristics (2010-2019, 2020, 2021).

    Parameters
    ----------
    pdf_path : str
        Path to the PDF. The part of the file name before the first '.' is
        stored verbatim in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Seat``, ``State``, ``Name``, ``LQ Score`` (int, 0 when no
        digits were found) and ``Year``. A column-less empty DataFrame is
        returned when no Alabama/Wyoming page range is found.
    """
    with pdfplumber.open(pdf_path) as pdf:
        start_page = None
        end_page = None
        page_texts = []  # extracted once per page and reused below
        for i, page in enumerate(pdf.pages):
            # Guard against a None result for a page without extractable text
            # (presumably version-dependent in pdfplumber -- TODO confirm).
            page_text = page.extract_text() or ""
            page_texts.append(page_text)
            if "ALABAMA" in page_text or "Alabama" in page_text:
                start_page = i
            if "WYOMING" in page_text or "Wyoming" in page_text:
                end_page = i
                break

        # If start or end page is not found, return an empty DataFrame
        if start_page is None or end_page is None:
            return pd.DataFrame()

        # Join pages with a newline so the last line of one page is not fused
        # with the first line of the next one.
        text = "\n".join(page_texts[start_page:end_page + 1])

    lines = text.split('\n')

    # One (seat, state, name, raw score) tuple per parsed row.
    rows = []
    current_state = None

    # `lines` is deliberately extended while iterating: pieces inserted after
    # the current position are visited by later iterations. enumerate() gives
    # the true position of the current line even when the same text occurs
    # more than once.
    for index, line in enumerate(lines):

        # 2010-2019: a line containing an upper-case state name is a heading.
        if any(state in line for state in state_names):
            current_state = line.replace("(cont.)", "").strip()

        # 2021: several "...%" records on one physical line.
        if line.count('%') > 1:
            pieces = line.split('%')
            # NOTE(review): the first piece is written back to a slot the loop
            # has already passed, so only the second piece is parsed later, and
            # any third piece is dropped. Looks unintended -- confirm against
            # the 2021 PDF before changing.
            lines[index] = pieces[0] + '%'
            lines.insert(index + 1, pieces[1] + '%')

        elif any(state in line for state in sm_state_names):
            if '%' in line:
                if '1' in line:
                    # State name and seat-1 record share a line: split them.
                    # NOTE(review): raises IndexError when the line has '1' but
                    # not '1 ' (unchanged from the original behaviour).
                    pieces = line.split('1 ')
                    first_record = "1 " + pieces[1]
                    lines[index] = pieces[0]
                    lines.insert(index + 1, first_record)
                    current_state = pieces[0].upper()
                else:
                    # Queue the text up to '%' and the remainder as two new
                    # lines after this one.
                    # NOTE(review): if the text before '%' still contains a
                    # state name, this branch re-queues it forever.
                    pieces = line.split('%')
                    lines.insert(index + 1, pieces[0] + '%')
                    lines.insert(index + 2, pieces[1])

            elif 'Congressional' in line:
                continue

            else:
                current_state = line.upper()

        # 2010-2019 rows: "<seat> <name...> <score>%".
        elif any(char.isdigit() for char in line) and "%" in line:
            parts = line.split()
            rows.append((parts[0], current_state, ' '.join(parts[1:-1]), parts[-1]))

        # 2020: swap known-misread lines for hand-corrected ones.
        # current_state is always upper-cased by the branches above, so compare
        # in upper case; the original title-case comparison could never match.
        # Lines with '%' are skipped: they may already have been recorded above
        # and the corrected text would add a second row.
        if current_state is not None and '%' not in line:
            state_key = current_state.strip().upper()
            for fix_state, marker, corrected in _LINE_FIXES_2020:
                if state_key == fix_state and marker in line:
                    line = corrected
                    break

        # 2020 rows: "<seat> <name...> <R|D> <votes...> <score>", no '%'.
        if any(char.isdigit() for char in line) and (" R " in line or " D " in line) and "%" not in line:
            if current_state is None:
                continue
            parts = line.split()
            rows.append((parts[0], current_state, ' '.join(parts[1:-1]), parts[-1]))

    df = pd.DataFrame(rows, columns=['Seat', 'State', 'Name', 'LQ Score'])

    # Extract the year from the file name
    year = os.path.basename(pdf_path).split('.')[0]
    df['Year'] = year

    # Keep the first run of digits of the score; rows without digits become 0.
    df['LQ Score'] = df['LQ Score'].str.extract(r'(\d+)', expand=False).fillna(0).astype(int)

    # Keep only letters, whitespace and '.' in names; this also strips every
    # 'X' (the absent-vote marker), apostrophes and hyphens.
    df['Name'] = df['Name'].str.replace(r'[^a-zA-Z\s.]|X', '', regex=True).str.strip()
    df['Name'] = df['Name'].str.replace('A Newsletter for Liberal', '', regex=False).str.strip()

    return df

# Example usage: see the next cell.


In [196]:
# Run the extractor over the yearly PDFs and show the combined table.
base_path = "/Users/jorgebruno/Downloads/Learning Metrics/downloaded_pdfs"

# One "<year>.pdf" path per year; range(2020, 2021) covers only 2020.
pdf_paths = [
    os.path.join(base_path, f"{year}.pdf")
    for year in range(2020, 2021)
]

# Collect one DataFrame per PDF, in the same order as pdf_paths.
dfs = []
for pdf_path in pdf_paths:
    year_df = extract_table_from_pdf_plumber(pdf_path)
    dfs.append(year_df)

# Stack the per-year frames under a fresh 0..n-1 index.
final_df = pd.concat(dfs, ignore_index=True)

# Raise the display row limit to 1000 before showing the frame.
pd.set_option("display.max_rows", 1000)
final_df



Unnamed: 0,Seat,State,Name,LQ Score,Year
0,1,ALABAMA,Byrne R,0,2020
1,2,ALABAMA,Roby R,5,2020
2,3,ALABAMA,Rogers M. R,5,2020
3,4,ALABAMA,Aderholt R,0,2020
4,5,ALABAMA,Brooks M. R,10,2020
5,6,ALABAMA,Palmer R,0,2020
6,7,ALABAMA,Sewell D,85,2020
7,AL,ALASKA,Young Don R,35,2020
8,1,ARIZONA,OHalleran D,85,2020
9,2,ARIZONA,Kirkpatrick D,80,2020
