In [None]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Mount Google Drive at /content/drive so the output CSV files can be written there.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm
# before running this script elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Start with an empty DataFrame; the rows scraped for each year/page are
# concatenated onto it further down in the script.
combined_df = pd.DataFrame()

# Function to check whether the page links to a second page (returns a bool, not the link itself)
def has_next_page(soup):
    """
    Report whether the parsed page contains a 'Go to Page 2' anchor.

    :param soup: Parsed HTML content of the webpage (must provide ``find_all``).
    :return: True if at least one ``<a>`` element whose string is
        'Go to Page 2' is found, False otherwise.
    """
    # An empty result is falsy, so bool() is equivalent to a "length > 0" test.
    return bool(soup.find_all('a', string='Go to Page 2'))

# Loop through all the years from 1934 to 2017.
# The per-page frames are gathered in a list and concatenated once after the
# loop: calling pd.concat on every iteration re-copies all previously scraped
# rows each time. `combined_df` is the accumulator initialised before this loop.
page_frames = []
for year in range(1934, 2018):
    # Page 1 has no suffix; the optional page 2 uses the suffix 'b'
    for page_suffix in ('', 'b'):
        # URL of the webpage to scrape for the specific year and page
        url = f"https://www.totalmotorcycle.com/MotorcycleFuelEconomyGuide/{year}{page_suffix}-MPG?d=1"

        # A timeout stops one stalled connection from hanging the whole run,
        # and a failed request skips this page instead of aborting the scrape.
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Request failed for year {year}{page_suffix} ({exc}), skipping...")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the table using the specified selector, falling back to the
        # first table with border="4"
        table = soup.select_one('#Layer3 > div:nth-child(8) > table')
        if table is None:
            table = soup.find('table', {'border': '4'})

        # If table is still not found, skip this page
        if table is None:
            print(f"Table not found for year {year}{page_suffix}, skipping...")
            continue

        # Extract the text of every cell, row by row. Rows without any
        # <td>/<th> cell carry no data and are dropped so they do not turn
        # into all-empty records.
        rows = [
            [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
            for tr in table.find_all('tr')
        ]
        rows = [cells for cells in rows if cells]

        # Without at least a header row there is nothing to build a frame from
        if not rows:
            print(f"Table is empty for year {year}{page_suffix}, skipping...")
            continue

        # First row is the header, the remaining rows are the data
        df_year = pd.DataFrame(rows[1:], columns=rows[0])

        # Add a new column for the year and page (page 2 rows get e.g. '1990b')
        df_year['Year'] = f"{year}{page_suffix}"

        page_frames.append(df_year)

        # After the first page, stop early when it does not link to a second page
        if page_suffix == '' and not has_next_page(soup):
            break

# Append everything that was scraped to the combined DataFrame in one step
if page_frames:
    combined_df = pd.concat([combined_df, *page_frames], ignore_index=True)

# Remove duplicate rows from the combined DataFrame
combined_df.drop_duplicates(inplace=True)

# Directory (including the trailing slash) that receives the output files
file_path = '/content/drive/MyDrive/Master Paper/Data/'

# Create the output directory if it is missing, so the results of the long
# scrape are not lost to a failed write
os.makedirs(file_path, exist_ok=True)

# Save the combined DataFrame to a CSV file
output_file = file_path + 'motorcycle_fuel_economy_1934_to_2017.csv'
combined_df.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")

# Rename columns to meaningful Lithuanian names
combined_df = combined_df.rename(columns={
    'Year': 'metai',
    'Manufacturer': 'cleaned_marke',
    'Model': 'modelis',
    'Engine Size (cc) /Cyl': 'variklio_turis_ir_cilindrai',
    'Average MPG': 'vidutines_mpg',
    'Average L/100km': 'vidutines_l_100km',
    'Source': 'saltinis'
})

# NOTE(review): 'Engine Siz(cc) /Cyl' looks like a misspelled header on some
# scraped page(s) — confirm against the raw CSV. When it is present, its
# values fill the gaps in the correctly named column so those rows keep their
# engine data instead of losing it when the misspelled column is dropped.
misspelled_engine_col = 'Engine Siz(cc) /Cyl'
if misspelled_engine_col in combined_df.columns:
    combined_df['variklio_turis_ir_cilindrai'] = (
        combined_df['variklio_turis_ir_cilindrai'].fillna(combined_df[misspelled_engine_col])
    )

# Split the 'variklio_turis_ir_cilindrai' column into 'variklio_turis_cm3' and 'cilindrai'.
# n=1 caps the result at two parts even if a value holds a second '/', and
# reindex guarantees two columns even if no value holds a '/' at all; either
# case would otherwise make the two-column assignment raise.
engine_parts = (
    combined_df['variklio_turis_ir_cilindrai']
    .str.split('/', n=1, expand=True)
    .reindex(columns=[0, 1])
)
combined_df[['variklio_turis_cm3', 'cilindrai']] = engine_parts

# Clean up fuel consumption columns by removing units
combined_df['vidutines_l_100km'] = combined_df['vidutines_l_100km'].str.replace(' L/100km', '', regex=False)
combined_df['vidutines_mpg'] = combined_df['vidutines_mpg'].str.replace(' MPG', '', regex=False)

# Drop the source engine columns now that they have been split;
# errors='ignore' tolerates datasets where the misspelled column never appeared
combined_df = combined_df.drop(
    columns=['variklio_turis_ir_cilindrai', misspelled_engine_col],
    errors='ignore',
)

# Save the cleaned combined DataFrame to a CSV file
output_file = file_path + 'moto_makers_models_fuel_consumptions_cleaned.csv'
combined_df.to_csv(output_file, index=False)

print(f"Cleaned data saved to {output_file}")
# Display the cleaned DataFrame
print(combined_df.head())



