<a href="https://colab.research.google.com/github/Faizanealiqazi/WeatherDataScrapper/blob/main/WeatherDataScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime, timedelta

In [6]:
# Build `dates`: one "MM-YYYY" label per calendar month from January 1942
# through December 2023, in chronological order.
start_date = datetime.strptime("01-1942", "%m-%Y")
end_date = datetime.strptime("12-2023", "%m-%Y")

dates = []
current_date = start_date
while not current_date > end_date:
    dates.append(current_date.strftime("%m-%Y"))
    # Step to the first day of the next month. Counting months as
    # year * 12 + month makes December roll over into January of the next year.
    next_year, next_month_index = divmod(current_date.year * 12 + current_date.month, 12)
    current_date = current_date.replace(year=next_year, month=next_month_index + 1)

In [None]:
# Scrape one monthly climate page per "MM-YYYY" label in `dates` and write each
# page's data table to "<MM-YYYY>.csv" in the working directory.
#
# Some cell values are not present as text in the HTML: the page's second
# <style> element defines `.numspan span.ntXX::after{content:"..."}` rules and
# the table contains empty <span class="ntXX"></span> placeholders. The loop
# rebuilds the text by substituting each rule's content into its placeholder
# before the table is parsed.
#
# A failure for one month (HTTP error, timeout, unexpected page layout) is
# printed and that month is skipped, so a single bad page does not stop the run.

# requests applies no timeout unless one is passed, so a stalled connection
# would otherwise block the whole loop indefinitely. A timeout raises a
# RequestException subclass, which is handled below like any other request error.
REQUEST_TIMEOUT_SECONDS = 30

for date in dates:
    current_file = date+'.csv'
    print(f"Extracting for date: {date}")
    url = 'https://en.tutiempo.net/climate/' + date + '/ws-417800.html'
    print(f"Hitting url : {url}")

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # The ::after rules are read from the second <style> element.
        style_map = soup.find_all('style')
        if len(style_map) <= 1:
            print(f"No style map found for date: {date}")
            continue
        style_content = str(style_map[1])

        # Map each empty placeholder span to the same span with its CSS content filled in.
        css_classes = re.findall(r'\.numspan span\.(nt\w+)::after\{content:"(.*?)";', style_content)
        replacement = {f'<span class="{cls}"></span>': f'<span class="{cls}">{content}</span>' for cls, content in css_classes}

        # The data table is taken to be the fourth <table> on the page.
        all_tables = soup.find_all('table')
        if len(all_tables) <= 3:
            print(f"No table found for date: {date}")
            continue

        html_str = str(all_tables[3])
        for old, new in replacement.items():
            html_str = html_str.replace(old, new)

        # A parsed BeautifulSoup object is always truthy, so there is no
        # "parse failed" case to branch on here; missing content is detected
        # through the header check below instead.
        updated_table = BeautifulSoup(html_str, 'html.parser')

        # Look the rows up once; the first row carries the column headers.
        table_rows = updated_table.find_all('tr')
        headers = [th.get_text(strip=True) for th in table_rows[0].find_all('th')] if table_rows else []
        if not headers:
            print(f"No headers found for date: {date}")
            continue

        rows = []
        for row in table_rows[1:]:
            # Rows whose header cell spans several columns are not data rows.
            if row.th and row.th.has_attr('colspan'):
                continue
            row_data = [td.get_text(strip=True) for td in row.find_all('td')]
            if row_data:
                rows.append(row_data)

        # A row whose cell count differs from the header count makes the
        # DataFrame constructor raise; that is reported by the generic handler.
        df = pd.DataFrame(rows, columns=headers)
        df.to_csv(current_file, index=False)

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error for date {date}: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request Exception for date {date}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred for date {date}: {e}")


In [None]:
import os

# Load every "<MM-YYYY>.csv" named by `dates`, tag its rows with the month
# label in a `date` column, and concatenate everything into `df`.
# Missing, empty, or unparsable files are reported and skipped.
merged_df = []  # list of per-month DataFrames, concatenated after the loop
for date in dates:
    file_name = f"{date}.csv"
    print(f"Processing file: {file_name}")
    if not os.path.isfile(file_name):
        print(f"File {file_name} does not exist.")
        continue

    try:
        current_df = pd.read_csv(file_name)
        # Drop the last row of each monthly file (presumably a monthly summary
        # row rather than a day — TODO confirm) and add the month label.
        # `.iloc[...]` followed by `.assign(...)` builds a new frame instead of
        # writing a column into a slice, which can emit SettingWithCopyWarning.
        current_df = current_df.iloc[:-1].assign(date=date)
        merged_df.append(current_df)

    except pd.errors.EmptyDataError:
        print(f"No data in file {file_name}, skipping.")
    except pd.errors.ParserError:
        print(f"Parsing error in file {file_name}, skipping.")
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_name}: {e}")

# pd.concat raises a terse ValueError on an empty list; say what actually went wrong.
if not merged_df:
    raise ValueError("No monthly CSV files could be loaded; run the scraping cell first.")

df = pd.concat(merged_df, ignore_index=True)

In [None]:
# Turn the per-month label in `date` ("MM-YYYY") into a per-day label
# ("D-MM-YYYY") by prefixing the value of the `Day` column.
day_text = df['Day'].astype(int).astype(str)
df['Day'] = day_text

# Prefix only labels that do not carry a day yet, so re-running this cell does
# not prepend the day a second time (e.g. "1-1-01-1942").
already_prefixed = df['date'].str.count('-') >= 2
df['date'] = df['date'].where(already_prefixed, day_text + '-' + df['date'])
print(df)

In [None]:
# Display the last 30 rows of the combined frame as a quick visual check.
df.tail(30)

In [None]:
# Replace the short column codes used in the scraped table with descriptive
# headers. Codes that are absent from `df` are simply ignored by `rename`;
# columns not listed here (such as `Day` and `date`) keep their names.
rename_columns = {
    # temperature
    'T': 'Average Temperature (°C)',
    'TM': 'Maximum temperature (°C)',
    'Tm': 'Minimum temperature (°C)',
    # pressure, humidity, rainfall, visibility
    'SLP': 'Atmospheric pressure at sea level (hPa)',
    'H': 'Average relative humidity (%)',
    'PP': 'Total rainfall (mm)',
    'VV': 'Average visibility (Km)',
    # wind
    'V': 'Average wind speed (Km/h)',
    'VM': 'Maximum sustained wind speed (Km/h)',
    'VG': 'Maximum speed of wind (Km/h)',
    # weather-event columns
    'RA': 'Total days it rained',
    'SN': 'Total days that snowed',
    'TS': 'Total days with thunderstorm',
    'FG': 'Total days with fog',
}
df = df.rename(columns=rename_columns)

In [23]:
# Write the combined frame to a single CSV in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('WeatherData1942-2023.csv', index=False)

In [None]:
# Bundle every CSV directly under /content into /content/WeatherData.zip.
# NOTE(review): the path is absolute, so this assumes the CSVs above were
# written to /content (the usual Colab working directory) — confirm before
# running elsewhere.
!zip -r /content/WeatherData.zip /content/*.csv