<a href="https://colab.research.google.com/github/Faizanealiqazi/WeatherDataScrapper/blob/main/WeatherDataScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime, timedelta

In [6]:
dates = []
start_date = datetime.strptime("01-1942", "%m-%Y")
end_date = datetime.strptime("12-2023", "%m-%Y")
current_date = start_date

while current_date <= end_date:
    dates.append(current_date.strftime("%m-%Y"))
    if current_date.month == 12:
        current_date = current_date.replace(year=current_date.year + 1, month=1)
    else:
        current_date = current_date.replace(month=current_date.month + 1)

In [5]:
for date in dates:
    current_file = date+'.csv'
    print(f"Extracting for date: {date}")
    #url =''
    url = 'https://en.tutiempo.net/climate/' + date + '/ws-417800.html'
    print(f"Hitting url : {url}")

    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        style_map = soup.find_all('style')
        if len(style_map) > 1:
            style_content = str(style_map[1])
        else:
            print(f"No style map found for date: {date}")
            continue

        css_classes = re.findall(r'\.numspan span\.(nt\w+)::after\{content:"(.*?)";', style_content)
        replacement = {f'<span class="{cls}"></span>': f'<span class="{cls}">{content}</span>' for cls, content in css_classes}

        all_tables = soup.find_all('table')
        if len(all_tables) > 3:
            table = all_tables[3]
        else:
            print(f"No table found for date: {date}")
            continue

        html_str = str(table)

        for old, new in replacement.items():
            html_str = html_str.replace(old, new)

        updated_table = BeautifulSoup(html_str, 'html.parser')

        if updated_table:
            headers = [th.get_text(strip=True) for th in updated_table.find('tr').find_all('th')] if updated_table.find('tr') else []
            if not headers:
                print(f"No headers found for date: {date}")
                continue

            rows = []
            for row in updated_table.find_all('tr')[1:]:
                if row.th and row.th.has_attr('colspan'):
                    continue  # Skip this row
                cols = row.find_all('td')
                row_data = [td.get_text(strip=True) for td in cols]
                if row_data:
                    rows.append(row_data)

            df = pd.DataFrame(rows, columns=headers)
            df.to_csv(current_file, index=False)
        else:
            print(f"Could not update table for date: {date}")

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error for date {date}: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request Exception for date {date}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred for date {date}: {e}")


Extracting for date: 01-1970
Hitting url : https://en.tutiempo.net/climate/01-1970/ws-417800.html
HTTP Error for date 01-1970: 404 Client Error: Not Found for url: https://en.tutiempo.net/climate/01-1970/ws-417800.html
Extracting for date: 02-1970
Hitting url : https://en.tutiempo.net/climate/02-1970/ws-417800.html
HTTP Error for date 02-1970: 404 Client Error: Not Found for url: https://en.tutiempo.net/climate/02-1970/ws-417800.html
Extracting for date: 03-1970
Hitting url : https://en.tutiempo.net/climate/03-1970/ws-417800.html
HTTP Error for date 03-1970: 404 Client Error: Not Found for url: https://en.tutiempo.net/climate/03-1970/ws-417800.html
Extracting for date: 04-1970
Hitting url : https://en.tutiempo.net/climate/04-1970/ws-417800.html
HTTP Error for date 04-1970: 404 Client Error: Not Found for url: https://en.tutiempo.net/climate/04-1970/ws-417800.html
Extracting for date: 05-1970
Hitting url : https://en.tutiempo.net/climate/05-1970/ws-417800.html
HTTP Error for date 05-197

In [10]:
import os
df = []
merged_df = []
for date in dates:
    file_name = f"{date}.csv"
    print(f"Processing file: {file_name}")
    if not os.path.isfile(file_name):
        print(f"File {file_name} does not exist.")
        continue

    try:
        current_df = pd.read_csv(file_name)
        current_df = current_df[:-1]
        current_df['date'] = date
        merged_df.append(current_df)

    except pd.errors.EmptyDataError:
        print(f"No data in file {file_name}, skipping.")
    except pd.errors.ParserError:
        print(f"Parsing error in file {file_name}, skipping.")
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_name}: {e}")

df = pd.concat(merged_df, ignore_index=True)

Processing file: 01-1942.csv
File 01-1942.csv does not exist.
Processing file: 02-1942.csv
File 02-1942.csv does not exist.
Processing file: 03-1942.csv
File 03-1942.csv does not exist.
Processing file: 04-1942.csv
File 04-1942.csv does not exist.
Processing file: 05-1942.csv
Processing file: 06-1942.csv
Processing file: 07-1942.csv
Processing file: 08-1942.csv
Processing file: 09-1942.csv
Processing file: 10-1942.csv
Processing file: 11-1942.csv
Processing file: 12-1942.csv
Processing file: 01-1943.csv
Processing file: 02-1943.csv
Processing file: 03-1943.csv
Processing file: 04-1943.csv
Processing file: 05-1943.csv
Processing file: 06-1943.csv
Processing file: 07-1943.csv
Processing file: 08-1943.csv
Processing file: 09-1943.csv
Processing file: 10-1943.csv
Processing file: 11-1943.csv
Processing file: 12-1943.csv
Processing file: 01-1944.csv
Processing file: 02-1944.csv
Processing file: 03-1944.csv
Processing file: 04-1944.csv
Processing file: 05-1944.csv
Processing file: 06-1944.cs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_df['date'] = date


Processing file: 09-1998.csv
Processing file: 10-1998.csv
Processing file: 11-1998.csv
Processing file: 12-1998.csv
Processing file: 01-1999.csv
Processing file: 02-1999.csv
Processing file: 03-1999.csv
Processing file: 04-1999.csv
Processing file: 05-1999.csv
Processing file: 06-1999.csv
Processing file: 07-1999.csv
Processing file: 08-1999.csv
Processing file: 09-1999.csv
Processing file: 10-1999.csv
Processing file: 11-1999.csv
Processing file: 12-1999.csv
Processing file: 01-2000.csv
Processing file: 02-2000.csv
File 02-2000.csv does not exist.
Processing file: 03-2000.csv
Processing file: 04-2000.csv
Processing file: 05-2000.csv
Processing file: 06-2000.csv
Processing file: 07-2000.csv
Processing file: 08-2000.csv
Processing file: 09-2000.csv
Processing file: 10-2000.csv
Processing file: 11-2000.csv
Processing file: 12-2000.csv
Processing file: 01-2001.csv
Processing file: 02-2001.csv
Processing file: 03-2001.csv
Processing file: 04-2001.csv
Processing file: 05-2001.csv
Processing

In [15]:
df.to_csv('WeatherData1942-2023.csv', index=False)