In [None]:
%pip install beautifulsoup4
%pip install requests



In [None]:
import requests
import pandas as pd

from bs4 import BeautifulSoup

This is the prefix of every request.

In [None]:
# Base URL of the site (note the trailing slash); the scraping cell prepends it
# to the relative paths of the archive pages.
prefix = 'https://www.ilmeteo.it/'

Define the web scraping logic. Particular actions are explained via code comments.

In [None]:
# Scrape the monthly archive pages one after the other, starting from
# January 1973 and following each page's "next month" link, until the link
# points to a page of `final_year`. Every table row becomes one record in
# `scraped_data`: the three selected features plus the page's month and year.
final_year = 2025
current_year = 1973
scraped_data = []

# Column positions (inside each table row) of the features to keep; the
# DataFrame built from scraped_data labels them temp_avg, wind_velocity and
# humidity.
idxs = [1, 5, 6]

next_month_link = prefix + 'portale/archivio-meteo/Bari/1973/Gennaio'

# `<` instead of `!=` so the loop also stops if the year parsed from a link
# ever jumps past final_year.
while current_year < final_year:
  # Generate the request; the timeout keeps a stalled connection from
  # hanging the notebook indefinitely.
  response = requests.get(next_month_link, timeout=30)

  if response.status_code != 200:
    print(f'Error: {response}')
    break

  soup = BeautifulSoup(response.content, 'html.parser')

  # Find the table holding the data
  table = soup.find('table', id='table-meteo-archivio')
  if table is None:
    print(f'Error: data table not found in {next_month_link}')
    break
  rows = table.find_all('tr')

  # The first table of the page is the calendar header: it holds both the
  # navigation links and the month/year label of the current page.
  calendar = soup.find_all('table')[0]

  # Get the link related to the next month in the calendar page. The href
  # starts with '/', so both sides are stripped to avoid a '//' in the URL.
  next_href = calendar.find_all('a')[1]['href']
  next_month_link = prefix.rstrip('/') + '/' + next_href.lstrip('/')
  print(f"Link of the next page: {next_month_link}")

  # Get the date shown in the current page and extract month and year
  date = calendar.find_all('td')[2].contents[0].split(' ')
  month_page = date[0]
  year_page = int(date[1])
  print(f'\tCurrent scraped month {month_page} of year {year_page}')

  # The first row is skipped (it is not a data row); every other row
  # contributes the leading token of each selected cell.
  for row in rows[1:]:
    cells = row.find_all('td')
    elements = [cells[idx].contents[0].split(' ')[0] for idx in idxs]
    elements.append(month_page)
    elements.append(year_page)
    scraped_data.append(elements)

  # current_year is the year found in the link of the next page; it is
  # updated once per page and stops the while loop when it reaches the
  # defined limit.
  current_year = int(next_month_link.split('/')[-2])

Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Febbraio
	Current scraped month GENNAIO of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Marzo
	Current scraped month FEBBRAIO of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Aprile
	Current scraped month MARZO of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Maggio
	Current scraped month APRILE of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Giugno
	Current scraped month MAGGIO of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Luglio
	Current scraped month GIUGNO of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Agosto
	Current scraped month LUGLIO of year 1973
Link of the next page: https://www.ilmeteo.it//portale/archivio-meteo/Bari/1973/Settemb

In [None]:
# Build the raw table: one record per scraped row, with the three features
# followed by the month and year of the page they came from.
cols = ['temp_avg', 'wind_velocity', 'humidity', 'month', 'year']

df = pd.DataFrame(scraped_data, columns=cols)
# The site marks missing readings with '-'; turn them into proper NA values.
df = df.replace('-', pd.NA)

Delete missing temperature values.

In [None]:
# Keep only the rows that have a temperature reading, then convert the column
# to float. Building a new frame with astype (instead of assigning through
# `.loc[:, ...]` on a filtered frame) guarantees the column really ends up
# with a float dtype and avoids SettingWithCopyWarning.
df = df.dropna(subset=['temp_avg']).astype({'temp_avg': float})

Now we want to aggregate by month. The risk of aggregating is that the chronological order of the time series gets lost, since we do not have a proper datetime attribute. The dictionary below maps each month name to a unique numeric identifier, which is used to sort the data after the aggregation.

In [None]:
# Map each upper-case Italian month name to its calendar number (1-12), so
# that months can be sorted chronologically instead of alphabetically.
months_dict = {
    name: number
    for number, name in enumerate(
        (
            'GENNAIO', 'FEBBRAIO', 'MARZO', 'APRILE',
            'MAGGIO', 'GIUGNO', 'LUGLIO', 'AGOSTO',
            'SETTEMBRE', 'OTTOBRE', 'NOVEMBRE', 'DICEMBRE',
        ),
        start=1,
    )
}

Group the data by year and month, averaging the temperature within each group.

In [None]:
# Average temperature of every (year, month) pair, rounded to two decimals.
df_grouped = (
    df.groupby(['year', 'month'])['temp_avg']
    .mean()
    .reset_index()
    .assign(temp_avg=lambda frame: frame['temp_avg'].astype(float).round(2))
)

Order the groups by year and month.

In [None]:
# Attach the numeric month identifier and use it, together with the year, to
# restore the chronological order lost by grouping on month names.
df_grouped = (
    df_grouped
    .assign(month_id=df_grouped['month'].map(months_dict))
    .sort_values(by=['year', 'month_id'])
)
df_grouped

Unnamed: 0,year,month,temp_avg,month_id
4,1973,GENNAIO,9.43,1
3,1973,FEBBRAIO,8.44,2
8,1973,MARZO,8.91,3
1,1973,APRILE,11.68,4
7,1973,MAGGIO,18.18,5
...,...,...,...,...
612,2024,AGOSTO,27.45,8
623,2024,SETTEMBRE,22.53,9
622,2024,OTTOBRE,19.32,10
621,2024,NOVEMBRE,13.17,11


In [None]:
# Reshape to wide format: one row per year, one column per month_id (1-12),
# each cell holding that month's average temperature. reset_index turns
# 'year' back into a regular first column.
df_grouped_piv = df_grouped.pivot_table(
    values='temp_avg',
    index='year',
    columns='month_id',
).reset_index()

In [None]:
# Display the pivoted table (year in the first column, then months 1-12).
df_grouped_piv

month_id,year,1,2,3,4,5,6,7,8,9,10,11,12
0,1973,9.43,8.44,8.91,11.68,18.18,21.82,24.69,23.19,21.89,18.0,12.11,9.77
1,1974,9.92,10.41,10.99,12.62,17.26,21.88,24.59,24.79,22.07,14.82,11.86,9.61
2,1975,8.25,7.82,11.95,13.53,18.18,21.36,23.85,23.59,22.6,17.16,12.16,9.95
3,1976,7.79,9.24,9.46,12.88,17.07,20.88,23.38,21.37,19.82,17.93,13.03,10.27
4,1977,9.95,11.96,13.26,13.88,18.56,21.46,25.08,23.99,19.76,16.63,13.22,9.48
5,1978,9.22,9.63,11.46,12.82,16.55,21.9,23.74,23.75,19.99,15.72,10.09,11.36
6,1979,7.88,10.15,12.39,12.34,17.67,22.79,24.65,23.37,19.92,18.07,12.64,10.99
7,1980,8.24,9.2,11.15,11.82,15.46,20.79,23.99,24.67,21.24,17.02,13.26,8.59
8,1981,6.97,7.58,12.07,14.48,18.25,22.74,23.95,24.42,22.13,18.17,10.57,10.49
9,1982,9.36,8.15,9.9,12.53,17.8,24.02,24.97,24.69,22.75,17.92,12.96,11.06


In [None]:
# Save the pivoted table (tab-separated). It gets its own file name because
# 'weather_data_aggregated.csv' is also the target of the final export of
# df_grouped, which would overwrite this file.
df_grouped_piv.to_csv('weather_data_pivoted.tsv', sep='\t')

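Calculate the mean temperature of each year (a mean across the twelve monthly columns of each row) and the average temperature over the reference interval [1996-2006]. Note that the code selects the interval by row position (`iloc[8:16]`), so check that those rows correspond to the intended years.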

In [None]:
# Reference mean temperature; the name suggests the 1951-1980 period
# (TODO confirm the source of this value).
mean_51_80 = 15.85

# Columns 1..12 are the monthly averages (column 0 is 'year'); averaging
# across them gives one mean temperature per row, i.e. per year.
months_mean = df_grouped_piv.iloc[:, 1:13].mean(axis=1)

# Mean of the monthly column means over the rows at positions 8..15.
# NOTE(review): rows are picked by position, not by year; with data starting
# in 1973 these rows would not be 1996-2006 - confirm the intended interval.
interval_mean = df_grouped_piv.iloc[8:16, 1:13].mean(axis=0).mean()

In [None]:
# Midpoint between the fixed reference mean and the interval mean computed
# from the scraped data.
(mean_51_80 + interval_mean) / 2

15.835364583333334

Put the fluctuations inside the dataframe, and retrieve the positive and negative ones.

In [None]:
# Deviation of each row's mean temperature from the interval mean, rounded
# to two decimals.
df_grouped_piv['fluctuations'] = (months_mean - interval_mean).round(2)

# Split the deviations by sign with vectorised clipping: values on the wrong
# side of zero become 0. Unlike an element-wise lambda, clip always keeps a
# float dtype and leaves a missing value as NaN instead of turning it into 0.
df_grouped_piv['pos_fluctuations'] = df_grouped_piv['fluctuations'].clip(lower=0)
df_grouped_piv['neg_fluctuations'] = df_grouped_piv['fluctuations'].clip(upper=0)

Get and save fluctuations data.

In [None]:
# Keep only the year and the sign-separated fluctuations, then export them
# as a comma-separated file.
fluctuations_data = df_grouped_piv.loc[
    :, ['year', 'pos_fluctuations', 'neg_fluctuations']
]
fluctuations_data.to_csv('fluctuations_data.csv', sep=',')

Save the DataFrame to a CSV file.

In [None]:
# Export the long-format table (one row per year/month with its average
# temperature and month_id) as a comma-separated file.
df_grouped.to_csv('weather_data_aggregated.csv', sep=',')