# Palermo Forecast
### Procedures:
- Web scraping
- Cleaning data
- Data plot
- Machine Learning
- Tunning ML

## 1. Packages

In [7]:
import requests
import pandas as pd
import io
import os
import glob
from detect_delimiter import detect
from datetime import date


## 2. Web scraping 

The following code will extract the data in cvs format from the website "https://www.ilmeteo.it/portale/archivio-meteo/Palermo" from January 1973 to actual data. All cvs files will be extracted and stored locally. 

In [15]:
months=['Gennaio','Febbraio','Marzo','Aprile','Maggio','Giugno','Luglio','Agosto','Settembre','Ottobre','Novembre','Dicembre']
today = date.today()
for year in range(2022,(today.year-1)):
    for month in months:
        url="https://www.ilmeteo.it/portale/archivio-meteo/Palermo/"+str(year)+"/"+month+"?format=csv"
        response = requests.get(url, stream=True).content
        df_response=pd.read_csv(io.StringIO(response.decode('utf-8')), sep=';')
        df_response.to_csv('Palermo-'+str(year)+'-'+month+'.csv')
        

In [16]:
months=list(calendar.month_name[1:today.month-1])
print(months)
year= today.year
for month in months:
    url="https://www.ilmeteo.it/portale/archivio-meteo/Palermo/"+str(year)+"/"+month+"?format=csv"
    response = requests.get(url, stream=True).content
    print(io.StringIO(response.decode('utf-8')))
    df_response=pd.read_csv(io.StringIO(response.decode('utf-8')), sep=';')
    df_response.to_csv('Palermo-'+str(year)+'-'+month+'.csv')

  

['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September']
<_io.StringIO object at 0x00000254BCE748B0>


ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 2


## 3. Mearging data
Find the name of all csv files and combine the information into a single data frame. I also included the files from 2023

In [None]:
csv_files = glob.glob('*.{}'.format('csv'))
df_all= pd.DataFrame()

for file in csv_files:
    df_temp= pd.read_csv(file)
    df_all= pd.concat(df_all,df_temp, ignore_index=True)
    
print(df_all)

## 4. Cleaning data

In [None]:
weather= df_all.copy()
weather.apply(pd.isnull).sum()

core_weather=weather[['TMEDIA °C','TMIN °C','TMAX °C','PIOGGIA mm']].copy()

core_weather[pd.isnull(core_weather['TMEDIA °C'])]
core_weather=core_weather.fillna(method='ffill')

core_weather.index=pd.to_datetime(core_weather.index)
core_weather['TMEDIA °C']=pd.to_numeric(core_weather['TMEDIA °C'])
core_weather['TMIN °C']=pd.to_numeric(core_weather['TMIN °C'])
core_weather['TMAX °C']=pd.to_numeric(core_weather['TMAX °C'])
core_weather['PIOGGIA mm']=pd.to_numeric(core_weather['PIOGGIA mm'])

## 5. Data analysis

In [None]:
core_weather[['TMIN °C','TMAX °C']].plot()
core_weather.index.year.value_counts().sort_index()

core_weather["target"]= core_weather.shift(-1)['TMEDIA °C']
core_weather= core_weather.iloc[:-1,:].copy()

## 6. Machine Learning

In [None]:
from sklearn.linear_model import Ridge

def create_predictions(predictors, core_weather, reg):
    train = core_weather.loc[:"2022-12-31"]
    test = core_weather.loc["2023-01-01":]
    reg.fit(train[predictors], train["target"])
    Ridge(alpha=0.1)
    predictions = reg.predict(test[predictors])
    error= mean_absolute_error(test["target"], predictions)
    combined= pd.concat([test["target"],pd.Series(predictions,index=test.index)],axis=1)
    combined.columns=["actual","predictions"]
    return error, combined


reg= Ridge(alpha=.1)

predictors = ['TMEDIA °C','TMIN °C','TMAX °C','PIOGGIA mm']

error, combined = create_predictions(predictors, core_weather, reg)
print(error)

## 7. Tunning ML with extra data

In [None]:
core_weather["month max"]= core_weather['TMEDIA °C'].rolling(30).mean()
core_weather= core_weather.iloc[30:,:].copy()
core_weather["min_max"]= core_weather['TMIN °C']/ core_weather['TMAX °C']
core_weather['monthly_avg']= core_weather['TMEDIA °C'].groupby(core_weather.index.month).apply(lambda x: x.expanding(1).mean())
predictors=   ['TMEDIA °C','TMIN °C','TMAX °C','PIOGGIA mm','month max','monthly_avg']
error, combined = create_predictions(predictors, core_weather, reg)
print(error)

## 8. Variables analysis

In [None]:
reg.coef_

core_weather.corr()["target"]
combined["diff"]=(combined["actual"]-combined["predictions"]).abs()

combined.sort_values("diff",ascending=False).head()
