In [20]:
!pip install --user -r requirements.txt

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [21]:
import json
import os

import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from WorldWeatherPy import DetermineListOfAttributes
from WorldWeatherPy import RetrieveByAttribute


In [22]:
def parametres(out_file):
    """Write the data-download configuration to ``out_file``.

    The configuration holds the weather API key, the attributes to request,
    the weather conditions used as class labels, the list of cities, the
    sampling frequency and the date range.

    Args:
        out_file: Path of the file to (over)write.

    The dict is JSON-encoded twice (the file contains a JSON *string* that
    itself holds the JSON object). This is kept on purpose: every reader in
    this notebook decodes the file with ``json.load`` followed by
    ``json.loads``.
    """
    # NOTE(review): a credential committed to a notebook should be treated as
    # leaked. The key can now be supplied through the WWO_API_KEY environment
    # variable; the literal is only kept as a fallback so existing runs keep
    # working -- rotate it and remove the fallback once the variable is set up.
    api_key = os.environ.get('WWO_API_KEY') or 'bd35020cdd3643f4b69142436222912'

    attributes = ['date','time', 'moon_illumination',
              'tempC', 'tempF', 'windspeedMiles', 'windspeedKmph', 'winddirDegree', 'weatherCode',
              'weatherDesc', 'precipMM', 'precipInches', 'humidity', 'visibility', 'visibilityMiles',
              'pressure', 'pressureInches', 'cloudcover', 'HeatIndexC', 'HeatIndexF', 'DewPointC', 'DewPointF', 'WindChillC',
              'WindChillF', 'WindGustMiles', 'WindGustKmph', 'FeelsLikeC', 'FeelsLikeF', 'uvIndex']

    # Class labels: each downloaded `weatherDesc` is mapped onto the first of
    # these names it contains (see the download functions).
    conditions = ['Sunny','Clear','Cloudy','Rain','Snow']
    location_list = ['milan','turin','florence','bologna','rome','naples','palermo']

    frequency = 6  # presumably hours between samples -- confirm against WorldWeatherPy
    start_date = '2018-1-1'
    end_date = '2023-1-1'

    # Named `config` rather than `parametres` so the local does not shadow
    # this function's own name.
    config = {'api_key' : api_key,
              'attributes' : attributes,
              'conditions' : conditions,
              'location_list' : location_list,
              'frequency' : frequency,
              'start_date' : start_date,
              'end_date' : end_date}

    # First encoding: dict -> JSON text.
    config_json = json.dumps(config)

    # Second encoding: JSON text -> JSON string literal stored in the file.
    with open(out_file, 'w') as f:
        json.dump(config_json, f)

In [23]:
def _relabel_weather(data, conditions):
    """Map free-text ``weatherDesc`` values onto the configured condition names.

    A row receives the first entry of ``conditions`` (in list order) whose
    lower-cased name occurs in the row's lower-cased ``weatherDesc``. Rows
    matching no condition -- including rows with a missing description, which
    previously raised on ``.lower()`` -- are removed.
    """
    descriptions = data['weatherDesc'].fillna('').astype(str).str.lower()
    labels = pd.Series(index=data.index, dtype=object)  # all-missing to start with

    for condition in conditions:
        # Only rows that are still unlabelled may take this condition, which
        # preserves the "first match wins" rule.
        unlabelled = labels.isna()
        matches = descriptions.str.contains(condition.lower(), regex=False)
        labels[unlabelled & matches] = condition

    relabelled = data.assign(weatherDesc=labels)
    return relabelled.dropna(subset=['weatherDesc'])


def _download_location_data(parametres_file, location_index, data_dir='data'):
    """Download one city's weather history, clean it and store it as CSV.

    Shared implementation of the ``download_<city>_data`` functions, which were
    seven copies of the same body differing only in the ``location_list`` index.

    Args:
        parametres_file: Path of the configuration file written by ``parametres``.
        location_index: Index of the city inside ``location_list``.
        data_dir: Directory receiving ``<location>.csv``; created if missing.
    """
    with open(parametres_file) as f:
        parametres = json.load(f)

    # `parametres` stores the dict as a JSON-encoded string, so the first load
    # yields a str that must be decoded once more. A plainly encoded file
    # (already a dict) is accepted as well.
    if isinstance(parametres, str):
        parametres = json.loads(parametres)

    location = parametres['location_list'][location_index]
    csv_path = f'{data_dir}/{location}.csv'

    # DataFrame.to_csv raises OSError when the target directory is missing.
    os.makedirs(data_dir, exist_ok=True)

    dataset = RetrieveByAttribute(parametres['api_key'], parametres['attributes'], location,
                                  parametres['start_date'], parametres['end_date'],
                                  parametres['frequency']).retrieve_hist_data()
    dataset.to_csv(csv_path, encoding='utf-8', index=False)

    # Re-read the raw dump, as the original code did, so the cleaning step
    # works on exactly what was persisted.
    data = pd.read_csv(csv_path)
    data = _relabel_weather(data, parametres['conditions'])
    data = data.drop_duplicates().reset_index(drop=True)
    data.to_csv(csv_path, index=False)


def download_milan_data(parametres_file):
    """Download and clean the data of ``location_list[0]`` (milan in the default configuration)."""
    _download_location_data(parametres_file, 0)


def download_turin_data(parametres_file):
    """Download and clean the data of ``location_list[1]`` (turin in the default configuration)."""
    _download_location_data(parametres_file, 1)


def download_florence_data(parametres_file):
    """Download and clean the data of ``location_list[2]`` (florence in the default configuration)."""
    _download_location_data(parametres_file, 2)


def download_bologna_data(parametres_file):
    """Download and clean the data of ``location_list[3]`` (bologna in the default configuration)."""
    _download_location_data(parametres_file, 3)


def download_rome_data(parametres_file):
    """Download and clean the data of ``location_list[4]`` (rome in the default configuration)."""
    _download_location_data(parametres_file, 4)


def download_naples_data(parametres_file):
    """Download and clean the data of ``location_list[5]`` (naples in the default configuration)."""
    _download_location_data(parametres_file, 5)


def download_palermo_data(parametres_file):
    """Download and clean the data of ``location_list[6]`` (palermo in the default configuration)."""
    _download_location_data(parametres_file, 6)

In [30]:
def merge_data(parametres_file):
    """Download every city's data and merge the per-city CSVs into one table.

    Runs the seven ``download_<city>_data`` functions, reads back
    ``data/<location>.csv`` for every entry of ``location_list`` and writes the
    concatenation to ``data/merged_table.csv``.

    Args:
        parametres_file: Path of the configuration file written by ``parametres``.
    """
    downloaders = (
        download_milan_data,
        download_turin_data,
        download_florence_data,
        download_bologna_data,
        download_rome_data,
        download_naples_data,
        download_palermo_data,
    )
    for download in downloaders:
        download(parametres_file)

    with open(parametres_file) as f:
        parametres = json.load(f)

    # The file holds a JSON-encoded string (see `parametres`), hence the second
    # decode; an already-decoded dict is accepted too.
    if isinstance(parametres, str):
        parametres = json.loads(parametres)

    frames = [pd.read_csv(f'data/{location}.csv') for location in parametres['location_list']]

    # Build the final table with concat(); concatenating the whole list (rather
    # than data[0]..data[6]) keeps this correct for any number of locations.
    merged_table = pd.concat(frames, ignore_index=True)

    # Same replace(-0, 0) pass as before -- apparently meant to normalise
    # negative zeros -- but driven by the configured attribute list instead of
    # a second hard-coded copy of it. Attributes absent from the downloaded
    # files are skipped rather than raising KeyError.
    columns = [name for name in parametres['attributes'] if name in merged_table.columns]
    merged_table[columns] = merged_table[columns].replace(-0, 0)

    merged_table.to_csv('data/merged_table.csv', index=False)
    
    

In [31]:
def data_preprocessing():
    """Load the merged table and return standardised features and labels.

    Reads ``data/merged_table.csv``, removes the non-feature columns, separates
    the ``weatherDesc`` label, standardises the remaining columns with a
    ``StandardScaler`` and stores the fitted scaler in ``models/scaler.joblib``.

    Returns:
        tuple: ``(features, y)`` where ``features`` is a DataFrame of scaled
        columns and ``y`` is a single-column DataFrame holding ``weatherDesc``.
    """
    # Read the merged CSV file into a DataFrame.
    df = pd.read_csv('data/merged_table.csv')

    df = df.drop(columns=['date', 'time'])
    # The city could be an informative feature, since weather varies between
    # cities; the original author chose to drop it and let the model learn
    # those differences on its own. No cell in this notebook adds a 'city'
    # column, so its absence is tolerated instead of raising KeyError.
    df = df.drop(columns=['city'], errors='ignore')

    y = df[['weatherDesc']]
    x = df.drop(columns=['weatherDesc'])

    # Standardise the features.
    # NOTE(review): the scaler is fitted on the whole table before the callers
    # split it into train and test sets, so test-set statistics leak into the
    # features -- consider fitting on the training part only.
    scaler = StandardScaler()
    features = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

    # joblib.dump needs the target directory to exist.
    os.makedirs('models', exist_ok=True)
    dump(scaler, 'models/scaler.joblib')

    return features, y


In [32]:
def hyperparametres(out_file):
    """Store the model hyper-parameters in ``out_file``.

    The settings cover the tree-based models (``n_estimators``, ``bootstrap``,
    ``max_features``, ``criterion``, ``max_depth``, ``min_samples_split``,
    ``min_samples_leaf``) and the logistic regression (``penalty``, ``C``,
    ``solver``, ``max_iter``). List-valued entries hold the available choices;
    the training functions pick one of them by index.

    Args:
        out_file: Path of the file to (over)write.
    """
    settings = dict(
        n_estimators=400,
        bootstrap=True,
        max_features='sqrt',
        criterion=['gini', 'entropy'],
        max_depth=3,
        min_samples_split=2,
        min_samples_leaf=1,
        penalty=['l1', 'l2'],
        C=2.0,
        solver=['newton-cg', 'lbfgs', 'liblinear', 'sag'],
        max_iter=2000,
    )

    # The file deliberately contains a JSON string wrapping the JSON object:
    # readers decode it with json.load followed by json.loads.
    encoded_once = json.dumps(settings)
    encoded_twice = json.dumps(encoded_once)

    with open(out_file, 'w') as handle:
        handle.write(encoded_twice)
    
    

In [33]:
def decision_tree(hyperparametres_file, random_state=42):
    """Train a decision tree on the preprocessed data and return its test accuracy.

    Args:
        hyperparametres_file: Path of the file written by ``hyperparametres``.
        random_state: Seed for the train/test split and the tree, so repeated
            runs give the same accuracy. Pass ``None`` for the previous
            unseeded behaviour.

    Returns:
        Accuracy on the 30% hold-out split.

    Side effects:
        Saves the fitted model to ``models/decision_tree.joblib``.
    """
    features, y = data_preprocessing()

    with open(hyperparametres_file) as f:
        hyperparametres = json.load(f)

    # The file holds a JSON-encoded string, hence the second decode; an
    # already-decoded dict is accepted too.
    if isinstance(hyperparametres, str):
        hyperparametres = json.loads(hyperparametres)

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.3, random_state=random_state)

    criterion = hyperparametres['criterion'][1]  # entropy

    model = DecisionTreeClassifier(max_depth=hyperparametres['max_depth'],
                                   criterion=criterion,
                                   random_state=random_state)
    # `y` is a single-column DataFrame; hand the labels over as a 1-D array.
    model.fit(X_train, y_train.values.ravel())

    # Save the model (joblib.dump needs the directory to exist).
    os.makedirs('models', exist_ok=True)
    dump(model, 'models/decision_tree.joblib')

    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [34]:
def random_forest(hyperparametres_file, random_state=42):
    """Train a random forest on the preprocessed data and return its test accuracy.

    Args:
        hyperparametres_file: Path of the file written by ``hyperparametres``.
        random_state: Seed for the train/test split and the forest, so repeated
            runs give the same accuracy. Pass ``None`` for the previous
            unseeded behaviour.

    Returns:
        Accuracy on the 30% hold-out split.

    Side effects:
        Saves the fitted model to ``models/random_forest_model_entropy.joblib``.
    """
    features, y = data_preprocessing()

    with open(hyperparametres_file) as f:
        hyperparametres = json.load(f)

    # The file holds a JSON-encoded string, hence the second decode; an
    # already-decoded dict is accepted too.
    if isinstance(hyperparametres, str):
        hyperparametres = json.loads(hyperparametres)

    criterion = hyperparametres['criterion'][1]  # entropy

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.3, random_state=random_state)

    # `bootstrap`, `min_samples_split` and `min_samples_leaf` are part of the
    # configuration but were never forwarded. The fallbacks equal the
    # scikit-learn defaults, so files lacking these keys behave as before.
    model = RandomForestClassifier(
        n_estimators=hyperparametres['n_estimators'],
        max_features=hyperparametres['max_features'],
        criterion=criterion,
        max_depth=hyperparametres['max_depth'],
        bootstrap=hyperparametres.get('bootstrap', True),
        min_samples_split=hyperparametres.get('min_samples_split', 2),
        min_samples_leaf=hyperparametres.get('min_samples_leaf', 1),
        random_state=random_state,
    )
    # `y` is a single-column DataFrame; fitting on it directly makes
    # scikit-learn emit a DataConversionWarning, so pass a 1-D array.
    model.fit(X_train, y_train.values.ravel())

    # Save the model (joblib.dump needs the directory to exist).
    os.makedirs('models', exist_ok=True)
    dump(model, 'models/random_forest_model_entropy.joblib')

    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [35]:
def logistic_regression(hyperparametres_file, random_state=42):
    """Train a logistic regression on the preprocessed data and return its test accuracy.

    Args:
        hyperparametres_file: Path of the file written by ``hyperparametres``.
        random_state: Seed for the train/test split, so repeated runs evaluate
            on the same hold-out set. Pass ``None`` for the previous unseeded
            behaviour.

    Returns:
        Accuracy on the 30% hold-out split.

    Side effects:
        Saves the fitted model to ``models/logistic_regression.joblib``.
    """
    features, y = data_preprocessing()

    with open(hyperparametres_file) as f:
        hyperparametres = json.load(f)

    # The file holds a JSON-encoded string, hence the second decode; an
    # already-decoded dict is accepted too.
    if isinstance(hyperparametres, str):
        hyperparametres = json.loads(hyperparametres)

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.3, random_state=random_state)

    penalty = hyperparametres['penalty'][1]  # l2
    solver = hyperparametres['solver'][0]  # newton-cg

    model = LogisticRegression(penalty=penalty,
                               C=hyperparametres['C'],
                               solver=solver,
                               max_iter=hyperparametres['max_iter'])
    # `y` is a single-column DataFrame; fitting on it directly makes
    # scikit-learn emit a DataConversionWarning, so pass a 1-D array.
    model.fit(X_train, y_train.values.ravel())

    # Save the model (joblib.dump needs the directory to exist).
    os.makedirs('models', exist_ok=True)
    dump(model, 'models/logistic_regression.joblib')

    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [36]:
def show_results(accuracy_file, hyperparametres_file, accuracy_dc, accuracy_rf, accuracy_lr):
    """Write the three accuracies and their hyper-parameters to a file and print them.

    Args:
        accuracy_file: Path of the report to (over)write; its parent directory
            is created when missing.
        hyperparametres_file: Path of the file written by ``hyperparametres``.
        accuracy_dc: Accuracy of the decision tree.
        accuracy_rf: Accuracy of the random forest.
        accuracy_lr: Accuracy of the logistic regression.
    """
    with open(hyperparametres_file) as f:
        hyperparametres = json.load(f)

    # The file holds a JSON-encoded string, hence the second decode; an
    # already-decoded dict is accepted too.
    if isinstance(hyperparametres, str):
        hyperparametres = json.loads(hyperparametres)

    # Report the choices the training functions actually use.
    criterion = hyperparametres['criterion'][1]
    penalty = hyperparametres['penalty'][1]
    # Fix: `logistic_regression` trains with solver[0]; the report used to
    # print solver[2], i.e. a solver that was never used.
    solver = hyperparametres['solver'][0]

    report = [
        f"Decision treee(accuracy): {accuracy_dc}  max_depth: {hyperparametres['max_depth']}"
        f" criterion: {criterion}\n",
        f"Random forest(accuracy): {accuracy_rf}  n_estimators: {hyperparametres['n_estimators']}"
        f" max_features: {hyperparametres['max_features']} criterion: {criterion}\n",
        f"Logistic_regression(accuracy): {accuracy_lr}  penalty: {penalty}"
        f" C: {hyperparametres['C']} solver: {solver} max_iter: {hyperparametres['max_iter']}\n",
    ]

    # open(..., 'w') does not create missing directories.
    out_dir = os.path.dirname(accuracy_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(accuracy_file, 'w') as f:
        f.writelines(report)

    print(f"'Decision treee(accuracy): {accuracy_dc} ")
    print(f"Random forest(accuracy): {accuracy_rf} ")
    print(f"Logistic_regression(accuracy): {accuracy_lr}")

In [37]:
def classify():
    """Run the whole pipeline, from configuration to the accuracy report.

    Order of operations: write the download configuration, download and merge
    the data, write the hyper-parameters, train the decision tree, the random
    forest and the logistic regression, then hand the three accuracies to
    ``show_results``.
    """
    parametres_file, hyperparametres_file = 'parametres.txt', 'hyperparametres.txt'
    accuracy_file = 'results/accuracy.txt'

    # Data stage.
    parametres(parametres_file)
    merge_data(parametres_file)

    # Modelling stage: the trainers run in the order in which show_results
    # expects their accuracies.
    hyperparametres(hyperparametres_file)
    trainers = (decision_tree, random_forest, logistic_regression)
    accuracies = [train(hyperparametres_file) for train in trainers]

    # Reporting stage.
    show_results(accuracy_file, hyperparametres_file, *accuracies)

In [38]:
# Entry point of the notebook: runs classify(), which writes the configuration
# files, downloads and merges the weather data, trains the three models and
# reports their accuracies.
classify()



Retrieving weather data for milan


Retrieving data for milan from: 2018-01-01 to: 2018-01-31


  list_month_begin = pd.date_range(self.start_date, self.end_date, freq = 'MS', closed = 'right')
  list_month_end = pd.date_range(self.start_date_datetime, self.end_date_datetime, freq='M', closed='left')


Time elapsed (hh:mm:ss.ms) 0:00:01.292311
Retrieving data for milan from: 2018-02-01 to: 2018-02-28
Time elapsed (hh:mm:ss.ms) 0:00:02.519722
Retrieving data for milan from: 2018-03-01 to: 2018-03-31
Time elapsed (hh:mm:ss.ms) 0:00:03.791889
Retrieving data for milan from: 2018-04-01 to: 2018-04-30
Time elapsed (hh:mm:ss.ms) 0:00:04.968309
Retrieving data for milan from: 2018-05-01 to: 2018-05-31
Time elapsed (hh:mm:ss.ms) 0:00:06.271616
Retrieving data for milan from: 2018-06-01 to: 2018-06-30
Time elapsed (hh:mm:ss.ms) 0:00:07.582178
Retrieving data for milan from: 2018-07-01 to: 2018-07-31
Time elapsed (hh:mm:ss.ms) 0:00:08.807655
Retrieving data for milan from: 2018-08-01 to: 2018-08-31
Time elapsed (hh:mm:ss.ms) 0:00:10.086653
Retrieving data for milan from: 2018-09-01 to: 2018-09-30
Time elapsed (hh:mm:ss.ms) 0:00:11.377842
Retrieving data for milan from: 2018-10-01 to: 2018-10-31
Time elapsed (hh:mm:ss.ms) 0:00:12.664515
Retrieving data for milan from: 2018-11-01 to: 2018-11-30


OSError: Cannot save file into a non-existent directory: '/data'