In [1]:
import math, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
#metodi e classi che utilizzerò in seguito 

class Judge: #definisco la classe che mi permette di definire quale algoritmo performi meglio sul dataset
    
    def __init__(self):
        self.params = {}
        return None
    
    def set_algorithms(self, algorithms):
        self.algorithms = algorithms
        return None
    
    def set_data(self, X, y):
        self.X = X
        self.y = y
        return None
    
    def set_metrics(self, metrics):
        self.metrics = metrics
        return None
    
    def set_params(self, params):
        self.params = params
        return None
    
    def get_performance(self, metric, algorithm, grid):
        best_algorithm = algorithm
        if bool(grid):
            best_algorithm = GridSearchCV(estimator = algorithm, param_grid = grid)  # inner loop
        scores = cross_validate(best_algorithm, X = self.X, y = self.y, scoring = metric, cv = 10)  # outer_loop
        score = np.mean(scores['test_score'])
        score = round(score, 2)
        
        return score
        
    def get_table(self):
        metrics_results = {}
        for metric in self.metrics:
            algorithms_results = {}
            for label, algorithm in self.algorithms.items():
                grid = {}
                if label in self.params.keys():
                    grid = self.params[label]
                algorithms_results[label] = self.get_performance(metric, algorithm, grid)
            metrics_results[metric] = algorithms_results
            
        df = pd.DataFrame.from_dict(metrics_results)
                
        return df
    

# One-hot encoding - definisco una funzione che vada a splittare i campi categorici per permettere all'algoritmo di interpretarli come numeri
def ohe(data, column):
    d = pd.get_dummies(data[column], prefix = column)
    data_d = pd.concat([data, d], axis = 1)
    data_d = data_d.drop([column], axis = 1)
    return data_d

def min_max_scaler_single_column(column):
    col_min = np.min(column)
    col_max = np.max(column)
    column = (column - col_min) / (col_max - col_min)
    
    return column

#per gestire categorie creando nuova colonna binaria e mettendo 0 dove c'è NaN e 1 altrimenti 

def handle_missing_value_by_replace_and_add(data, column, nan_value):
    new_binary_column_name = column + '_null'
    new_binary_column = data[column]
    new_binary_column = np.where(new_binary_column.isnull(), 1, 0)  # np.nan è diverso da pd.NaN
    data[new_binary_column_name] = new_binary_column
    data[column] = data[column].fillna(nan_value)
    return data

#conto i null nel df
def null_count(data):
    return data.isnull().sum()

In [21]:
df = pd.read_csv('weatherAUS.csv')
#df.shape
#df.describe
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [22]:
#Verifico la quantità di null per ciascuna colonna
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [23]:
#gestione dei null sulla colonna booleana RaiTODAY e sul target RaiTomorrow
df.dropna(subset = ['RainTomorrow'], inplace=True) #quest'azione va a togliere solo le righe per le quali nella colonna RainTomorrow sono presenti dei null poichè non possono essere gestiti essendo un campo booleano
df.dropna(subset = ['RainToday'], inplace=True) #tolgo anche qui le righe per le quali RainToday è NaN poichè essendo un campo booleano non posso gestirlo mettendoci la media di un campo o un valore costante poichè non avrebbe senso
df.isnull().sum()

Date                 0
Location             0
MinTemp            468
MaxTemp            307
Rainfall             0
Evaporation      59694
Sunshine         66805
WindGustDir       9163
WindGustSpeed     9105
WindDir9am        9660
WindDir3pm        3670
WindSpeed9am      1055
WindSpeed3pm      2531
Humidity9am       1517
Humidity3pm       3501
Pressure9am      13743
Pressure3pm      13769
Cloud9am         52625
Cloud3pm         56094
Temp9am            656
Temp3pm           2624
RainToday            0
RainTomorrow         0
dtype: int64

In [24]:
#droppo le colonne che hanno quasi la metà di valori null poichè ininfluenti e non saprei come gestire i valori null, è corretto?

df = df.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis = 1)


In [25]:
#Trasformo i due campi booleani in numeri 0/1
df['RainToday'] = df['RainToday'].replace('No', 0)
df['RainToday'] = df['RainToday'].replace('Yes', 1)
df['RainTomorrow'] = df['RainTomorrow'].replace('No', 0)
df['RainTomorrow'] = df['RainTomorrow'].replace('Yes', 1)

In [27]:
#per mantenere la data, in modo tale che risulti come numero coerente, rimuovo il trattino, in modo tale da continuare a mantenere tutta l'info sulla data
#df['Date']= df['Date'].replace(to_replace=r'-', value='', regex=True,inplace=True)

df['Date'].replace(to_replace=r'-', value='', regex=True,inplace=True)

In [28]:
#trasformo location, che è categorico, in valori tramite OHE poichè non voglio attribuirgli dei numeri, in quanto creerei un campo ordinato di valori

#location = df['Location'].astype(str).str[0]
#location_df = pd.DataFrame({'Location': location})
#location_ohe = ohe(location_df, 'Location')
#Sistemo il mio dataframe droppando la colonna state e concatenando l'ohe appena computato
#df = pd.concat([df, location_ohe], axis = 1)
#df = df.drop(['Location'], axis = 1)

#Faccio l'ohe anche per i campi 'WindGustDir', 'WindDir9am', 'WindDir3pm'

#WindGustDir = df['WindGustDir'].astype(str).str[0]
#WindGustDir_df = pd.DataFrame({'WindGustDir': WindGustDir})
#WindGustDir_ohe = ohe(WindGustDir_df, 'WindGustDir')

#WindDir9am = df['WindDir9am'].astype(str).str[0]
#WindDir9am_df = pd.DataFrame({'WindDir9am': WindDir9am})
#WindDir9am_ohe = ohe(WindDir9am_df, 'WindDir9am')


#WindDir3pm = df['WindDir3pm'].astype(str).str[0]
#WindDir3pm_df = pd.DataFrame({'WindDir3pm': WindDir3pm})
#WindDir3pm_ohe = ohe(WindDir3pm_df, 'WindDir3pm')


#df = pd.concat([df, WindGustDir_ohe,WindDir9am_ohe,WindDir3pm_ohe], axis = 1)
#df = df.drop(['WindGustDir'], axis = 1)
#df = df.drop(['WindDir3pm'], axis = 1)
#df = df.drop(['WindDir9am'], axis = 1)


In [29]:
#EDIT degli OHE precedenti, generano troppe colonne e non so come gestirli tramite OHE, dunque provo a trasformare le variabili categoriche
#in numeriche assegnando a parità di variabile categorica un numero :


gestire = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
for column in gestire:
    campi_univoci = df[column].value_counts().index.tolist() 
    #conteggia le righe di tutti i campi univoci e i nomi fungono da indici che butto in una lista per poterli gestire e mapparli con dei numeri
    associazione = {x:y for x,y in zip(campi_univoci, range(len(campi_univoci)))}
    df[column] = df[column].map(associazione) 
    


In [30]:
#verifico dove ci siano ancora null e se questi campi sono numerici. Se tutti i campi che hanno NaN in alcune righe sono numerici posso fillare con la media dei valori presenti nelle colonne
#Quali colonne da gestire?
colonne_da_gestire = df.columns[df.isnull().any()]
colonne_da_gestire
#le gestisco con la media dei valori del campo 

for column in colonne_da_gestire:
    df[column].fillna(df[column].mean(), inplace=True)
df.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [31]:
df.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [32]:
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,20081201,20,13.4,22.9,0.6,0.0,44.0,6.0,7.0,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0
1,20081202,20,7.4,25.1,0.0,9.0,44.0,9.0,3.0,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0
2,20081203,20,12.9,25.7,0.0,6.0,46.0,6.0,3.0,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0
3,20081204,20,9.2,28.0,0.0,13.0,24.0,1.0,10.0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,0
4,20081205,20,17.5,32.3,1.0,0.0,41.0,10.0,8.0,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,20170620,48,3.5,21.8,0.0,2.0,31.0,11.0,10.0,15.0,13.0,59.0,27.0,1024.7,1021.2,9.4,20.9,0,0
145455,20170621,48,2.8,23.4,0.0,2.0,31.0,1.0,14.0,13.0,11.0,51.0,24.0,1024.6,1020.3,10.1,22.4,0,0
145456,20170622,48,3.6,25.3,0.0,14.0,22.0,1.0,6.0,13.0,9.0,56.0,21.0,1023.5,1019.1,10.9,24.5,0,0
145457,20170623,48,5.4,26.9,0.0,3.0,37.0,1.0,7.0,9.0,9.0,53.0,24.0,1021.0,1016.8,12.5,26.1,0,0


In [33]:
#ora che i dati dovrebbero essere sistemati procedo con la rimozione della colonna label e assegnazione X e Y per la predizione di "RainTomorrow"

#test = df.sample( n= 50000) #riduco dataset per tempi di performance lunghi
test = df.sample( n= 35000) #riduco dataset per tempi di performance lunghi
X = test.drop(['RainTomorrow'], axis=1) 
y = test['RainTomorrow'] 

print(X.shape,y.shape)
X

(35000, 18) (35000,)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
57283,20150218,12,13.3,22.3,0.0,4.0,48.0,1.000000,0.0,26.0,26.0,84.0,57.0,1019.9,1019.3,15.4,20.7,0
41449,20140624,44,9.4,16.1,0.0,9.0,94.0,14.000000,7.0,48.0,50.0,44.0,31.0,1009.5,1007.6,13.1,15.8,0
131254,20120616,4,5.4,12.1,0.6,14.0,30.0,9.000000,13.0,19.0,11.0,86.0,68.0,1008.6,1005.6,6.0,11.7,0
73223,20161229,15,24.8,34.1,0.0,0.0,35.0,9.000000,3.0,6.0,19.0,67.0,42.0,1003.4,1001.7,28.6,32.5,0
81122,20090505,32,3.3,16.7,1.0,5.0,20.0,6.982894,2.0,0.0,11.0,99.0,68.0,1033.0,1029.8,7.0,16.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89241,20141001,25,18.6,29.2,0.0,12.0,41.0,3.000000,9.0,17.0,28.0,52.0,48.0,1018.5,1014.1,24.4,27.5,0
87504,20091001,25,19.4,30.0,0.0,2.0,39.0,3.000000,10.0,19.0,28.0,54.0,46.0,1017.4,1013.6,26.2,28.1,0
63740,20160401,19,7.9,25.5,0.0,10.0,43.0,4.000000,14.0,7.0,11.0,98.0,46.0,1016.2,1011.5,12.9,24.1,0
140495,20120515,3,21.6,31.7,0.0,12.0,46.0,1.000000,9.0,26.0,22.0,55.0,33.0,1013.4,1010.0,24.2,31.2,0


In [34]:
#guardo alle predizioni per vedere quanto il valore y sia bilanciato o sbilanciato

y.value_counts()

0    27344
1     7656
Name: RainTomorrow, dtype: int64

In [35]:
#Il risultato è che i due valori 0 e 1 sono molto sbilanciati tra di loro dunque utilizzo un metodo per riequilibrare i valori
# SMOTE
#rus = SMOTE(random_state=42)
#X_smote, y_smote = rus.fit_resample(X, y)
##print(X.shape)
#print(y.shape)
#print(X_smote.shape)
#print(y_smote.shape)

#Provo con RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)
print(X.shape)
print(y.shape)
print(X_rus.shape)
print(y_rus.shape)

(35000, 18)
(35000,)
(15312, 18)
(15312,)


In [38]:
#Qual è il migliore algoritmo?
algorithms = {
    'knn': KNeighborsClassifier(),
    'lr': LogisticRegression(solver = 'liblinear'),
    'dt': DecisionTreeClassifier(),
    'svc': LinearSVC()
}

# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
metrics = [
    'accuracy',
    'roc_auc',
    'precision'
]

params = {
    'knn': {
        'n_neighbors': [3, 5, 10],
        'weights': ['uniform', 'distance']
    },
    'dt': {
        'max_depth': [None, 5, 10, 20]
    }
}
    
j = Judge()
j.set_algorithms(algorithms)
j.set_data(X_rus, y_rus)
j.set_metrics(metrics)
j.set_params(params)
j.get_table()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




Unnamed: 0,accuracy,roc_auc,precision
knn,0.75,0.82,0.77
lr,0.77,0.85,0.78
dt,0.76,0.83,0.79
svc,0.5,0.53,0.15


In [39]:
algorithm =  LogisticRegression(solver = 'liblinear')
clf = GridSearchCV(algorithm, param_grid = {}) 
clf.fit(X, y)
clf.best_estimator_

LogisticRegression(solver='liblinear')