Author: Mert Arcan


Analysis of the Rain in Australia Dataset

Weather in Australia dataset: https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package

Python version: Python 3.10.4

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import scatter_matrix
# Load the raw weather CSV; the path is relative to the notebook's working directory.
weatherAUS = pd.read_csv("weatherAUS.csv")
# Column labels split by dtype: float64 columns vs. object (string) columns.
# NOTE(review): neither index is referenced again in the visible notebook — confirm before removing.
num_Columns = weatherAUS.select_dtypes(include="float64").columns
cat_Columns = weatherAUS.select_dtypes(include="object").columns



In [None]:
def preProcess(dataFrame):
    """Return a cleaned, fully numeric, NaN-free copy of a weather frame.

    Steps, in order:
      1. Keep only rows whose ``RainTomorrow`` target is "Yes" or "No".
      2. Fill NaNs in ``WindGustDir``, ``WindDir9am``, ``WindDir3pm`` and
         ``RainToday`` with the mode of the respective column.
      3. Label-encode every object-dtype column.
      4. Replace the remaining NaNs with the mean of their column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame containing at least the columns named above. Not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns and the row index of the retained rows.

    Notes
    -----
    NOTE(review): the label encoding and the imputation means are fitted on
    whatever frame is passed in, so separate calls for the train and the test
    split are fitted independently of each other.
    """
    from sklearn.impute import SimpleImputer

    # Explicit copy: the filtered frame is derived from the caller's frame, and
    # assigning columns on it without a copy triggers SettingWithCopyWarning.
    df = dataFrame[dataFrame["RainTomorrow"].isin(["Yes", "No"])].copy()

    # Categorical columns: replace NaNs with the mode of the column.
    for column in ("WindGustDir", "WindDir9am", "WindDir3pm", "RainToday"):
        df[column] = df[column].fillna(df[column].mode()[0])

    # Turn every object-dtype column into integer codes.
    encoder = LabelEncoder()
    object_columns = [name for name in df.columns if df.dtypes[name] == 'O']
    for column in object_columns:
        df[column] = encoder.fit_transform(df[column])

    # Remaining NaNs: replace with the mean of the column.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed = imputer.fit_transform(df)
    return pd.DataFrame(imputed, columns=df.columns, index=df.index)


def removeOutliers(dataFrame,outlier_features):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    Columns are processed in the order given; the quartiles of each column are
    computed on the rows that survived the filters of the previous columns.
    The fences of every column are printed, rounded to two decimals.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to filter. Not modified.
    outlier_features : iterable of str
        Names of the columns to filter on.

    Returns
    -------
    pandas.DataFrame
        The rows whose values lie within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for
        every listed column, with their original index labels.
    """
    df = dataFrame
    for feature in outlier_features:
        q25, q75 = np.quantile(df[feature], [0.25, 0.75])
        iqr = q75 - q25
        lower = q25 - 1.5 * iqr
        upper = q75 + 1.5 * iqr
        # Rounding is for display only; filtering on rounded fences would move
        # the cut-off by up to 0.005.
        print(feature,": Upper: ",round(upper,2)," || Lower: ",round(lower,2))
        # Inclusive bounds: a value on the fence is not an outlier, and a
        # column with IQR == 0 must not wipe out the whole frame.
        df = df[df[feature].between(lower, upper)]
    return df

def minMax_scale(dataFrame):
    """Rescale every column with a freshly fitted ``MinMaxScaler``.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Numeric frame to scale. Not modified.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and the same row index as
        the input, so the result stays aligned with any separately held
        target Series.
    """
    df = dataFrame
    scaled = MinMaxScaler().fit_transform(df)
    # Keep the row labels; without index= the result gets a fresh RangeIndex.
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)


def scale(X_train,X_test):
    """Standardize both splits with a ``StandardScaler`` fitted on X_train only.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(scaled_train, scaled_test)``, each keeping the column labels and
        row index of its input.
    """
    standardizer = StandardScaler().fit(X_train)
    scaled_train, scaled_test = (
        pd.DataFrame(standardizer.transform(part), columns=part.columns, index=part.index)
        for part in (X_train, X_test)
    )
    return scaled_train, scaled_test

def my_normalize(dataFrame):
    """Apply a default ``Normalizer`` to the frame.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Numeric frame to normalize. Not modified.

    Returns
    -------
    pandas.DataFrame
        Normalized values with the same column labels and the same row index
        as the input.
    """
    df = dataFrame
    normalized = Normalizer().fit_transform(df)
    # Keep the row labels; without index= the result gets a fresh RangeIndex.
    return pd.DataFrame(normalized, columns=df.columns, index=df.index)

def checkNaN(dataFrame):
    """Print one line per column: its NaN count, or a note that it has none."""
    for column in dataFrame.columns:
        missing = dataFrame[column].isnull()
        if not missing.values.any():
            print("No NaN Values in ",column)
            continue
        print(column,", count of NaNs:",missing.sum())
            


    


Data Analysis

In [None]:
# Column dtypes and non-null counts of the raw frame.
weatherAUS.info()

In [None]:
# Summary statistics, transposed so that each variable is one row.
weatherAUS.describe().T

In [None]:

# Pairwise scatter plots over the whole raw frame; low alpha keeps dense regions readable.
# NOTE(review): this plots every row of the frame — presumably slow on the full dataset.
scatter_matrix(weatherAUS,alpha = 0.1, figsize=(20,12))


In [None]:
# 3 pm vs. 9 am temperature, to eyeball how strongly the two readings move together.
plt.scatter(weatherAUS["Temp3pm"],weatherAUS["Temp9am"], alpha = 0.1)
plt.title("Temp3pm / Temp9am")
plt.xlabel("Temp3pm")
plt.ylabel("Temp9am")

In [None]:
# 3 pm vs. 9 am pressure, same check as for the temperature pair.
plt.scatter(weatherAUS["Pressure3pm"],weatherAUS["Pressure9am"], alpha = 0.1)
plt.title("Pressure3pm / Pressure9am")
plt.xlabel("Pressure3pm")
plt.ylabel("Pressure9am")



In [None]:
# Daily minimum vs. maximum temperature.
plt.scatter(weatherAUS["MinTemp"],weatherAUS["MaxTemp"], alpha = 0.1)
plt.title("MinTemp / MaxTemp")
plt.xlabel("MinTemp")
plt.ylabel("MaxTemp")



In [None]:
# Derived features: the difference within each of the three plotted pairs.
weatherAUS = weatherAUS.assign(
    Temp_diff=weatherAUS["Temp3pm"] - weatherAUS["Temp9am"],
    Pressure_diff=weatherAUS["Pressure3pm"] - weatherAUS["Pressure9am"],
    MinMaxTemp_diff=weatherAUS["MaxTemp"] - weatherAUS["MinTemp"],
)
weatherAUS.head()


In [None]:
# Per-column NaN report for the raw frame, including the derived difference columns.
checkNaN(weatherAUS)

In [None]:
# Remove the "Date" column. errors="ignore" keeps this cell re-runnable: once
# the column is gone, a plain drop would raise KeyError on the second run.
weatherAUS = weatherAUS.drop(columns="Date", errors="ignore")
weatherAUS.head()

Split Train / Test

In [None]:
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the full frame (features and target together); the fixed seed makes it repeatable.
train_set_Org, test_set_Org = train_test_split(weatherAUS,test_size=0.2, random_state=442)




In [None]:
# Display only: shows the test rows ordered by their original row label (the result is not stored).
test_set_Org.sort_index()

Data PreProcessing

In [None]:
# Clean, encode and impute the training split; train_set_Org itself is kept for comparison.
train_set = preProcess(train_set_Org)

In [None]:
train_set_Org.head().T # Original (raw) training rows, one column per row for readability

In [None]:
train_set.head().T # The same view after preProcess

Outlier Removal

In [None]:
train_set.describe().T
# Columns that may contain outliers, judging by the summary above:
#   Rainfall, Evaporation, WindGustSpeed, WindSpeed9am, WindSpeed3pm

In [None]:
# Columns to screen for outliers, with a box plot of each before filtering.
outliers = ["Rainfall", "Evaporation", "WindSpeed3pm", "WindSpeed9am", "WindGustSpeed"]
sns.set(rc={'figure.figsize': (10, 5)})
train_set[outliers].boxplot()

In [None]:
# Filter the training rows on the listed columns.
# NOTE(review): this rebinds train_set, so re-running the cell filters the already filtered frame again.
train_set = removeOutliers(train_set,outliers)

In [None]:
# The same box plot after filtering, for comparison with the one above.
train_set[outliers].boxplot()

Feature Selection

In [None]:
# Correlation among the six raw temperature/pressure readings and the target.
col=['MinTemp', 'MaxTemp','Pressure9am', 'Pressure3pm','Temp9am', 'Temp3pm','RainTomorrow']
train_set_corr = train_set[col].corr()
# Heatmap of the full matrix, annotated with the coefficients.
sns.heatmap(train_set_corr,annot=True)

In [None]:
# Same correlation view for the three derived difference columns and the target.
col_diff = ["MinMaxTemp_diff", "Pressure_diff", "Temp_diff","RainTomorrow"]
train_set_diff_corr = train_set[col_diff].corr()
sns.heatmap(train_set_diff_corr,annot=True)


In [None]:
# Raw readings that are replaced by the derived difference columns.
dropcol = ['MinTemp', 'MaxTemp','Pressure9am', 'Pressure3pm','Temp9am', 'Temp3pm']
# NOTE(review): the test split is imputed and outlier-filtered using its own
# statistics. Evaluation then runs only on the test rows that survive the
# filter, with encodings and means fitted separately from the training split —
# confirm that this is intended.
test_set = preProcess(test_set_Org)
test_set = removeOutliers(test_set,outliers)
# errors="ignore" keeps the cell re-runnable: train_set is rebound here, so a
# second run would otherwise raise KeyError for the already dropped columns.
train_set = train_set.drop(columns=dropcol, errors="ignore")
test_set = test_set.drop(columns=dropcol, errors="ignore")





In [None]:
# Separate the feature columns from the RainTomorrow target for both splits.
X_train, y_train = train_set.drop(columns="RainTomorrow"), train_set["RainTomorrow"]
X_test, y_test = test_set.drop(columns="RainTomorrow"), test_set["RainTomorrow"]

In [None]:
# Correlation of every remaining column with the target, strongest positive first.
corr = train_set.corr()
corr["RainTomorrow"].sort_values(ascending = False)

Standardization

In [None]:
# Standardize both splits with statistics taken from the training split only.
# NOTE(review): the inputs are overwritten, so re-running the cell scales the already scaled frames.
X_train, X_test = scale(X_train = X_train,X_test= X_test)
# Distribution of every standardized feature.
X_train.hist(bins= 50, figsize=(20,12))

Model Selection

In [None]:
from sklearn.model_selection import cross_val_score
def printScores(scores,name):
    """Print the raw scores of a model plus their mean and std (plain and in percent).

    ``scores`` must provide ``mean()`` and ``std()``; ``name`` is the label
    printed in the first line.
    """
    average = scores.mean()
    spread = scores.std()
    lines = (
        ("Scores for -->", name),
        ("Scores:", scores),
        ("Mean (%):", average*100),
        ("Scores Mean:", average),
        ("Scores std:", spread),
        ("Scores std:(%)", spread*100),
    )
    for label, value in lines:
        print(label, value)


def compareModels(cNames,classifiers,X,Y):
    """Run 10-fold cross-validated accuracy for every classifier and print the scores.

    ``cNames[k]`` is the display name of ``classifiers[k]``; ``X`` and ``Y``
    are the features and target handed to ``cross_val_score``.
    """
    for position, estimator in enumerate(classifiers):
        fold_scores = cross_val_score(estimator, X, Y, scoring="accuracy", cv=10)
        printScores(fold_scores, cNames[position])
        print("\n-------------------------------------------------------\n")
        
    

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Display names and candidate models, paired by position.
names = ["Decision Tree", "Random Forest", "Nearest Neighbors", "Multi-layer Perceptron", "Gaussian Naive Bayes"]
classifiers = [DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier(), MLPClassifier(alpha=1), GaussianNB()]

In [None]:
# Cross-validate every candidate on the training split only; the test split stays untouched here.
compareModels(names,classifiers,X= X_train, Y= y_train)


Fine-Tune model

In [None]:
from sklearn.metrics import accuracy_score
def evaluate(model, test_features, test_labels):
    """Score a fitted classifier on held-out data and print a short report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    test_features : features passed to ``model.predict``
    test_labels : true labels, numeric so that they can be subtracted from
        the predictions

    Returns
    -------
    float
        Accuracy in percent.
    """
    predictions = model.predict(test_features)
    # Mean absolute difference between predicted and true labels.
    errors = abs(predictions - test_labels)
    accuracy = accuracy_score(test_labels, predictions) * 100
    print('Model Performance')
    # No unit: the labels are class codes, so "degrees" was meaningless here.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
# Number of trees in the random forest: 10 evenly spaced values from 60 to 500.
n_estimators = [int(x) for x in np.linspace(start = 60, stop = 500, num = 10)]
# Number of features to consider at every split. Candidates above the number
# of columns in X_train are dropped: they cannot select more features than
# exist, and some scikit-learn versions reject them when fitting.
max_features = [m for m in [2,4,6,8,10,12,14,16,18,20] if m <= X_train.shape[1]]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for the best hyperparameters.
rf = RandomForestClassifier()
# Random search of parameters, using 5-fold cross validation,
# sampling 10 combinations (n_iter = 10), and using all available cores.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_



In [None]:
# Baseline: a random forest with default hyperparameters, scored on the test split.
base_model = RandomForestClassifier()
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

In [None]:
# Best estimator found by the random search, scored on the same test split.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

In [None]:
# Relative change of the random-search model over the baseline, in percent of the baseline accuracy.
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()
# Two sub-grids: 3 x 4 combinations with bootstrap and 2 x 3 without.
param_grid = [
    
    {'bootstrap': [True],'n_estimators': [200,300,400], 'max_features':[8,10,12,14]},
    {'bootstrap': [False] ,'n_estimators': [200,400], 'max_features':[4,8,12]},

]
grid_search = GridSearchCV(clf,param_grid,cv = 5, scoring = "accuracy")
grid_search.fit(X_train, y_train)
# total of (3 x 4 + 3 x 2) * 5 = 90 cross-validation fits

grid_search.best_params_

In [None]:
# Best estimator found by the grid search, scored on the test split.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

In [None]:
# Relative change of the grid-search model over the random-search model. The
# denominator must be the accuracy being compared against (random_accuracy),
# not the baseline accuracy.
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - random_accuracy) / random_accuracy))

Model Construction 

In [None]:

# Final model with hard-coded hyperparameters.
# NOTE(review): presumably copied from an earlier grid_search.best_params_ output — since the
# forests are not seeded, a fresh run may select different values; confirm or use best_params_.
clf = RandomForestClassifier(n_estimators= 300, max_features=8, bootstrap= True)
clf.fit(X_train,y_train)


Prediction Analysis

In [None]:
# Predictions of the final model for the test split.
preds = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
c_matrix =confusion_matrix(y_test,preds)
# The matrix is transposed before plotting, so the predictions run along the y axis
# and the test values along the x axis, matching the axis labels below.
sns.heatmap(c_matrix.T,square= True, annot= True, fmt= 'd', xticklabels = ["No","Yes"],yticklabels=["No","Yes"])
plt.xlabel('Test Values')
plt.ylabel('Predictions')



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Per-class precision / recall / f1 as a dict, turned into a frame with one row per class or average.
report = classification_report(y_test,preds,output_dict=True,target_names= ["No","Yes"])
df = pd.DataFrame(report).transpose()
print(df)


In [None]:
# Actual vs. predicted labels side by side, keyed by the row labels of y_test.
allY = pd.DataFrame({"Actual": y_test, "Predicted": preds}, index=y_test.index)
incorrect = allY[allY["Actual"] != allY["Predicted"]]
incorrect_index = incorrect.index
# incorrect_index holds row *labels* carried over from weatherAUS, so the
# lookup must be label-based (.loc). Positional .iloc returns the right rows
# only while weatherAUS still has its default 0..n-1 index.
incorrect_rows = weatherAUS.loc[incorrect_index]
incorrect_rows.T