In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
sns.set_style('dark')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

<h3>Read Data and discover its info</h3>

In [None]:
# Load the weather records from the project-relative CSV file.
data = pd.read_csv('data/weatherAUS.csv')

# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Check the data dimensions; `shape` is a (rows, columns) tuple.
# We have 145460 records with 22 independent variables and one dependent variable (RainTomorrow).
data.shape

In [None]:
# Count the missing (NA) entries in every column.
data.isna().sum()

In [None]:
# The previous cell shows many NA values, including some in the dependent
# variable itself. Rows without a RainTomorrow label cannot be used for
# supervised training, so they are removed.
# `.copy()` makes the filtered frame an independent object: later cells add,
# drop and overwrite columns on `data`, and doing that on a bare boolean-mask
# slice triggers pandas' SettingWithCopyWarning.
data = data[data['RainTomorrow'].notna()].copy()
data.shape

In [None]:
# Percentage of missing (NA) values per column.
data.isna().sum().div(len(data)).mul(100)

In [None]:
# The previous cell shows that Evaporation, Sunshine, Cloud9am and Cloud3pm
# are missing in almost 50% of the records, which can hurt the model,
# so these columns are removed.
sparse_columns = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
data.drop(columns=sparse_columns, inplace=True)
data.head()

In [None]:
# Numerical attributes (float64 / int64 columns) --> 12 features.
numericCols = data.select_dtypes(include=["float64", "int64"]).columns
# Categorical attributes (object columns) --> 7 features.
categoricalCols = data.select_dtypes(include="object").columns

# There are fewer categorical attributes than numerical ones.
for column_group in (numericCols, categoricalCols):
    print(column_group)

In [None]:
# Print the number of distinct values of every categorical attribute to judge
# whether it can be converted into a numerical one.
# Only Date and Location have many different values (3436 and 49 respectively).
# Location is ignored because it is implicitly described by the other weather
# features and adds no new information; the same reasoning is applied to Date
# (a day-month-year entry).
unique_counts = data[categoricalCols].nunique()
for col, n_unique in unique_counts.items():
    print(f"{col}: {n_unique}")

# Remove the Date and Location columns.
data.drop(columns=['Date', 'Location'], inplace=True)

In [None]:
# Preview the first rows of the current frame.
data.head()

<h3>Visualize Data and Correlations</h3>

In [None]:
# Plot the class counts of RainTomorrow and then of RainToday, one figure each.
# Observations:
#   * RainTomorrow is unbalanced, so resampling may be needed.
#   * RainTomorrow and RainToday have the same histogram shape, so they may
#     affect each other.
for flag_column in ('RainTomorrow', 'RainToday'):
    fig, ax = plt.subplots()
    ax.set_title("Classes Count")
    sns.countplot(x=flag_column, data=data, ax=ax)
    plt.show()


In [None]:
# Convert the RainTomorrow and RainToday attributes from 'No'/'Yes' to 0/1.
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`: that form is chained in-place
# assignment, which newer pandas versions warn about and which leaves `data`
# unchanged when copy-on-write is enabled.
yes_no_codes = {'No': 0, 'Yes': 1}
for flag_column in ('RainTomorrow', 'RainToday'):
    # `pd.to_numeric` guarantees a numeric dtype even when `replace` keeps the
    # column as object; missing values stay NaN.
    data[flag_column] = pd.to_numeric(data[flag_column].replace(yes_no_codes))
data.head()

In [None]:
# Analyse how temperature relates to the probability of rain tomorrow.
# The data holds four temperature readings (MinTemp, MaxTemp, Temp3pm, Temp9am);
# show their correlation with the target.
temperature_view = ['MinTemp', 'MaxTemp', 'Temp3pm', 'Temp9am', 'RainTomorrow']
temperature_corr = data[temperature_view].corr()
sns.heatmap(temperature_corr, annot=True)

In [None]:
# The correlation matrix shows that the raw temperature readings do not affect
# the target much. More useful information can be extracted from the
# differences between them.
data['DiffMinMaxTemp'] = data['MaxTemp'] - data['MinTemp']
data['DiffTemp'] = data['Temp3pm'] - data['Temp9am']

# Compute the correlation matrix once and plot it (the original computed it
# twice and discarded the first result).
temperature_view = ['DiffMinMaxTemp', 'MinTemp', 'MaxTemp', 'DiffTemp',
                    'Temp3pm', 'Temp9am', 'RainTomorrow']
temperature_corr = data[temperature_view].corr()
sns.heatmap(temperature_corr, annot=True)


In [None]:
# The derived difference features are more correlated with the target than the
# raw readings, so the original temperature columns are dropped.
raw_temperature_columns = ['MinTemp', 'MaxTemp', 'Temp3pm', 'Temp9am']
data.drop(columns=raw_temperature_columns, inplace=True)
data.head()

In [None]:
# Analyse how pressure relates to the probability of rain tomorrow.
# The data holds two pressure readings (Pressure3pm, Pressure9am); add their
# difference and show how all three correlate with the target.
data['DiffPressure'] = data['Pressure3pm'] - data['Pressure9am']

pressure_view = ['Pressure3pm', 'Pressure9am', 'DiffPressure', 'RainTomorrow']
pressure_corr = data[pressure_view].corr()
sns.heatmap(pressure_corr, annot=True)




In [None]:
# According to the pressure correlation matrix both readings affect the target
# in almost the same way, and the difference is not a better feature (unlike
# the temperature case), so the derived column is removed again.
data.drop(columns=['DiffPressure'], inplace=True)
data.head()

In [None]:
# Analyse how humidity relates to the probability of rain tomorrow.
# The data holds two humidity readings (Humidity3pm, Humidity9am); add their
# difference and show how all three correlate with the target.
# Observation: Humidity3pm affects the target the most; the other two features
# also affect it, but less.
data['DiffHumidity'] = data['Humidity3pm'] - data['Humidity9am']

humidity_view = ['Humidity3pm', 'Humidity9am', 'DiffHumidity', 'RainTomorrow']
humidity_corr = data[humidity_view].corr()
sns.heatmap(humidity_corr, annot=True)

In [None]:
# Check whether Rainfall affects the target.
# Observation: Rainfall affects the target slightly; for large rainfall values
# the probability of rain tomorrow increases relative to no rain.
rainfall_corr = data[['Rainfall', 'RainTomorrow']].corr()
print(rainfall_corr)

# Scatter the rainfall per record (x = row index), coloured by the target.
rainfall_values = data['Rainfall']
target_values = data['RainTomorrow']
sns.scatterplot(x=data.index, y=rainfall_values, hue=target_values)


In [None]:
# Correlation between today's rain flag and the target.
data.loc[:, ['RainToday', 'RainTomorrow']].corr()


In [None]:
# Analyse how wind relates to the probability of rain tomorrow.
# The data holds three wind-speed readings (WindGustSpeed, WindSpeed9am,
# WindSpeed3pm); show their correlation with the target.
wind_view = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'RainTomorrow']
data[wind_view].corr()


In [None]:
# Only WindGustSpeed affects the target slightly, so the two other wind-speed
# columns are removed.
weak_wind_columns = ['WindSpeed9am', 'WindSpeed3pm']
data.drop(columns=weak_wind_columns, inplace=True)
data.head()

In [None]:
# Counts of each WindDir9am category in the dataset.
fig, ax = plt.subplots()
ax.set_title("WindDir9am Counts")
sns.countplot(x='WindDir9am', data=data, ax=ax)
plt.show()

# Author's observation: WindDir9am has many outliers, but it can also
# differentiate between classes.
# The title names the plotted target (RainTomorrow); the previous
# "Effect on Price" text did not match the y-axis.
fig, ax = plt.subplots()
ax.set_title("WindDir9am Effect on RainTomorrow")
sns.boxplot(x=data['WindDir9am'], y=data['RainTomorrow'], ax=ax)
plt.show()

In [None]:
# Counts of each WindDir3pm category in the dataset.
fig, ax = plt.subplots()
ax.set_title("WindDir3pm Counts")
sns.countplot(x='WindDir3pm', data=data, ax=ax)
plt.show()

# Author's observation: WindDir3pm has many outliers, but it can also
# differentiate between classes.
# The title names the plotted target (RainTomorrow); the previous
# "Effect on Price" text did not match the y-axis.
fig, ax = plt.subplots()
ax.set_title("WindDir3pm Effect on RainTomorrow")
sns.boxplot(x=data['WindDir3pm'], y=data['RainTomorrow'], ax=ax)
plt.show()

In [None]:

# Counts of each WindGustDir category in the dataset.
fig, ax = plt.subplots()
ax.set_title("WindGustDir Counts")
sns.countplot(x='WindGustDir', data=data, ax=ax)
plt.show()

# Author's observation: WindGustDir has many outliers, but it can also
# differentiate between classes.
# The title names the plotted target (RainTomorrow); the previous
# "Effect on Price" text did not match the y-axis.
fig, ax = plt.subplots()
ax.set_title("WindGustDir Effect on RainTomorrow")
sns.boxplot(x=data['WindGustDir'], y=data['RainTomorrow'], ax=ax)
plt.show()

In [None]:
# Convert the three wind-direction attributes into numerical ones by replacing
# each value with the integer code of its pandas category.
# NOTE(review): pandas assigns code -1 to missing values, so NA directions
# become -1 here rather than staying NA.
for direction_column in ("WindDir9am", "WindDir3pm", "WindGustDir"):
    as_category = data[direction_column].astype('category')
    data[direction_column] = as_category.cat.codes

data.head()


In [None]:
# Show how each encoded wind-direction attribute correlates with the target.
# Observation: the wind-direction information is not very important; it does
# not affect the target significantly.
direction_view = ['WindDir9am', 'WindDir3pm', 'WindGustDir', 'RainTomorrow']
direction_corr = data[direction_view].corr()
sns.heatmap(direction_corr, annot=True)

In [None]:
# The wind-direction attributes have many outliers that affect the model
# negatively, so all three are removed.
direction_columns = ['WindDir9am', 'WindDir3pm', 'WindGustDir']
data.drop(columns=direction_columns, inplace=True)
data.head()

In [None]:
# Fill the remaining NA values in the dataset with the sentinel value -100.
# NOTE(review): a sentinel far outside the feature ranges may distort
# distance- and gradient-based models; confirm this is intended.
missing_sentinel = -100
data.fillna(value=missing_sentinel, inplace=True)
data.head()

<h3>Training and Model building</h3>

In [None]:
# Separate the dependent variable (y) from the independent variables (x).
y = data['RainTomorrow']
x = data.drop(columns='RainTomorrow')

# Hold out 20% of the records for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=8, test_size=0.2
)


In [None]:
# Fit a univariate feature selector that keeps the 3 best-scoring features on
# the training split, then list the names of the selected columns.
selector = SelectKBest(k=3)
selector.fit(X_train, y_train)
selected_positions = selector.get_support(indices=True)
X_train.columns[selected_positions].tolist()

In [None]:
# Hyper-parameter grid search for logistic regression.
LM = LogisticRegression()

# Regularisation strengths tried for every penalised combination.
C_values = np.arange(1, 5, 0.5)

# Each solver supports only some penalties, so the grid is a list of valid
# (solver, penalty) sub-grids. The previous full cross product contained many
# unsupported pairs (e.g. 'l1' with 'lbfgs', 'none' with 'liblinear') whose
# fits simply failed. 'elasticnet' is left out because it also requires
# `l1_ratio`, which the grid never supplied.
params_LM = [
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'], 'C': C_values},
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': C_values},
    # Unpenalised fits ignore C, so C is not varied in this sub-grid.
    # NOTE(review): newer scikit-learn releases spell this option `None`
    # instead of the string 'none' - adjust to the installed version.
    {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'penalty': ['none']},
]
clf_LM = GridSearchCV(LM, params_LM)

clf_LM.fit(X_train, y_train)
print(clf_LM.best_params_)


In [None]:
# Train logistic regression with fixed hyper-parameters and report its
# accuracy on the held-out test split.
logistic_settings = {'penalty': 'l2', 'C': 1.0, 'solver': 'newton-cg'}
LM = LogisticRegression(**logistic_settings)
LM.fit(X_train, y_train)
accuracy = LM.score(X_test, y_test)
accuracy

In [None]:
# Hyper-parameter grid search for the random forest classifier.
RF = RandomForestClassifier()

params_RF = {
    'n_estimators': np.arange(100, 1400, 200),
    'max_depth': np.arange(100, 300, 100),
    # 'auto' was dropped from this list: for classifiers it is the same as
    # 'sqrt' (so it only repeated fits) and newer scikit-learn rejects it.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': np.arange(1, 4, 1),
}
clf_RF = GridSearchCV(RF, params_RF)

clf_RF.fit(X_train, y_train)
print(clf_RF.best_params_)

In [None]:
# Train a random forest with fixed hyper-parameters and report its accuracy on
# the held-out test split. (The fitted model is stored under the shared name
# `LM`, as in the other model cells.)
forest_settings = {
    'n_estimators': 1300,
    'max_depth': 100,
    'max_features': 'log2',
    'min_samples_leaf': 3,
}
LM = RandomForestClassifier(**forest_settings)
LM.fit(X_train, y_train)
accuracy = LM.score(X_test, y_test)
accuracy

In [None]:
# Hyper-parameter grid search for the decision tree classifier.
DT = DecisionTreeClassifier()

# Candidate depths 100, 200, ..., 1700 combined with both split criteria.
params_DT = dict(
    max_depth=np.arange(100, 1800, 100),
    criterion=['gini', 'entropy'],
)
clf_DT = GridSearchCV(estimator=DT, param_grid=params_DT)
clf_DT.fit(X_train, y_train)

sorted(clf_DT.cv_results_)
print(clf_DT.best_params_)

In [None]:
# Train a decision tree with fixed hyper-parameters and report its accuracy on
# the held-out test split. (The fitted model is stored under the shared name
# `LM`, as in the other model cells.)
tree_settings = {'criterion': 'entropy', 'max_depth': 1000}
LM = DecisionTreeClassifier(**tree_settings)
LM.fit(X_train, y_train)
accuracy = LM.score(X_test, y_test)
accuracy

In [None]:
# Hyper-parameter grid search for the multi-layer perceptron classifier.
MLP = MLPClassifier()

params_MLP = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    learning_rate=['constant', 'adaptive'],
    alpha=[0.0001, 0.05],
)
clf_MLP = GridSearchCV(estimator=MLP, param_grid=params_MLP)
clf_MLP.fit(X_train, y_train)

sorted(clf_MLP.cv_results_)
print(clf_MLP.best_params_)

In [None]:
# Train a multi-layer perceptron with fixed hyper-parameters and report its
# accuracy on the held-out test split. (The fitted model is stored under the
# shared name `LM`, as in the other model cells.)
mlp_settings = {
    'hidden_layer_sizes': (50, 100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'learning_rate': 'adaptive',
    'alpha': 0.0001,
}
LM = MLPClassifier(**mlp_settings)
LM.fit(X_train, y_train)
accuracy = LM.score(X_test, y_test)
accuracy

In [None]:
# Hyper-parameter grid search for the support vector classifier.
SVM = SVC()

params_SVM = {
    'C': np.arange(1, 3, 1),
    # 'precomputed' was removed from this list: it expects a square kernel
    # matrix as input, so every fit on the raw feature matrix failed.
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}
clf_SVM = GridSearchCV(SVM, params_SVM)

clf_SVM.fit(X_train, y_train)
print(clf_SVM.best_params_)

In [None]:
# Train a linear-kernel SVM (solver capped at 1600 iterations) and report its
# accuracy on the held-out test split. (The fitted model is stored under the
# shared name `LM`, as in the other model cells.)
svm_settings = {'C': 1, 'kernel': 'linear', 'gamma': 'scale', 'max_iter': 1600}
LM = SVC(**svm_settings)
LM.fit(X_train, y_train)
accuracy = LM.score(X_test, y_test)
accuracy

In [None]:
# Hyper-parameter grid search for the k-nearest-neighbours classifier.
KNN = KNeighborsClassifier()

params_KNN = {
    'leaf_size': np.arange(30, 100, 10),
    # 'precomputed' was removed from this list: it expects a square distance
    # matrix as input, which the raw feature matrix is not.
    'metric': ['minkowski'],
    # The key must be spelt `n_neighbors` (the estimator's parameter name);
    # the previous 'n_neighbours' made the search fail with an
    # invalid-parameter error.
    'n_neighbors': np.arange(5, 15, 1),
    'p': np.arange(1, 5, 1),
    'weights': ['uniform', 'distance'],
}
clf_KNN = GridSearchCV(KNN, params_KNN)

clf_KNN.fit(X_train, y_train)
print(clf_KNN.best_params_)

In [None]:
# Train k-nearest-neighbours with fixed hyper-parameters and report its
# accuracy on the held-out test split. (The fitted model is stored under the
# shared name `LM`, as in the other model cells.)
knn_settings = {
    'n_neighbors': 10,
    'weights': 'uniform',
    'metric': 'minkowski',
    'p': 2,
    'leaf_size': 40,
}
LM = KNeighborsClassifier(**knn_settings)
LM.fit(X_train, y_train)
accuracy = LM.score(X_test, y_test)
accuracy