In [318]:
# Suppress warnings: swap warnings.warn for a no-op so that calls routed
# through warnings.warn print nothing in the cells below.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stand-in for warnings.warn."""
    return None


warnings.warn = warn

In [319]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

In [320]:
# Load the raw weather observations; the path is relative to the notebook's working directory.
WEATHER_CSV = "Weather_Data.csv"
df = pd.read_csv(WEATHER_CSV)

In [321]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [322]:
# Convert the categorical variables to numeric indicator columns with one-hot encoding.
# One-hot encoding creates one column per distinct category value found in a column.
# RainToday only takes Yes/No, so it becomes RainToday_No and RainToday_Yes.
# The three wind-direction columns get one column per compass point present in the data
# (WindGustDir_E, WindGustDir_ENE, WindGustDir_ESE, ...), not just N/S/E/W.
# For decision trees and other tree-based models, label encoding is a common alternative,
# because the splits do not depend on the order of the codes assigned to the categories.
categorical_columns = ["RainToday", "WindGustDir", "WindDir9am", "WindDir3pm"]
df_processed = pd.get_dummies(df, columns=categorical_columns)

In [323]:
# RainTomorrow is the target and still holds "Yes"/"No" strings, so map them to 1/0.
# With only two categories a direct replacement is simpler than fitting a LabelEncoder.
# Note that replace() is applied to the whole frame, not only to the RainTomorrow column.
df_processed = df_processed.replace({"No": 0, "Yes": 1})

In [324]:
# Date holds date strings that cannot be cast to float and is not used as a feature, so drop it.
# errors="ignore" keeps this cell safe to re-run after Date has already been removed
# (the previous in-place drop raised KeyError on a second run).
# drop() and astype() each return a new frame; the result is bound back to df_processed
# rather than mutating the existing object with inplace=True.
# Casting everything to float gives the models a single numeric dtype to work with.
df_processed = (
    df_processed
    .drop(columns="Date", errors="ignore")
    .astype(float)
)

In [325]:
# Separate the target (Y) from the predictors (features).
# drop() is not in-place here: it returns a new DataFrame without RainTomorrow,
# while df_processed itself keeps the column.
Y = df_processed["RainTomorrow"]
features = df_processed.drop(columns=["RainTomorrow"])

In [373]:
#Q1)  Use the train_test_split function to split the features and Y dataframes with a test_size of 0.2 and the random_state set to 10.

# Hold out 20% of the rows for testing; random_state=10 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    Y,
    test_size=0.2,
    random_state=10,
)

In [374]:
#Q2) Create and train a Linear Regression model called LinearReg using the training data (x_train, y_train)
# fit() returns the fitted estimator, so the chained call binds the trained model;
# the bare name on the last line keeps the estimator as the cell's displayed value.
LinearReg = LinearRegression().fit(x_train, y_train)
LinearReg

In [375]:
#Q3) Now use the predict method on the testing data (x_test) and save it to the array predictions.
# predict() already returns a NumPy array (per the scikit-learn API docs), so no extra conversion is applied.
predictions = LinearReg.predict(x_test)

# Sanity output: frame shape, column names, and the fitted coefficients, one per line.
print(df_processed.shape, df_processed.columns, LinearReg.coef_, sep="\n")

(3271, 67)
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainTomorrow', 'RainToday_No', 'RainToday_Yes',
       'WindGustDir_E', 'WindGustDir_ENE', 'WindGustDir_ESE', 'WindGustDir_N',
       'WindGustDir_NE', 'WindGustDir_NNE', 'WindGustDir_NNW',
       'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE', 'WindGustDir_SSE',
       'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W', 'WindGustDir_WNW',
       'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE', 'WindDir9am_ESE',
       'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE', 'WindDir9am_NNW',
       'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE', 'WindDir9am_SSE',
       'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W', 'WindDir9am_WNW',
       'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE', 'WindDir3pm_ESE',
       'WindD

In [376]:
#Q4) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.
# Prediction error per test row, reused by both MAE and MSE.
residuals = predictions - y_test

LinearRegression_MAE = np.mean(np.abs(residuals))  # mean absolute error
LinearRegression_MSE = np.mean(residuals ** 2)  # mean squared error
LinearRegression_R2 = metrics.r2_score(y_test, predictions)  # coefficient of determination

print(LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2, sep="\n")

0.2563175026697057
0.11572058021725573
0.4271321202839915


In [377]:
#Q5) Show the MAE, MSE, and R2 in a tabular format using data frame for the linear model.
# Single-column table; rows are MAE, MSE and R2, in that order.
Report = pd.DataFrame(
    {"LinearRegEval": [LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2]}
)
Report.head()

Unnamed: 0,LinearRegEval
0,0.256318
1,0.115721
2,0.427132


In [378]:
#KNN Model

#Q6) Create and train a KNN model called KNN using the training data (x_train, y_train) with the n_neighbors parameter set to 4
# Build the 4-nearest-neighbours classifier, then train it in place.
# The trailing semicolon keeps the cell from echoing the estimator, as before.
KNN = KNeighborsClassifier(n_neighbors=4)
KNN.fit(x_train, y_train);


In [379]:
#Q7) Now use the predict method on the testing data (x_test) and save it to the array predictions.

# Class labels predicted by the KNN model for the held-out rows.
# predict() already returns a NumPy array (per the scikit-learn API docs), so no extra conversion is applied.
predictions = KNN.predict(x_test)

In [380]:
#Q8) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function
# Score the KNN predictions against the true labels, then print one metric per line.
KNN_Accuracy_Score = accuracy_score(y_test, predictions)
KNN_JaccardIndex = jaccard_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions)

print(KNN_Accuracy_Score, KNN_JaccardIndex, KNN_F1_Score, sep="\n")

0.8183206106870229
0.4251207729468599
0.5966101694915255


In [381]:
#Decision Tree
#Q9) Create and train a Decision Tree model called Tree using the training data (x_train, y_train).
# Train a decision tree on the training split.
# A fixed random_state makes the fitted tree, and therefore its scores, reproducible
# across kernel restarts; without it scikit-learn's random tie-breaking between equally
# good splits can change the tree from run to run. The value 10 mirrors the seed used
# for the train/test split above; any fixed integer would do.
# NOTE: scores recorded from an earlier unseeded run may differ from a seeded run.
Tree = DecisionTreeClassifier(random_state=10).fit(x_train, y_train)


In [382]:
#Q10) Now use the predict method on the testing data (x_test) and save it to the array predictions.
# Class labels predicted by the decision tree for the held-out rows.
# predict() already returns a NumPy array (per the scikit-learn API docs), so no extra conversion is applied.
predictions = Tree.predict(x_test)

In [383]:
#Q11) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.
# Score the decision-tree predictions against the true labels, then print one metric per line.
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_Jaccard_Index = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)

print(Tree_Accuracy_Score, Tree_Jaccard_Index, Tree_F1_Score, sep="\n")

0.7572519083969466
0.40671641791044777
0.5782493368700266


In [384]:
#Q12) Use the train_test_split function to split the features and Y dataframes with a test_size of 0.2 and the random_state set to 1.
# Re-split with random_state=1, overwriting the earlier train/test variables.
# NOTE: the models fitted from here on are therefore evaluated on a different
# held-out set than the models fitted before this cell (which used random_state=10).
x_train, x_test, y_train, y_test = train_test_split(
    features,
    Y,
    test_size=0.2,
    random_state=1,
)

In [385]:
#Q13) Create and train a LogisticRegression model called LR using the training data (x_train, y_train) with the solver parameter set to liblinear.
# Build the logistic-regression classifier with the liblinear solver, then train it in place.
# The trailing semicolon keeps the cell from echoing the estimator, as before.
LR = LogisticRegression(solver="liblinear")
LR.fit(x_train, y_train);

In [386]:
#Q14) Now, use the predict and predict_proba methods on the testing data (x_test) and save it as 2 arrays predictions and predict_proba.

# Per-class probabilities (needed for log loss) and hard class labels for the held-out rows.
# Both methods already return NumPy arrays (per the scikit-learn API docs), so no extra conversion is applied.
predict_proba = LR.predict_proba(x_test)
predictions = LR.predict(x_test)

In [387]:
#Q15) Using the predictions, predict_proba and the y_test dataframe calculate the value for each metric using the appropriate function
# Label-based metrics use the hard predictions; log loss uses the predicted probabilities.
LR_Accuracy_Score = accuracy_score(y_test, predictions)
LR_Jaccard_Index = jaccard_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions)
LR_Log_Loss = log_loss(y_test, predict_proba)

print(
    LR_Accuracy_Score,
    LR_Jaccard_Index,
    LR_F1_Score,
    LR_Log_Loss,
)

0.8366412213740458 0.5091743119266054 0.6747720364741642 0.38045106723472155


In [388]:
#SVM
#Q16) Create and train a SVM model called SVM using the training data (x_train, y_train).
# Build a support-vector classifier with a linear kernel, then train it in place.
# The trailing semicolon keeps the cell from echoing the estimator, as before.
SVM = svm.SVC(kernel="linear")
SVM.fit(x_train, y_train);

In [389]:
#Q17) Now use the predict method on the testing data (x_test) and save it to the array predictions.
# Class labels predicted by the SVM for the held-out rows.
# predict() already returns a NumPy array (per the scikit-learn API docs), so no extra conversion is applied.
predictions = SVM.predict(x_test)

In [390]:
#Q18) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.
# Score the SVM predictions against the true labels and print the three metrics on one line.
SVM_Accuracy_Score = accuracy_score(y_test, predictions)
SVM_JaccardIndex = jaccard_score(y_test, predictions)
SVM_F1_Score = f1_score(y_test, predictions)

print(
    SVM_Accuracy_Score,
    SVM_JaccardIndex,
    SVM_F1_Score,
)

0.8458015267175573 0.5345622119815668 0.6966966966966966


In [391]:
#Q19) Show the Accuracy,Jaccard Index,F1-Score and LogLoss in a tabular format using data frame for all of the above models.
# `from numpy import NaN` fails with ImportError on NumPy 2.x, where that alias was removed.
# np.nan works on every NumPy version; the NaN name is kept so any later use still resolves.
NaN = np.nan

# One row per classifier. Log loss is only computed for logistic regression in this
# notebook, so the other models get NaN in that column.
# NOTE: the name `eval` shadows the Python builtin of the same name; it is kept unchanged
# so any existing reference to this table keeps working.
eval = pd.DataFrame(
    {
        "Model_Name": ["KNN", "Decision_Tree", "LogisticRegression", "SVM"],
        "Accuracy": [KNN_Accuracy_Score, Tree_Accuracy_Score, LR_Accuracy_Score, SVM_Accuracy_Score],
        "Jaccard Index": [KNN_JaccardIndex, Tree_Jaccard_Index, LR_Jaccard_Index, SVM_JaccardIndex],
        "F1-Score": [KNN_F1_Score, Tree_F1_Score, LR_F1_Score, SVM_F1_Score],
        "Log Loss": [np.nan, np.nan, LR_Log_Loss, np.nan],
    }
)

print(eval.head())

           Model_Name  Accuracy  Jaccard Index  F1-Score  Log Loss
0                 KNN  0.818321       0.425121  0.596610       NaN
1       Decision_Tree  0.757252       0.406716  0.578249       NaN
2  LogisticRegression  0.836641       0.509174  0.674772  0.380451
3                 SVM  0.845802       0.534562  0.696697       NaN
