### Importing the required Libraries


In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

### Importing the Dataset


In [2]:
import requests
# Step 1: Define the URL of the CSV file
url = 'https://shorturl.at/gZhaU'

# Step 2: Download the CSV file
response = requests.get(url)



In [3]:
# Step 3: Save the CSV file locally
csv_file_path = './Weather_Data (1).csv'
with open(csv_file_path, 'wb') as file:
    file.write(response.content)

# Step 4: Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

In [5]:
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


### Data Preprocessing


One hot encoding to convert categorical variables to binary variables.


In [6]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

replace the values of the 'RainTomorrow' column changing them from a categorical column to a binary column. We do not use the `get_dummies` method because we would end up with two columns for 'RainTomorrow' and we do not want, since 'RainTomorrow' is our target.

In [7]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

  df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)



# Training Data and Test Data


In [8]:
df_sydney_processed.drop('Date',axis=1,inplace=True)
df_sydney_processed = df_sydney_processed.astype(float)
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

### Linear Regression


#### `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `10`.


In [9]:
x_train, x_test, y_train, y_test = train_test_split( features, Y, test_size=0.2, random_state=10)
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)


Train set: (2616, 66) (2616,)
Test set: (655, 66) (655,)


#### Training a Linear Regression model called LinearReg using the training data (`x_train`, `y_train`).


In [10]:
LinearReg = LinearRegression()
LinearReg.fit(x_train,y_train)

### Prediction


In [11]:
predictions = LinearReg.predict(x_test)

#### Calculating the value for each metric using the appropriate function.


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
LinearRegression_MAE = mean_absolute_error(y_test, predictions)
LinearRegression_MSE = mean_squared_error(y_test, predictions)
LinearRegression_R2 = r2_score(y_test, predictions)

#### MAE, MSE, and R2 in a tabular format using data frame for the linear model.


In [13]:
Report = pd.DataFrame({'ERRORS':['MAE','MSE','R2'],'VALUES':[LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2]})
Report

Unnamed: 0,ERRORS,VALUES
0,MAE,0.256318
1,MSE,0.115721
2,R2,0.427132


### KNN


In [14]:
KNN = KNeighborsClassifier(n_neighbors = 4).fit(x_train,y_train)
predictions = KNN.predict(x_test)

In [15]:
KNN_Accuracy_Score = accuracy_score(y_test, predictions)
KNN_JaccardIndex = jaccard_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions)
KNN_Accuracy_Score

0.8183206106870229

### Decision Tree


In [16]:
Tree = DecisionTreeClassifier()
Tree.fit(x_train,y_train)
predictions = Tree.predict(x_test)

In [17]:
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_JaccardIndex = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)

### Logistic Regression
#### `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `1`.


In [18]:
x_train, x_test, y_train, y_test = train_test_split( features, Y, test_size=0.2, random_state=1) 
LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train,y_train)
predictions = LR.predict(x_test)

In [19]:
LR_Accuracy_Score = accuracy_score(y_test, predictions)
LR_JaccardIndex = jaccard_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions)
predictions_prob = LR.predict_proba(x_test)
LR_Log_Loss = log_loss(y_test, predictions_prob)

### SVM


In [20]:
SVM = svm.SVC()
SVM.fit(x_train,y_train)
predictions = SVM.predict(x_test)

In [21]:
SVM_Accuracy_Score = accuracy_score(y_test, predictions)
SVM_JaccardIndex = jaccard_score(y_test, predictions)
SVM_F1_Score = f1_score(y_test, predictions)

### Report
#### Accuracy,Jaccard Index,F1-Score and LogLoss in a tabular format using data frame for all of the above models.

\*LogLoss is only for Logistic Regression Model


In [22]:
Report = pd.DataFrame({'Metrics':['Accuracy','Jaccard Index','F1-Score','LogLoss'],
                       'KNN':[KNN_Accuracy_Score,KNN_JaccardIndex,KNN_F1_Score,'NA'],
                       'Decision Tree':[Tree_Accuracy_Score,Tree_JaccardIndex,Tree_F1_Score,'NA'],
                       'Logistic Regression':[LR_Accuracy_Score,LR_JaccardIndex,LR_F1_Score,LR_Log_Loss],
                       'SVM':[SVM_Accuracy_Score, SVM_JaccardIndex, SVM_F1_Score,'NA']})
Report

Unnamed: 0,Metrics,KNN,Decision Tree,Logistic Regression,SVM
0,Accuracy,0.818321,0.741985,0.827481,0.722137
1,Jaccard Index,0.425121,0.376384,0.484018,0.0
2,F1-Score,0.59661,0.546917,0.652308,0.0
3,LogLoss,,,0.380085,
