# Project : Rain Prediction in Australia

## Weather Prediction Using Machine Learning: A Comprehensive Classification Analysis

### This project employs various classification algorithms—including Linear Regression, KNN, Decision Trees, Logistic Regression, and SVM—to predict weather conditions based on historical data. By evaluating model performance through accuracy, Jaccard Index, F1-Score, and LogLoss, we aim to identify the most effective approach for forecasting rain tomorrow.

In [1]:
# Surpress warnings:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

## Importing the Dataset

In [7]:
import pandas as pd

# Define the URL
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv"

# Load the dataset directly into a DataFrame
df = pd.read_csv(url)

# Display the first few rows of the DataFrame
print(df.head())


       Date  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  \
0  2/1/2008     19.5     22.4      15.6          6.2       0.0           W   
1  2/2/2008     19.5     25.6       6.0          3.4       2.7           W   
2  2/3/2008     21.6     24.5       6.6          2.4       0.1           W   
3  2/4/2008     20.2     22.8      18.8          2.2       0.0           W   
4  2/5/2008     19.7     25.7      77.4          4.8       0.0           W   

   WindGustSpeed WindDir9am WindDir3pm  ...  Humidity9am  Humidity3pm  \
0             41          S        SSW  ...           92           84   
1             41          W          E  ...           83           73   
2             41        ESE        ESE  ...           88           86   
3             41        NNE          E  ...           83           90   
4             41        NNE          W  ...           88           74   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1017

In [9]:
#from pyodide.http import pyfetch

#async def download(url, filename):
#   response = await pyfetch(url)
#    if response.status == 200:
#        with open(filename, "wb") as f:
#            f.write(await response.bytes())
            

In [12]:
#await download(path, "Weather_Data.csv")
#filename ="Weather_Data.csv"

In [13]:
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


## Data Preprocessing

In [14]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

In [15]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

## Training Data and Test Data

In [16]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [17]:
df_sydney_processed = df_sydney_processed.astype(float)

In [18]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

## Linear Regression

### Q1) Use the train_test_split function to split the features and Y dataframes with a test_size of 0.2 and the random_state set to 10.

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset directly from the URL
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv"
df = pd.read_csv(url)

# Perform one-hot encoding
df = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

# Handle missing values by filling with median/mode
df.fillna(df.median(numeric_only=True), inplace=True)
df.fillna(df.mode().iloc[0], inplace=True)

# Define features and target variable
X = df.drop(columns=['RainTomorrow', 'Date'], errors='ignore')
y = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

# Split the data into training and testing sets with a test_size of 0.2 and random_state set to 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

print("Data loaded and split successfully!")


Data loaded and split successfully!


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, jaccard_score, f1_score

# Initialize the linear regression model
lin_reg = LinearRegression()

# Fit the model
lin_reg.fit(X_train, y_train)

# Make predictions
y_pred_lin_reg = lin_reg.predict(X_test)

# Since linear regression provides continuous output, we need to threshold it to get binary predictions
y_pred_lin_reg_binary = (y_pred_lin_reg >= 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_lin_reg_binary)
jaccard = jaccard_score(y_test, y_pred_lin_reg_binary)
f1 = f1_score(y_test, y_pred_lin_reg_binary)
mae = mean_absolute_error(y_test, y_pred_lin_reg)
mse = mean_squared_error(y_test, y_pred_lin_reg)
r2 = r2_score(y_test, y_pred_lin_reg)

print("Linear Regression:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Jaccard Index: {jaccard:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")


Linear Regression:
Accuracy: 0.84
Jaccard Index: 0.51
F1-Score: 0.68
Mean Absolute Error: 0.26
Mean Squared Error: 0.12
R2-Score: 0.43


In [22]:
# Create a summary table if it doesn't exist
try:
    results
except NameError:
    results = pd.DataFrame(columns=["Model", "Accuracy", "Jaccard Index", "F1-Score", "LogLoss"])

# Append linear regression results to the summary table using pd.concat
new_result = pd.DataFrame([{
    "Model": "Linear Regression",
    "Accuracy": accuracy,
    "Jaccard Index": jaccard,
    "F1-Score": f1,
    "LogLoss": "N/A"  # LogLoss is not applicable for linear regression
}])

results = pd.concat([results, new_result], ignore_index=True)

results


Unnamed: 0,Model,Accuracy,Jaccard Index,F1-Score,LogLoss
0,Linear Regression,0.836641,0.513636,0.678679,


### Q2) Create and train a Linear Regression model called LinearReg using the training data (x_train, y_train)

In [23]:
#Enter Your Code and Efrom sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
LinearReg = LinearRegression()

# Train the model using the training data
LinearReg.fit(X_train, y_train)


In [24]:
print(f"Coefficients: {LinearReg.coef_}")
print(f"Intercept: {LinearReg.intercept_}")


Coefficients: [-2.36946709e-02  1.30528498e-02  7.30741811e-04  6.48086479e-03
 -3.51665228e-02  4.23576504e-03  1.83177620e-03  7.91415854e-04
  9.59311783e-04  8.55637076e-03  7.70190836e-03 -9.24845557e-03
 -8.86878011e-03  1.00413073e-02  1.44858937e-02 -3.54592115e-03
 -7.74520021e+10 -7.74520021e+10  2.11152148e+09  2.11152148e+09
  2.11152148e+09  2.11152148e+09  2.11152148e+09  2.11152148e+09
  2.11152148e+09  2.11152148e+09  2.11152148e+09  2.11152148e+09
  2.11152148e+09  2.11152148e+09  2.11152148e+09  2.11152148e+09
  2.11152148e+09  2.11152148e+09  7.65756278e+09  7.65756278e+09
  7.65756278e+09  7.65756278e+09  7.65756278e+09  7.65756278e+09
  7.65756278e+09  7.65756278e+09  7.65756278e+09  7.65756278e+09
  7.65756278e+09  7.65756278e+09  7.65756278e+09  7.65756278e+09
  7.65756278e+09  7.65756278e+09 -9.48859411e+09 -9.48859411e+09
 -9.48859411e+09 -9.48859411e+09 -9.48859411e+09 -9.48859411e+09
 -9.48859411e+09 -9.48859411e+09 -9.48859411e+09 -9.48859411e+09
 -9.4885941

In [25]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, jaccard_score, f1_score

# Make predictions using the testing data
y_pred_lin_reg = LinearReg.predict(X_test)

# Since linear regression provides continuous output, we need to threshold it to get binary predictions
y_pred_lin_reg_binary = (y_pred_lin_reg >= 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_lin_reg_binary)
jaccard = jaccard_score(y_test, y_pred_lin_reg_binary)
f1 = f1_score(y_test, y_pred_lin_reg_binary)
mae = mean_absolute_error(y_test, y_pred_lin_reg)
mse = mean_squared_error(y_test, y_pred_lin_reg)
r2 = r2_score(y_test, y_pred_lin_reg)

print("Linear Regression:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Jaccard Index: {jaccard:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")


Linear Regression:
Accuracy: 0.84
Jaccard Index: 0.51
F1-Score: 0.68
Mean Absolute Error: 0.26
Mean Squared Error: 0.12
R2-Score: 0.43


In [26]:
# Create a summary table if it doesn't exist
try:
    results
except NameError:
    results = pd.DataFrame(columns=["Model", "Accuracy", "Jaccard Index", "F1-Score", "LogLoss"])

# Append linear regression results to the summary table using pd.concat
new_result = pd.DataFrame([{
    "Model": "Linear Regression",
    "Accuracy": accuracy,
    "Jaccard Index": jaccard,
    "F1-Score": f1,
    "LogLoss": "N/A"  # LogLoss is not applicable for linear regression
}])

results = pd.concat([results, new_result], ignore_index=True)

results


Unnamed: 0,Model,Accuracy,Jaccard Index,F1-Score,LogLoss
0,Linear Regression,0.836641,0.513636,0.678679,
1,Linear Regression,0.836641,0.513636,0.678679,


### Q3) Now use the predict method on the testing data (x_test) and save it to the array predictions.¶

In [27]:
#Enter # Use the predict method on the testing data (X_test) to make predictions
predictions = LinearReg.predict(X_test)

# Print the predictions
print(predictions)


[ 1.31927490e-01  2.76077271e-01  9.78149414e-01  2.87460327e-01
  1.32339478e-01  4.60342407e-01  3.56857300e-01  8.56338501e-01
  6.75018311e-01  3.84063721e-02  4.85229492e-03  2.81280518e-01
  3.39141846e-01  7.80792236e-02  6.25915527e-02  5.64392090e-01
 -6.14166260e-02  5.24108887e-01  1.53717041e-01  3.59695435e-01
  6.05468750e-02  9.03564453e-01  4.67544556e-01  2.03338623e-01
 -7.10449219e-02  3.83758545e-01  5.36071777e-01 -2.29034424e-02
  6.40258789e-01 -9.57489014e-02  3.78128052e-01  1.20330811e-01
 -1.81579590e-02  5.54809570e-02  5.63461304e-01  1.06292725e+00
 -6.77490234e-03  5.14404297e-01 -8.83789062e-02  6.92138672e-02
  2.45208740e-02  8.71704102e-01  2.44689941e-01  3.94775391e-01
  2.67593384e-01  4.46807861e-01 -4.75158691e-02  1.89468384e-01
  7.76611328e-01  1.57775879e-01  3.86047363e-03 -5.19561768e-02
  2.07412720e-01 -2.07794189e-01 -7.62786865e-02  2.49572754e-01
  2.79281616e-01  6.02783203e-01  6.29592896e-01  4.90631104e-01
  5.66406250e-02  1.05422

### Q4) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [28]:
from sklearn.metrics import (
    accuracy_score,
    jaccard_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

# Threshold predictions to get binary outcomes
y_pred_binary = (predictions >= 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
jaccard = jaccard_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Print the results
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Jaccard Index: {jaccard:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")


Evaluation Metrics:
Accuracy: 0.84
Jaccard Index: 0.51
F1-Score: 0.68
Mean Absolute Error: 0.26
Mean Squared Error: 0.12
R2-Score: 0.43


### Q5) Show the MAE, MSE, and R2 in a tabular format using data frame for the linear model.

In [29]:
#Enter Yourimport pandas as pd

# Create a DataFrame to hold the metrics
linear_metrics = pd.DataFrame({
    "Metric": ["Mean Absolute Error", "Mean Squared Error", "R2 Score"],
    "Value": [mae, mse, r2]
})

# Display the metrics DataFrame
print(linear_metrics)


                Metric     Value
0  Mean Absolute Error  0.256325
1   Mean Squared Error  0.115723
2             R2 Score  0.427119


## KNN

### Q6) Create and train a KNN model called KNN using the training data (x_train, y_train) with the n_neighbors parameter set to 4

In [31]:
#Enter Your Code and Execute
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN model with n_neighbors set to 4
KNN = KNeighborsClassifier(n_neighbors=4)

# Train the model using the training data
KNN.fit(X_train, y_train)


### Q7) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [32]:
#Enter # Use the predict method on the testing data (X_test) to make predictions
predictions = KNN.predict(X_test)

# Print the predictions
print(predictions)


[0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

### Q8) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [33]:
#Enter Your Code and Execute
from sklearn.metrics import (
    accuracy_score,
    jaccard_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
jaccard = jaccard_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Print the results
print("KNN Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Jaccard Index: {jaccard:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")


KNN Evaluation Metrics:
Accuracy: 0.82
Jaccard Index: 0.43
F1-Score: 0.60
Mean Absolute Error: 0.18
Mean Squared Error: 0.18
R2-Score: 0.10


## Decision Tree

### Q9) Create and train a Decision Tree model called Tree using the training data (x_train, y_train).

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    jaccard_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)


In [35]:
#Enterfrom sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
Tree = DecisionTreeClassifier()

# Train the model using the training data
Tree.fit(X_train, y_train)


### Q10) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [36]:
#Enter # Use the predict method on the testing data (X_test) to make predictions
predictions = Tree.predict(X_test)

# Print the predictions
print(predictions)


[0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 1 1
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0
 0 0 0 1 1 0 0 1 0 0 1 1 1 0 0 1 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1
 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 1 0 1
 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1
 1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 

### Q11) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function

In [37]:
#Enter Y# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
jaccard = jaccard_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Print the results
print("Decision Tree Evaluation Metrics:")
print(f"Tree_Accuracy: {accuracy:.2f}")
print(f"Tree_Jaccard Index: {jaccard:.2f}")
print(f"Tree_F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")


Decision Tree Evaluation Metrics:
Tree_Accuracy: 0.74
Tree_Jaccard Index: 0.38
Tree_F1-Score: 0.55
Mean Absolute Error: 0.26
Mean Squared Error: 0.26
R2-Score: -0.28


## Logistic Regression

### Q12) Use the train_test_split function to split the features and Y dataframes with a test_size of 0.2 and the random_state set to 1

In [38]:
#Enterfrom sklearn.model_selection import train_test_split

# Define features and target variable again if needed
X = df.drop(columns=['RainTomorrow', 'Date'], errors='ignore')  # Drop 'Date' if it exists
y = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

# Use train_test_split with test_size of 0.2 and random_state set to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


### Q13) Create and train a LogisticRegression model called LR using the training data (x_train, y_train) with the solver parameter set to liblinear.

In [41]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model with solver set to 'liblinear'
LR = LogisticRegression(solver='liblinear')

# Train the model using the training data
LR.fit(X_train, y_train)
#Enter Your Code and Execute

### Q14) Now, use the predict and predict_proba methods on the testing data (x_test) and save it as 2 arrays predictions and predict_proba.

In [42]:
#Enter# Use the predict method to make predictions
predictions = LR.predict(X_test)

# Use the predict_proba method to get probability estimates
predict_proba = LR.predict_proba(X_test)

# Print the predictions and predicted probabilities
print("Predictions:", predictions)
print("Predicted Probabilities:", predict_proba)


Predictions: [0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1
 1 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1
 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1
 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0
 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 1

### Q15) Using the predictions, predict_proba and the y_test dataframe calculate the value for each metric using the appropriate function.

In [43]:

from sklearn.metrics import (
    accuracy_score,
    jaccard_score,
    f1_score,
    log_loss,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
jaccard = jaccard_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
logloss = log_loss(y_test, predict_proba)

# Print the results
print("Logistic Regression Evaluation Metrics:")
print(f"LR_Accuracy: {accuracy:.2f}")
print(f"LR_Jaccard Index: {jaccard:.2f}")
print(f"LR_F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")
print(f"LR_LogLoss: {logloss:.2f}")


Logistic Regression Evaluation Metrics:
LR_Accuracy: 0.83
LR_Jaccard Index: 0.50
LR_F1-Score: 0.67
Mean Absolute Error: 0.17
Mean Squared Error: 0.17
R2-Score: 0.17
LR_LogLoss: 0.38


## SVM

### Q16) Create and train a SVM model called SVM using the training data (x_train, y_train).

In [44]:

from sklearn import svm

# Initialize the SVM model
SVM = svm.SVC(probability=True)  # Set probability=True for predict_proba

# Train the model using the training data
SVM.fit(X_train, y_train)


### Q17) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [45]:
#Enter Your Code and Execute# Use the predict method on the testing data (X_test) to make predictions
predictions = SVM.predict(X_test)

# Print the predictions
print(predictions)



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

### Q18) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [46]:
from sklearn.metrics import (
    accuracy_score,
    jaccard_score,
    f1_score,
    log_loss,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
jaccard = jaccard_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# For LogLoss, you need to calculate probabilities
predict_proba = SVM.predict_proba(X_test)
logloss = log_loss(y_test, predict_proba)

# Print the results
print("SVM Evaluation Metrics:")
print(f"SVM_Accuracy: {accuracy:.2f}")
print(f"SVM_Jaccard Index: {jaccard:.2f}")
print(f"SVM_F1-Score: {f1:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2-Score: {r2:.2f}")
print(f"LogLoss: {logloss:.2f}")


SVM Evaluation Metrics:
SVM_Accuracy: 0.72
SVM_Jaccard Index: 0.00
SVM_F1-Score: 0.00
Mean Absolute Error: 0.28
Mean Squared Error: 0.28
R2-Score: -0.38
LogLoss: 0.42


## Report

### Q19) Show the Accuracy,Jaccard Index,F1-Score and LogLoss in a tabular format using data frame for all of the above models.
*LogLoss is only for Logistic Regression Model

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, jaccard_score, f1_score, log_loss

# Load the dataset directly from the URL
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv"
df = pd.read_csv(url)

# Perform one-hot encoding
df = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)
df.fillna(df.mode().iloc[0], inplace=True)

# Define features and target variable
X = df.drop(columns=['RainTomorrow', 'Date'], errors='ignore')
y = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)


In [55]:
#Linear Regression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin_reg = lin_reg.predict(X_test)
y_pred_lin_reg_binary = (y_pred_lin_reg >= 0.5).astype(int)

accuracy_lr = accuracy_score(y_test, y_pred_lin_reg_binary)
jaccard_lr = jaccard_score(y_test, y_pred_lin_reg_binary)
f1_lr = f1_score(y_test, y_pred_lin_reg_binary)
logloss_lr = "N/A"


In [56]:
#KNN

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

accuracy_knn = accuracy_score(y_test, y_pred_knn)
jaccard_knn = jaccard_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)


In [57]:
#Decision Tree

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

accuracy_tree = accuracy_score(y_test, y_pred_tree)
jaccard_tree = jaccard_score(y_test, y_pred_tree)
f1_tree = f1_score(y_test, y_pred_tree)


In [58]:
#Logictic Regression

LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, y_train)
y_pred_lr = LR.predict(X_test)
predict_proba_lr = LR.predict_proba(X_test)[:, 1]

accuracy_lr = accuracy_score(y_test, y_pred_lr)
jaccard_lr = jaccard_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
logloss = log_loss(y_test, predict_proba_lr)


In [59]:
#SVM

SVM = svm.SVC()
SVM.fit(X_train, y_train)
y_pred_svm = SVM.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
jaccard_svm = jaccard_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)


In [61]:
models_results = [
    {"Model": "Linear Regression", "Accuracy": accuracy_lr, "Jaccard Index": jaccard_lr, "F1-Score": f1_lr, "LogLoss": logloss_lr},
    {"Model": "KNN", "Accuracy": accuracy_knn, "Jaccard Index": jaccard_knn, "F1-Score": f1_knn, "LogLoss": "N/A"},
    {"Model": "Decision Tree", "Accuracy": accuracy_tree, "Jaccard Index": jaccard_tree, "F1-Score": f1_tree, "LogLoss": "N/A"},
    {"Model": "Logistic Regression", "Accuracy": accuracy_lr, "Jaccard Index": jaccard_lr, "F1-Score": f1_lr, "LogLoss": logloss},
    {"Model": "SVM", "Accuracy": accuracy_svm, "Jaccard Index": jaccard_svm, "F1-Score": f1_svm, "LogLoss": "N/A"}
]

# Create a DataFrame to display the results
results = pd.DataFrame(models_results)

# Display the results
print(results)


                 Model  Accuracy  Jaccard Index  F1-Score  LogLoss
0    Linear Regression  0.836641       0.515837  0.680597      N/A
1                  KNN  0.818321       0.425121  0.596610      N/A
2        Decision Tree  0.748092       0.393382  0.564644      N/A
3  Logistic Regression  0.836641       0.515837  0.680597  0.35718
4                  SVM  0.719084       0.000000  0.000000      N/A
