In [1]:
# importing necessary packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Machine Learning models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Handle warnings
# NOTE(review): this filter hides every warning category for the rest of the
# notebook, including deprecation notices raised by the libraries imported above.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw dataset from the CSV file located in the working directory.
data = pd.read_csv(filepath_or_buffer="Dataset.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Dataset.csv'

# Preprocessing Section

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
# head() is deterministic, so the displayed rows are identical on every
# Restart & Run All (an unseeded sample() would show different rows each run).
data.head()

In [None]:
# Get an overview of the dataset (columns, non-null counts and dtypes).
data.info()

In [None]:
# The "Date" column is stored as text; to use it as model input it is parsed
# and broken down into numeric Year / Month / Day columns, after which the
# original column is removed.
date_parts = pd.to_datetime(data["Date"]).dt
data = data.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
).drop(columns="Date")

In [None]:
# "Unnamed: 0" is only a leftover row index and carries no information
# about rainfall, so it is removed.
data = data.drop(columns="Unnamed: 0")

Convert all Nominal features to numeric
Note: in this part we convert the categorical features into numbers with simple label encoding, because we only want to perform visual analysis on them. Keep in mind that these features are not ordinal, so for modelling they should be converted to numerical features with one-hot encoding instead.

In [None]:
# The "Weather Station" column is treated as irrelevant for the rain
# forecast and is removed.
data = data.drop(columns="Weather Station")

In [None]:
# Work on an independent (deep) copy for the exploratory analysis so that the
# label encoding applied to it does not touch `data`.
analysis_data = data.copy()

In [None]:
# List the distinct categories present in this wind-direction column.
analysis_data["Gust Trajectory at  3 PM"].unique().tolist()

In [None]:
# "Gust Trajectory", "Gust Trajectory at  3 PM" and "Gust Trajectory at  9 AM"
# all use the same 16 compass directions, label-encoded here for the
# correlation analysis only.
# NOTE(review): code 2 is unused (the codes run 0-16); the labels are arbitrary
# nominal codes, so the original values are kept unchanged.
gust_direction_codes = {"WNW": 0, "WSW": 1, "E": 3, "NW": 4, "W":5, "SSE": 6, "ESE": 7, "ENE": 8, "NNW": 9, "SSW": 10, "SW": 11, "SE": 12, "N": 13, "S": 14, "NNE": 15, "NE": 16}

# Assign the result back to the column: calling replace(inplace=True) on a
# selected column operates on a temporary Series and leaves the frame untouched
# when pandas Copy-on-Write is active. astype(float) guarantees a numeric dtype
# (missing values stay NaN) and fails loudly if an unmapped label remains.
analysis_data["Gust Trajectory"] = (
    analysis_data["Gust Trajectory"].replace(gust_direction_codes).astype(float)
)

In [None]:
# Label-encode the 3 PM wind direction with the same codes used for the other
# gust-trajectory columns (code 2 is unused in the original mapping).
gust_direction_codes = {"WNW": 0, "WSW": 1, "E": 3, "NW": 4, "W":5, "SSE": 6, "ESE": 7, "ENE": 8, "NNW": 9, "SSW": 10, "SW": 11, "SE": 12, "N": 13, "S": 14, "NNE": 15, "NE": 16}

# Assign back instead of replace(inplace=True) on the selected column, which is
# a no-op on the frame under pandas Copy-on-Write; astype(float) guarantees a
# numeric dtype and raises if an unmapped label remains.
analysis_data["Gust Trajectory at  3 PM"] = (
    analysis_data["Gust Trajectory at  3 PM"].replace(gust_direction_codes).astype(float)
)

In [None]:
# Label-encode the 9 AM wind direction with the same codes used for the other
# gust-trajectory columns (code 2 is unused in the original mapping).
gust_direction_codes = {"WNW": 0, "WSW": 1, "E": 3, "NW": 4, "W":5, "SSE": 6, "ESE": 7, "ENE": 8, "NNW": 9, "SSW": 10, "SW": 11, "SE": 12, "N": 13, "S": 14, "NNE": 15, "NE": 16}

# Assign back instead of replace(inplace=True) on the selected column, which is
# a no-op on the frame under pandas Copy-on-Write; astype(float) guarantees a
# numeric dtype and raises if an unmapped label remains.
analysis_data["Gust Trajectory at  9 AM"] = (
    analysis_data["Gust Trajectory at  9 AM"].replace(gust_direction_codes).astype(float)
)

In [None]:
# "Rain that day" and "Rain the day after" are binary Yes/No columns; encode
# this one as 1/0 for the correlation analysis.
# The result is assigned back to the column because replace(inplace=True) on a
# selected column does not modify the frame under pandas Copy-on-Write.
# astype(float) guarantees a numeric dtype while keeping missing values as NaN.
analysis_data["Rain the day after"] = (
    analysis_data["Rain the day after"].replace({"Yes": 1, "No": 0}).astype(float)
)

In [None]:
# Encode the binary Yes/No column as 1/0 for the correlation analysis.
# The result is assigned back to the column because replace(inplace=True) on a
# selected column does not modify the frame under pandas Copy-on-Write.
# astype(float) guarantees a numeric dtype while keeping missing values as NaN.
analysis_data["Rain that day"] = (
    analysis_data["Rain that day"].replace({"Yes": 1, "No": 0}).astype(float)
)

We try to check the correlation and the degree of correlation between the data using the heatmap diagram. In such a situation, we can identify features that are exactly the same or are so correlated that they can be used interchangeably, and remove one of them at will.

In [None]:
# Pairwise correlation between all (label-encoded) features.
corr = analysis_data.corr()

# With square=True every cell is drawn as a square, so the canvas should be
# roughly square as well; a 100-inch-wide figure leaves the heatmap as a small
# patch in a very wide, mostly empty image.
fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5, ax=ax)
ax.set_title("Correlation Heatmap of DataFrame Features")
plt.show()

According to the correlation plot, there is a direct relationship between the minimum and maximum temperature and the temperatures recorded at the reporting hours. This can be interpreted as follows: the minimum temperature of each day is reported at 9:00 AM and the maximum temperature of each day is reported at 3:00 PM. We can therefore remove the two features recorded at 9 AM and 3 PM, because they are practically the same as the minimum and maximum temperature.

In [None]:
# Remove the two recorded-temperature columns from the modelling frame.
redundant_temperature_columns = ["Recorded Temperature at 3 PM", "Recorded Temperature at 9 AM"]
data = data.drop(columns=redundant_temperature_columns)

In [None]:
# Mean of the "Rainfall" column for every calendar month, as a two-column frame.
avg_rainfall_per_month = analysis_data.groupby("Month", as_index=False)["Rainfall"].mean()

# Bar chart of the monthly means, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=avg_rainfall_per_month, x='Month', y='Rainfall', ax=ax)
ax.set_title('Average Rainfall by Milimeter per month')
ax.set_xlabel('Month')
ax.set_ylabel('Average Rainfall by Milimeter')
plt.show()

As the bar plot of the average rainfall per month shows, rainfall is higher in the cold months of the year (winter and fall) than in the summer months, so the current month can also be useful in predicting rainfall.

Now it's time to check the missing values (NaN).

In [None]:
# Number of missing values in every column of the modelling frame ...
null_counts = data.isnull().sum()

# ... displayed with the most incomplete columns first.
null_counts.sort_values(ascending=False)

Features with more than 50,000 NaNs are dropped.

In [None]:
# Remove the columns that were judged to have too many missing values.
sparse_columns = ["Sunshine", "Evaporation", "Cloudiness at 3 PM", "Cloudiness at 9 AM", "Atmospheric Pressure at 3 PM"]
data = data.drop(columns=sparse_columns)

For features where the number of NaNs is between 7,000 and 50,000, we replace the missing values with the mean or mode of that feature.

In [None]:
# Fill the gaps in these two numeric columns with the column mean.
features = ["Atmospheric Pressure at 9 AM", "Air Velocity"]
imputer = SimpleImputer(strategy="mean")
imputer.fit(data[features])
data[features] = imputer.transform(data[features])

For features where the number of NaNs is below 5,000, we delete the corresponding rows from the dataset.

In [None]:
# Columns with only a few missing values: every row that lacks a value in ANY
# of them is removed. A single dropna() call with a list of labels gives the
# same rows as dropping column by column, needs one pass instead of ten, and
# does not rely on newer pandas versions accepting a scalar `subset`.
required_columns = [
    "Minimum Temperature",
    "Maximum Temperature",
    "Rainfall",
    "Gust Trajectory at  3 PM",
    "Air Velocity at  9 AM",
    "Air Velocity at  3 PM",
    "Moisture Level at 9 AM",
    "Moisture Level at 3 PM",
    "Rain that day",
    "Rain the day after",
]
data = data.dropna(subset=required_columns)

## Model Selection

Now, in this part, we convert categorical features that aren't ordinal into numerical features using the One-Hot Encoding method.

In [None]:
# One-hot encode the nominal columns; every category becomes its own 0/1
# integer column and the original columns are replaced.
nominal_columns = [
    "Gust Trajectory",
    "Gust Trajectory at  3 PM",
    "Gust Trajectory at  9 AM",
    "Rain that day",
]
data = pd.get_dummies(data, columns=nominal_columns, dtype=int)

In [None]:
# Encode the target as 1 (rain) / 0 (no rain).
# The result is assigned back to the column: replace(inplace=True) on a
# selected column works on a temporary Series and leaves `data` unchanged when
# pandas Copy-on-Write is active, which would leave string labels in the target.
# astype(int) guarantees an integer dtype for the classifiers and metrics and
# raises immediately if an unexpected or missing label is still present.
data["Rain the day after"] = (
    data["Rain the day after"].replace({"Yes": 1, "No": 0}).astype(int)
)

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
target_column = "Rain the day after"
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
scaler = StandardScaler()

scaled_X = scaler.fit_transform(X)

### Resampling

In [None]:
oversample = RandomOverSampler(sampling_strategy="minority")
X_over, y_over = oversample.fit_resample(scaled_X, y)

In [None]:
X_over.shape

In [None]:
y_over.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=31)

## Creating Decision Tree model

In [None]:
# Create the decision tree model and train it on the training split.
# random_state is fixed because the tree breaks ties between equally good
# splits randomly; without a seed the fitted tree, its scores and its feature
# importances change from run to run.
DTs_model = DecisionTreeClassifier(random_state=31)

DTs_model.fit(X_train, y_train)

In [None]:
# Let the fitted decision tree predict the target for every row of the test
# split, based on the values of its features.
y_pred = DTs_model.predict(X=X_test)

### Evaluating our model

In [None]:
# Confusion matrix of the true test labels versus the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy of the current predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision of the current predictions on the test split.
precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall of the current predictions on the test split.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the current predictions on the test split.
f1_score(y_true=y_test, y_pred=y_pred)

## Feature Selection

Feature Selection using decision tree:
At every node, the decision tree picks the feature that gives the best separation of the samples according to an impurity criterion (Gini impurity is the scikit-learn default, which is what the model above uses; entropy / information gain is an alternative).
From this point of view, the importances learned by the tree can be used to select the most important features.

In [None]:
# Pair every input column with the importance the fitted decision tree assigned
# to it, order the table from most to least important and keep the top 16 rows.
importance_table = pd.DataFrame(
    {"Feature": X.columns, "Importance": DTs_model.feature_importances_}
)
features_importances = importance_table.sort_values(by="Importance", ascending=False).head(16)

In [None]:
# Names of the selected features, as a plain Python list.
list(features_importances["Feature"])

We select 16 of the most important features and start training the models:

In [None]:
# Keep only the columns listed in the importance table (all rows are retained).
data = data.loc[:, features_importances["Feature"].tolist()]

Using new dataset with new and important features

In [None]:
# Use the current `data` frame as the feature matrix.
# NOTE(review): presumably `data` holds only the selected feature columns at
# this point (no target column) - confirm the selection cell has been run.
X = data

In [None]:
scaler = StandardScaler()

scaled_X = scaler.fit_transform(X)

Our target values are imbalanced. in such case we have to balance them. we used resampling method for doing that

In [None]:
oversample = RandomOverSampler(sampling_strategy="minority")
X_over, y_over = oversample.fit_resample(scaled_X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=31)

# Creating model section

### KNN

### Creating and Training model

In [None]:
# Build a K-Nearest-Neighbors classifier with its default settings and fit it
# on the training split.
KNN_model = KNeighborsClassifier()

KNN_model.fit(X=X_train, y=y_train)

In [None]:
# Let the fitted KNN model predict the target for every row of the test split,
# based on the values of its features.
y_pred = KNN_model.predict(X=X_test)

### Evaluating our model

In [None]:
# Confusion matrix of the true test labels versus the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy of the current predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision of the current predictions on the test split.
precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall of the current predictions on the test split.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the current predictions on the test split.
f1_score(y_true=y_test, y_pred=y_pred)

## SVM

### Creating and Training model

In [None]:
# The SVM is trained and evaluated on capped subsamples (presumably to keep
# the fitting time of the linear SVC manageable).
# The caps are applied to the EXISTING train/test partitions instead of
# re-splitting the resampled pool, so the SVM test rows stay disjoint from its
# training rows, and a partition smaller than its cap is simply used in full
# (fixed absolute sizes raise a ValueError when fewer rows are available).
SVM_MAX_TRAIN_ROWS = 40000
SVM_MAX_TEST_ROWS = 20000

if len(y_train) > SVM_MAX_TRAIN_ROWS:
    # Keep a random subset of the training rows; the remainder is discarded.
    X_train, _X_train_unused, y_train, _y_train_unused = train_test_split(
        X_train, y_train, train_size=SVM_MAX_TRAIN_ROWS, random_state=31
    )

if len(y_test) > SVM_MAX_TEST_ROWS:
    # Keep a random subset of the test rows; the remainder is discarded.
    X_test, _X_test_unused, y_test, _y_test_unused = train_test_split(
        X_test, y_test, train_size=SVM_MAX_TEST_ROWS, random_state=31
    )

In [None]:
# Build a Support Vector Machine classifier with a linear kernel and fit it on
# the training split.
SVM_model = SVC(kernel="linear")

SVM_model.fit(X=X_train, y=y_train)

In [None]:
# Let the fitted SVM predict the target for every row of the test split.
y_pred = SVM_model.predict(X=X_test)

### Evaluating our model

In [None]:
# Confusion matrix of the true test labels versus the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy of the current predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision of the current predictions on the test split.
precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall of the current predictions on the test split.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score of the current predictions on the test split.
f1_score(y_true=y_test, y_pred=y_pred)