In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
df=pd.read_csv('weatherAUS.csv')
df.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No


In [3]:
# listing all the categorical attributes in the dataframe for one hot encoding
cat = df.select_dtypes(include=['object', 'category']).columns
cat

Index(['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
       'RainToday', 'RainTomorrow'],
      dtype='object')

In [4]:
# preprocessing the data
# Encode Categorical Variables
df= pd.get_dummies(df, columns=cat, drop_first=True)

In [5]:
#checking for null values
df.isnull().sum().sum()

311600

In [6]:
# replacing the nan values with the mean of the column
df=df.fillna(df.mean())

In [8]:
df.isnull().sum().sum()

0

In [9]:
df.head(2)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_Yes,RainTomorrow_Yes
0,13.4,22.9,0.6,5.468232,7.611178,44.0,20.0,24.0,71.0,22.0,...,0,0,0,0,0,0,1,0,0,0
1,7.4,25.1,0.0,5.468232,7.611178,44.0,4.0,22.0,44.0,25.0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# spliting the data for feature (x) and target (y) variable
x=df.drop(['RainTomorrow_Yes'],axis=1)
y=df['RainTomorrow_Yes']

In [11]:
# introducing a bias term
x=np.column_stack((np.zeros(len(x)),x))

In [12]:
# Normalize the feature
x=StandardScaler().fit_transform(x)

In [13]:
# split the data into test data and training data
x,x_test,y,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [15]:
# Logistic function
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [16]:
# define the cost function
def cost(y,ycap):
    m=y.shape[0]
    return -np.sum(y*np.log(ycap)+(1-y)*np.log(1-ycap))/m

In [19]:
# define the gradient descent function
def gradient(x,y,ycap,w,b,alpha):
    m=y.shape[0]
    dw=(1/m)*np.dot(x.T,(ycap-y))
    db=(1/m)*sum(y-ycap)
    return w-(alpha*dw),b-(alpha*db)

In [20]:
# define the function to train the model
def train(x,y,num,alpha):
    w=np.zeros(x.shape[1])
    b=0
    for _ in range(num):
        z=np.dot(x,w)+b
        ycap=sigmoid(z)
        loss=cost(y,ycap)
        w,b=gradient(x,y,ycap,w,b,alpha)
    return w,b

In [21]:
# define the prediction function
def predict(x,w,b):
    z=np.dot(x,w)+b
    ycap=sigmoid(z)
    return ycap

In [23]:
# training the model
num=1000
alpha=0.01
w_final,b_final=train(x,y,num,alpha)


In [24]:
# testing the test sample
y_prediction=predict(x_test,w_final,b_final)

In [33]:
y_test=y_test.astype(int)
y_test=np.round(y_test).astype(int)
y_prediction=y_prediction.astype(int)
y_prediction=np.round(y_prediction).astype(int)

In [34]:
from sklearn.metrics import accuracy_score
acc=accuracy_score(y_test,y_prediction)
acc

0.7793551491819056