## Importing libraries

In [1]:
import numpy as np
import pandas as pd

## Importing Dataset

In [12]:
data = pd.read_csv("weatherAUS.csv")
X = data.iloc[:,[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]].values
Y = data.iloc[:,-1].values

In [15]:
print(X)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Uluru' 5.4 26.9 ... 12.5 26.1 'No']
 ['Uluru' 7.8 27.0 ... 15.1 26.0 'No']
 ['Uluru' 14.9 nan ... 15.0 20.9 'No']]


In [16]:
print(Y)

['No' 'No' 'No' ... 'No' 'No' nan]


In [18]:
Y = Y.reshape(-1,1) #changing 1D to 2D list

In [19]:
Y

array([['No'],
       ['No'],
       ['No'],
       ...,
       ['No'],
       ['No'],
       [nan]], dtype=object)

## Data Processing

In [20]:
#replacing the Nan values with the most frequent values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
X = imputer.fit_transform(X)
Y = imputer.fit_transform(Y)


In [21]:
X

array([['Albury', 13.4, 22.9, ..., 16.9, 21.8, 'No'],
       ['Albury', 7.4, 25.1, ..., 17.2, 24.3, 'No'],
       ['Albury', 12.9, 25.7, ..., 21.0, 23.2, 'No'],
       ...,
       ['Uluru', 5.4, 26.9, ..., 12.5, 26.1, 'No'],
       ['Uluru', 7.8, 27.0, ..., 15.1, 26.0, 'No'],
       ['Uluru', 14.9, 20.0, ..., 15.0, 20.9, 'No']], dtype=object)

In [22]:
Y

array([['No'],
       ['No'],
       ['No'],
       ...,
       ['No'],
       ['No'],
       ['No']], dtype=object)

In [23]:
from sklearn.preprocessing import LabelEncoder
LE1 = LabelEncoder()
X[:,0] = LE1.fit_transform(X[:,0])
LE2 = LabelEncoder()
X[:,4] = LE2.fit_transform(X[:,4])
LE3 = LabelEncoder()
X[:,6] = LE3.fit_transform(X[:,6])
LE4 = LabelEncoder()
X[:,7] = LE4.fit_transform(X[:,7])
LE5 = LabelEncoder()
X[:,-1] = LE5.fit_transform(X[:,-1])
LE6 = LabelEncoder()
Y = LE6.fit_transform(Y)

  y = column_or_1d(y, warn=True)


In [24]:
X

array([[2, 13.4, 22.9, ..., 16.9, 21.8, 0],
       [2, 7.4, 25.1, ..., 17.2, 24.3, 0],
       [2, 12.9, 25.7, ..., 21.0, 23.2, 0],
       ...,
       [41, 5.4, 26.9, ..., 12.5, 26.1, 0],
       [41, 7.8, 27.0, ..., 15.1, 26.0, 0],
       [41, 14.9, 20.0, ..., 15.0, 20.9, 0]], dtype=object)

In [25]:
Y

array([0, 0, 0, ..., 0, 0, 0])

## Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [28]:
X

array([[-1.53166617,  0.19132753, -0.04135977, ..., -0.01407077,
         0.02310362, -0.52979545],
       [-1.53166617, -0.75105231,  0.26874452, ...,  0.03244663,
         0.387799  , -0.52979545],
       [-1.53166617,  0.11279588,  0.35331842, ...,  0.62166712,
         0.22733303, -0.52979545],
       ...,
       [ 1.20928479, -1.06517892,  0.52246622, ..., -0.69632607,
         0.65037966, -0.52979545],
       [ 1.20928479, -0.68822699,  0.53656187, ..., -0.29317521,
         0.63579185, -0.52979545],
       [ 1.20928479,  0.42692249, -0.45013361, ..., -0.30868102,
        -0.10818671, -0.52979545]])

## Splitting the Dataset into Trainingset and Testset

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=0)

In [31]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100,random_state=0)
classifier.fit(X_train,Y_train)


RandomForestClassifier(random_state=0)

In [32]:
classifier.score(X_train,Y_train)

0.9999450020624226

In [33]:
y_pred = classifier.predict(X_test)

In [34]:
print(y_pred)

[0 0 0 ... 0 1 0]


In [35]:
#since Y is encoded its values are in the form of 0's and 1's
#converting 0's and 1's into "Yes" or "No" using inverse transform
y_pred = LE6.inverse_transform(y_pred)

In [36]:
print(y_pred)

['No' 'No' 'No' ... 'No' 'Yes' 'No']


In [37]:
Y_test = LE6.inverse_transform(Y_test)

In [38]:
Y_test

array(['Yes', 'Yes', 'No', ..., 'No', 'Yes', 'No'], dtype=object)

In [39]:
y_pred = y_pred.reshape(-1,1)
Y_test = Y_test.reshape(-1,1)

In [41]:
df = np.concatenate((Y_test,y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=["Rains on Tomorrow", "Prediction of Rain"])

In [42]:
dataframe

Unnamed: 0,Rains on Tomorrow,Prediction of Rain
0,Yes,No
1,Yes,No
2,No,No
3,No,Yes
4,No,No
...,...,...
36360,Yes,No
36361,No,No
36362,No,No
36363,Yes,Yes


In [43]:
dataframe.to_csv("Rain_prediction.csv")

## Calculating accuracy

In [44]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,y_pred)

0.8544754571703561