In [40]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics

# Extracting data

In [41]:
data = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/weatherAUS.csv')
display(data.head(5))

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


# EDA and Feature Engineering

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [44]:
# Checking that it rained both today and tomorrow
data[(data['RainToday']=='Yes') & (data['RainTomorrow']=='Yes')].head(2)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
11,2008-12-12,Albury,15.9,21.7,2.2,,,NNE,31.0,NE,...,89.0,91.0,1010.5,1004.2,8.0,8.0,15.9,17.0,Yes,Yes
12,2008-12-13,Albury,15.9,18.6,15.6,,,W,61.0,NNW,...,76.0,93.0,994.3,993.0,8.0,8.0,17.4,15.8,Yes,Yes


In [45]:
# Checking that it didn't rain both today and tomorrow
data[(data['RainToday']=='No') & (data['RainTomorrow']=='No')].head(2)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No


### Evaluating each column for Categorical or Continous data

In [46]:
print("** Categorical Columns **")
for i in data.columns:
  if (len(data[i].unique()) < 25):
    print(i, len(data[i].unique()))

print("\n** Continous Columns **")
for i in data.columns:
  if (len(data[i].unique()) > 25):
    print(i, len(data[i].unique()))

** Categorical Columns **
WindGustDir 17
WindDir9am 17
WindDir3pm 17
Cloud9am 11
Cloud3pm 11
RainToday 3
RainTomorrow 3

** Continous Columns **
Date 3436
Location 49
MinTemp 390
MaxTemp 506
Rainfall 682
Evaporation 359
Sunshine 146
WindGustSpeed 68
WindSpeed9am 44
WindSpeed3pm 45
Humidity9am 102
Humidity3pm 102
Pressure9am 547
Pressure3pm 550
Temp9am 442
Temp3pm 503


In [47]:
# Now, checking if Categorical columns are Numerical or String

for i in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']:
  print(i, "\t\t:", data[i].dtype,"\t>>", data[i].sort_values().unique(),"\n")

WindGustDir 		: object 	>> ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW' nan] 

WindDir9am 		: object 	>> ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW' nan] 

WindDir3pm 		: object 	>> ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW' nan] 

Cloud9am 		: float64 	>> [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. nan] 

Cloud3pm 		: float64 	>> [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. nan] 

RainToday 		: object 	>> ['No' 'Yes' nan] 

RainTomorrow 		: object 	>> ['No' 'Yes' nan] 



In [48]:
# Now, checking if Continous columns are Numerical or String

for i in data.columns:
  if (i not in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']):
    print(i, "\t:", data[i].dtype)

Date 	: object
Location 	: object
MinTemp 	: float64
MaxTemp 	: float64
Rainfall 	: float64
Evaporation 	: float64
Sunshine 	: float64
WindGustSpeed 	: float64
WindSpeed9am 	: float64
WindSpeed3pm 	: float64
Humidity9am 	: float64
Humidity3pm 	: float64
Pressure9am 	: float64
Pressure3pm 	: float64
Temp9am 	: float64
Temp3pm 	: float64


Now the categorical and continous columns having data of string datatype, needs to be encoded

Since, there is no preference mentioned, so we should do one-hot encoding

### Checking for NULL values

In [49]:
(data.isnull().sum()/len(data)*100).sort_values()

Date              0.000000
Location          0.000000
MaxTemp           0.866905
MinTemp           1.020899
Temp9am           1.214767
WindSpeed9am      1.214767
Humidity9am       1.824557
WindSpeed3pm      2.105046
Rainfall          2.241853
RainToday         2.241853
RainTomorrow      2.245978
Temp3pm           2.481094
WindDir3pm        2.906641
Humidity3pm       3.098446
WindGustSpeed     7.055548
WindGustDir       7.098859
WindDir9am        7.263853
Pressure3pm      10.331363
Pressure9am      10.356799
Cloud9am         38.421559
Cloud3pm         40.807095
Evaporation      43.166506
Sunshine         48.009762
dtype: float64

These columns are having maximum NULL values and needs to be populated using Machine Learning method:

> Cloud9am         38.421559

> Cloud3pm         40.807095

> Evaporation      43.166506

> Sunshine         48.009762

For rest of the columns, the value can be populated using MODE

### Encoding

In [50]:
# Listing down columns that need to be encoded
for i in data.columns:
  if (data[i].dtype == 'object'):
    print(i)

Date
Location
WindGustDir
WindDir9am
WindDir3pm
RainToday
RainTomorrow


##### Encoding : Date

In [51]:
# Keeping only month from the DATE column

data['Date']=pd.to_datetime(data['Date'], format = '%Y-%m-%d').dt.month
data['Date'].unique()

array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

##### Encoding : Location, WindGustDir, WindDir9am, WindDir3pm

In [52]:
# Type 1 : Label Encoding 
# column names : Location, WindGustDir, WindDir9am, WindDir3pm
data_type1=data.copy()
data_type_bkp=data.copy()

from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
data_type1['Location']=label.fit_transform(data_type1['Location'])
data_type1['WindGustDir']=label.fit_transform(data_type1['WindGustDir'])
data_type1['WindDir9am']=label.fit_transform(data_type1['WindDir9am'])
data_type1['WindDir3pm']=label.fit_transform(data_type1['WindDir3pm'])

# Saving the mapping details
data_type_bkp['Location1']=label.fit_transform(data_type_bkp['Location'])
data_type_bkp['WindGustDir1']=label.fit_transform(data_type_bkp['WindGustDir'])
data_type_bkp['WindDir9am1']=label.fit_transform(data_type_bkp['WindDir9am'])
data_type_bkp['WindDir3pm1']=label.fit_transform(data_type_bkp['WindDir3pm'])
data_type_bkp[['Location1', 'Location', 'WindGustDir1', 'WindGustDir', 'WindDir9am1', 'WindDir9am', 'WindDir3pm1', 'WindDir3pm']].drop_duplicates().to_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/label_encoding_mapping.csv')
del data_type_bkp

In [53]:
# Type 2 : Manual Encoding 
# column names : Location, WindGustDir, WindDir9am, WindDir3pm
data_type2=data.copy()

##### Encoding : RainToday and RainTomorrow

In [54]:
# Encoding RainToday and RainTomorrow
data_encoded=data_type1.copy()

data_encoded['RainToday']=data_encoded['RainToday'].replace(['Yes'], 1)
data_encoded['RainToday']=data_encoded['RainToday'].replace(['No'], 0)

data_encoded['RainTomorrow']=data_encoded['RainTomorrow'].replace(['Yes'], 1)
data_encoded['RainTomorrow']=data_encoded['RainTomorrow'].replace(['No'], 0)

In [55]:
# Validation of encoded data
print(data_encoded['RainToday'].unique())
print(data_encoded['RainTomorrow'].unique())

print(data_encoded['RainToday'].mode())
print(data_encoded['RainTomorrow'].mode())

[ 0.  1. nan]
[ 0.  1. nan]
0    0.0
dtype: float64
0    0.0
dtype: float64


### Removing NULL values

##### Populating MODE : except for Cloud9am, Cloud3pm, Evaporation and Sunshine

In [56]:
# Populating NULL values using MODE except for these 'Cloud9am', 'Cloud3pm', 'Evaporation' and 'Sunshine' columns
for i in data_encoded.columns:
  if (i not in ['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine']):
    data_encoded[i].fillna(data_encoded[i].mode()[0], inplace=True)

# Validation
(data_encoded.isnull().sum()/len(data_encoded)*100).sort_values()

Date              0.000000
Temp3pm           0.000000
Temp9am           0.000000
Pressure3pm       0.000000
Pressure9am       0.000000
Humidity3pm       0.000000
Humidity9am       0.000000
WindSpeed3pm      0.000000
RainToday         0.000000
WindDir3pm        0.000000
WindSpeed9am      0.000000
WindGustSpeed     0.000000
WindGustDir       0.000000
Rainfall          0.000000
MaxTemp           0.000000
MinTemp           0.000000
Location          0.000000
WindDir9am        0.000000
RainTomorrow      0.000000
Cloud9am         38.421559
Cloud3pm         40.807095
Evaporation      43.166506
Sunshine         48.009762
dtype: float64

In [57]:
# Validation
for i in data_encoded.columns:
  print(i, data_encoded[i].sort_values().unique())

Date [ 1  2  3  4  5  6  7  8  9 10 11 12]
Location [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48]
MinTemp [-8.5 -8.2 -8.  -7.8 -7.6 -7.5 -7.3 -7.2 -7.1 -7.  -6.9 -6.8 -6.7 -6.6
 -6.5 -6.4 -6.3 -6.2 -6.1 -6.  -5.9 -5.8 -5.7 -5.6 -5.5 -5.4 -5.3 -5.2
 -5.1 -5.  -4.9 -4.8 -4.7 -4.6 -4.5 -4.4 -4.3 -4.2 -4.1 -4.  -3.9 -3.8
 -3.7 -3.6 -3.5 -3.4 -3.3 -3.2 -3.1 -3.  -2.9 -2.8 -2.7 -2.6 -2.5 -2.4
 -2.3 -2.2 -2.1 -2.  -1.9 -1.8 -1.7 -1.6 -1.5 -1.4 -1.3 -1.2 -1.1 -1.
 -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2 -0.1  0.   0.1  0.2  0.3  0.4
  0.5  0.6  0.7  0.8  0.9  1.   1.1  1.2  1.3  1.4  1.5  1.6  1.7  1.8
  1.9  2.   2.1  2.2  2.3  2.4  2.5  2.6  2.7  2.8  2.9  3.   3.1  3.2
  3.3  3.4  3.5  3.6  3.7  3.8  3.9  4.   4.1  4.2  4.3  4.4  4.5  4.6
  4.7  4.8  4.9  5.   5.1  5.2  5.3  5.4  5.5  5.6  5.7  5.8  5.9  6.
  6.1  6.2  6.3  6.4  6.5  6.6  6.7  6.8  6.9  7.   7.1  7.2  7.3  7.4
  7.5  7.6

### Predicting data for NULL values

Cloud9am and Cloud3pm are categorical columns, so Classficiation Machine Learning should be used

Evaporation and Sunshine are continous columns, so Regression Machine Learning should be used

In [58]:
data1=data_encoded.copy()

##### Predicting value for Cloud9am

In [59]:
# Cloud9am

# Splitting input and output features
x=pd.DataFrame(data1.drop(['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'], axis=1), columns=[i for i in data1.columns if (i not in ['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'])])
y=pd.DataFrame(data1['Cloud9am'], columns=['Cloud9am'])

# Splitting training and testing data
x_train=pd.DataFrame(x[(y['Cloud9am'].isnull()==False)], columns=x.columns)
y_train=pd.DataFrame(y[(y['Cloud9am'].isnull()==False)], columns=y.columns)
x_test=pd.DataFrame(x[(y['Cloud9am'].isnull())], columns=x.columns)
y_test=pd.DataFrame(y[(y['Cloud9am'].isnull())], columns=y.columns)

# Logistic Regression was causing slowness in the execution
#from sklearn.linear_model import LogisticRegression
#model=LogisticRegression(max_iter=10e5)

from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)

y_pred=pd.DataFrame(y_pred, columns=['Cloud9am'], index=y_test.index)

zz=y_train.append(y_pred)
data1['Cloud9am']=zz.sort_index()

KeyboardInterrupt: ignored

##### Predicting value for Cloud3pm

In [None]:
# Cloud3pm

# Splitting input and output features
x=pd.DataFrame(data1.drop(['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'], axis=1), columns=[i for i in data1.columns if (i not in ['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'])])
y=pd.DataFrame(data1['Cloud3pm'], columns=['Cloud3pm'])

# Splitting training and testing data
x_train=pd.DataFrame(x[(y['Cloud3pm'].isnull()==False)], columns=x.columns)
y_train=pd.DataFrame(y[(y['Cloud3pm'].isnull()==False)], columns=y.columns)
x_test=pd.DataFrame(x[(y['Cloud3pm'].isnull())], columns=x.columns)
y_test=pd.DataFrame(y[(y['Cloud3pm'].isnull())], columns=y.columns)

# Logistic Regression was causing slowness in the execution
#from sklearn.linear_model import LogisticRegression
#model=LogisticRegression(max_iter=10e5)

from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)

y_pred=pd.DataFrame(y_pred, columns=['Cloud3pm'], index=y_test.index)

zz=y_train.append(y_pred)
data1['Cloud3pm']=zz.sort_index()

##### Predicting value for Evaporation

In [None]:
# Evaporation

# Splitting input and output features
x=pd.DataFrame(data1.drop(['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'], axis=1), columns=[i for i in data1.columns if (i not in ['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'])])
y=pd.DataFrame(data1['Evaporation'], columns=['Evaporation'])

# Splitting training and testing data
x_train=pd.DataFrame(x[(y['Evaporation'].isnull()==False)], columns=x.columns)
y_train=pd.DataFrame(y[(y['Evaporation'].isnull()==False)], columns=y.columns)
x_test=pd.DataFrame(x[(y['Evaporation'].isnull())], columns=x.columns)
y_test=pd.DataFrame(y[(y['Evaporation'].isnull())], columns=y.columns)

from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)

y_pred=pd.DataFrame(y_pred, columns=['Evaporation'], index=y_test.index)

zz=y_train.append(y_pred)
data1['Evaporation']=zz.sort_index()

##### Predicting value for Sunshine

In [None]:
# Sunshine

# Splitting input and output features
x=pd.DataFrame(data1.drop(['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'], axis=1), columns=[i for i in data1.columns if (i not in ['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine'])])
y=pd.DataFrame(data1['Sunshine'], columns=['Sunshine'])

# Splitting training and testing data
x_train=pd.DataFrame(x[(y['Sunshine'].isnull()==False)], columns=x.columns)
y_train=pd.DataFrame(y[(y['Sunshine'].isnull()==False)], columns=y.columns)
x_test=pd.DataFrame(x[(y['Sunshine'].isnull())], columns=x.columns)
y_test=pd.DataFrame(y[(y['Sunshine'].isnull())], columns=y.columns)

from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)

y_pred=pd.DataFrame(y_pred, columns=['Sunshine'], index=y_test.index)

zz=y_train.append(y_pred)
data1['Sunshine']=zz.sort_index()

In [None]:
# Updating the datatype from OBJECT to INT

for i in ['Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']:
  data1[i]=data1[i].astype(int)

In [None]:
# Validation of NULL values
data1.isnull().sum()

# Validation of unique data
for i in data1.columns:
  print(i, data1[i].sort_values().unique())

### Saving clean data

In [None]:
data1.to_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/cleaned_data.csv')

### Checking Correlation

In [None]:
#import seaborn as sns
#sns.heatmap(data1, annot=True)

### Checking for Outliers

In [None]:
data1.plot(kind='box', subplots=True, figsize=(20,40), layout=(5,5))

### Feature Scaling

Scaling is not need for 
* WindGustDir
* WindDir9am
* WindDir3pm
* Cloud9am
* Cloud3pm
* RainToday
* RainTomorrow 

columns as these are categorical columns

##### Scaling features for rest of the columns

In [None]:
for i in data1.columns:
  if (i in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']):
    print(i, data1[i].sort_values().unique())

In [None]:
# Pre-validation
test=pd.DataFrame({})
for i in data1.columns:
  if (i not in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']):
      test=test.append({
        "Column":i,
        "Min":round(data1[i].min(),2),
        "Max":round(data1[i].max(),2),
        "Mean":round(data1[i].mean(),2),
        "SD":round(data1[i].std(),2)
    }, ignore_index=True)
display(test)

test.to_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/scaling_mapping.csv')

In [None]:
data2=data1.copy()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for i in data2.columns:
  if (i not in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']):
    data2[i]=scaler.fit_transform(pd.DataFrame(data2[i], columns=[i]))

In [None]:
# Post-validation
test=pd.DataFrame({})
for i in data2.columns:
  if (i not in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']):
    test=test.append({
        "Column":i,
        "Min":round(data2[i].min(),2),
        "Max":round(data2[i].max(),2),
        "Mean":round(data2[i].mean(),2),
        "SD":round(data2[i].std(),2)
    }, ignore_index=True)
display(test)

# Model Creation

In [None]:
x=data2.drop(['RainTomorrow'], axis=1)
y=pd.DataFrame(data2['RainTomorrow'], columns=['RainTomorrow'])

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7, shuffle=True)

In [None]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(max_iter=10e5)
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))


from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))


from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))


from sklearn.ensemble import AdaBoostClassifier
model=AdaBoostClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))


from xgboost import XGBClassifier
model=XGBClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(random_state=7)
model.fit(x_train, y_train)
y_pred=model.predict(x_test)

from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))


final_model=model

So, the best model is RandomForestClassifier with accuracy_score of 88.6%

In [None]:
# Trying if PCA can imporve the score

x=data2.drop(['RainTomorrow'], axis=1)
y=pd.DataFrame(data2['RainTomorrow'], columns=['RainTomorrow'])

from sklearn.decomposition import PCA
for i in range(1,22,3):
  pca=PCA(n_components=i)
  x_new=pd.DataFrame(pca.fit_transform(x), columns=[np.char.add("PCA", np.arange(i).astype(str))])

  from sklearn.model_selection import train_test_split
  x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.2, random_state=7, shuffle=True)

  from sklearn.ensemble import RandomForestClassifier
  model=RandomForestClassifier(random_state=7)
  model.fit(x_train, y_train)
  y_pred=model.predict(x_test)
  print(i, metrics.accuracy_score(y_test, y_pred))

  
# As we can see it didn't gave any better results than 88.6%, 
# reason being PCA eleminates input features which contributes to training of model
# PCA only enhances the speed of model by compromising little bit with the score.

# Saving the model

In [None]:
import pickle
pickle.dump(final_model, open('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/model_rain_prediction.pkl', 'wb'))

# Testing the final model

In [None]:
data_test=pd.read_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/weatherAUS.csv')
display(data_test.head(1))
print(list(data_test.loc[0]))

In [None]:
display(data_test[data_test['RainTomorrow']=='Yes'].head(1))
print(np.array(data_test[data_test['RainTomorrow']=='Yes'].head(1)))

In [None]:
"""input_data=['2008-12-01', 'Albury', 13.4, 22.9, 0.6, 12, 2, 'W', 44.0, 
            'W', 'WNW', 20.0, 24.0, 71.0, 22.0, 1007.7, 1007.1, 8.0, 3.0, 16.9, 
            21.8, 'No', 'No']"""
input_data=['2008-12-09', 'Albury', 9.7, 31.9, 0.0, 8.777944,	10.686714	, 'NNW', 80.0,
        'SE', 'NW', 7.0, 28.0, 42.0, 9.0, 1008.9, 1003.6, 7, 8, 18.3,
        30.2, 'No', 'Yes']
a=pd.to_datetime(input_data[0], format = '%Y-%m-%d').month
#print(a)

label_encoding=pd.read_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/label_encoding_mapping.csv').drop('Unnamed: 0', axis=1)
b=label_encoding[label_encoding['Location']==input_data[1]]['Location1'].unique()[0]
c=label_encoding[label_encoding['WindGustDir']==input_data[7]]['WindGustDir1'].unique()[0]
d=label_encoding[label_encoding['WindDir9am']==input_data[9]]['WindDir9am1'].unique()[0]
e=label_encoding[label_encoding['WindDir3pm']==input_data[10]]['WindDir3pm1'].unique()[0]
#print(b,c,d,e)

f= 1 if (input_data[21]=='Yes') else 0
#print(f)

scaling=pd.read_csv('/content/drive/MyDrive/ColabNotebooks/ML : Rain Prediction/scaling_mapping.csv').drop('Unnamed: 0', axis=1)
# scaling should not be done for this : ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']
#print(scaling)

#print("\n ******************** \n")

c1=(a-scaling[scaling['Column']=='Date']['Mean'].values[0])/scaling[scaling['Column']=='Date']['SD'].values[0]
c2=(b-scaling[scaling['Column']=='Location']['Mean'].values[0])/scaling[scaling['Column']=='Location']['SD'].values[0]
c3=(input_data[2]-scaling[scaling['Column']=='MinTemp']['Mean'].values[0])/scaling[scaling['Column']=='MinTemp']['SD'].values[0]
c4=(input_data[3]-scaling[scaling['Column']=='MaxTemp']['Mean'].values[0])/scaling[scaling['Column']=='MaxTemp']['SD'].values[0]
c5=(input_data[4]-scaling[scaling['Column']=='Rainfall']['Mean'].values[0])/scaling[scaling['Column']=='Rainfall']['SD'].values[0]
c6=(input_data[5]-scaling[scaling['Column']=='Evaporation']['Mean'].values[0])/scaling[scaling['Column']=='Evaporation']['SD'].values[0]
c7=(input_data[6]-scaling[scaling['Column']=='Sunshine']['Mean'].values[0])/scaling[scaling['Column']=='Sunshine']['SD'].values[0]
c8=c
c9=(input_data[8]-scaling[scaling['Column']=='WindGustSpeed']['Mean'].values[0])/scaling[scaling['Column']=='WindGustSpeed']['SD'].values[0]
c10=d
c11=e
c12=(input_data[11]-scaling[scaling['Column']=='WindSpeed9am']['Mean'].values[0])/scaling[scaling['Column']=='WindSpeed9am']['SD'].values[0]
c13=(input_data[12]-scaling[scaling['Column']=='WindSpeed3pm']['Mean'].values[0])/scaling[scaling['Column']=='WindSpeed3pm']['SD'].values[0]
c14=(input_data[13]-scaling[scaling['Column']=='Humidity9am']['Mean'].values[0])/scaling[scaling['Column']=='Humidity9am']['SD'].values[0]
c15=(input_data[14]-scaling[scaling['Column']=='Humidity3pm']['Mean'].values[0])/scaling[scaling['Column']=='Humidity3pm']['SD'].values[0]
c16=(input_data[15]-scaling[scaling['Column']=='Pressure9am']['Mean'].values[0])/scaling[scaling['Column']=='Pressure9am']['SD'].values[0]
c17=(input_data[16]-scaling[scaling['Column']=='Pressure3pm']['Mean'].values[0])/scaling[scaling['Column']=='Pressure3pm']['SD'].values[0]
c18=int(input_data[17])
c19=int(input_data[18])
c20=(input_data[19]-scaling[scaling['Column']=='Temp9am']['Mean'].values[0])/scaling[scaling['Column']=='Temp9am']['SD'].values[0]
c21=(input_data[20]-scaling[scaling['Column']=='Temp3pm']['Mean'].values[0])/scaling[scaling['Column']=='Temp3pm']['SD'].values[0]
c22=f
#print([c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c20, c21, c22])


output_test = np.array([c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c20, c21, c22]).reshape(1,22)
output_test2= pd.DataFrame(output_test, columns=['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday'])
print(final_model.predict(output_test2)[0])

# Challenges faced during development of this project:
1. Poor quality of data : Data was having a lot of NULL value for multiple columns.
2. Removing NULL values : Two processes were used to remove the NULL values-
  * Populating the missing values with MODE of that column
  * Predicting the missing values using MACHINE LEARNING algorithm (Random Forest Algorithm : For categorical data columns 
  and Linear Regression : For continous data columns)
  
  NOTE : Predicting missing values using ML was used for columns having more than 35% missing values
3. Remove unimportant data : The DATE column was having data in the format of YYYY-MM-DD, so we kept only MM and removed others
4. Final model selection : Tried multiple models mentioned below-
  * Logistict regression
  * K-Neighbour algorithm
  * Decision tree
  * Adaboost classifier
  * XGB classifier
  
  But the best model came out to be Random Forest Algorithm.
5. Predicting value of any new data : This was the most difficult task and took most the effort.
  * Tried to save the model in Pipeline, along with encoding and scaling but it failed.
  Reason being only few columns were supposed to be encoded and some needed to be scaled, but in a Pipeline all the columns are impacted. So, had to leave this process.
  * Tried to do encoding and scaling again in Streamlit.py but it took much effort. Also, for completing again same data had to be unloaded and encoding and scaling codes were supposed to be written down. So, skipped this process.
  * Finally, saved the mapping of encoding and scaling in a csv file and used that to encode and scale any new input data. Then already saved pickel of model was used to predict the data.

# Future Developments:
* We need to save one cleaned data (that shouldn't contain scaled data), this can be later utilized in building a tableau dashboard
  
  [Refer this video for creating dashboard](https://www.youtube.com/watch?v=uHGOnDdKpK4&list=PLZoTAELRMXVOFnfSwkB_uyr4FT-327noK&index=90&ab_channel=KrishNaik)