# Data: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv

# Import data

In [1]:
import pandas as pd 
import seaborn as sns

In [3]:
data = pd.read_csv('weatherAUS.csv')

In [4]:
data.tail()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No
145459,2017-06-25,Uluru,14.9,,0.0,,,,,ESE,...,62.0,36.0,1020.2,1017.9,8.0,8.0,15.0,20.9,No,


In [5]:
X = data.iloc[:,:-1]

In [6]:
X.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No


In [7]:
y = data.iloc[:,-1]

In [8]:
y.head()

0    No
1    No
2    No
3    No
4    No
Name: RainTomorrow, dtype: object

# Handling missing data - Numeric type

In [9]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
#X.dtypes

In [11]:
numerical_cols = list(np.where((X.dtypes == np.int64) | (X.dtypes == np.float64))[0])

In [12]:
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [13]:
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [14]:
string_cols = list(np.where((X.dtypes == np.object))[0])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  string_cols = list(np.where((X.dtypes == np.object))[0])


In [15]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [16]:
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [17]:
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# One Hot encoder method

In [18]:
def OneHotEncoderMethod(indices, data):
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(),indices )], remainder='passthrough')
    return columnTransformer.fit_transform(data)

# Label encoding method

In [19]:
def LabelEncoderMethod(series):
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    return le.fit_transform(series) 

# Label encoding target feature

In [20]:
y = LabelEncoderMethod(y)

# Encoding selection for X

In [21]:
def EncodingSelection(X, threshold=10):
    # Step 01 : Select the string col
    string_cols = list(np.where((X.dtypes == np.object))[0])
    one_hot_encoding_indices = []
    
    # Step 02: The number of categoty is 2 and more than threshold, label encode
    for col in string_cols:
        lenght = len(pd.unique(X[X.columns[col]]))
        if lenght == 2 or lenght > threshold:
            X[X.columns[col]] = LabelEncoderMethod(X[X.columns[col]])
        else:
            one_hot_encoding_indices.append(col)
            
    # Step 03: One hot encode otherwise 
    X = OneHotEncoderMethod(one_hot_encoding_indices, X)
    return X

In [22]:
X = EncodingSelection(X)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  string_cols = list(np.where((X.dtypes == np.object))[0])


In [23]:
X.shape

(145460, 22)

# Feature selection

In [24]:
from sklearn.feature_selection import SelectKBest, chi2

In [25]:
kbest = SelectKBest(score_func=chi2, k=10)

In [26]:
from sklearn import preprocessing
MMS = preprocessing.MinMaxScaler()

In [27]:
x_temp = MMS.fit_transform(X)

In [28]:
x_temp = kbest.fit(x_temp,y)

In [29]:
best_features = np.argsort(x_temp.scores_)[-13:]

In [30]:
features_to_delete = best_features = np.argsort(x_temp.scores_)[:-13]

In [31]:
X = np.delete(X, features_to_delete, axis=1)

In [32]:
X.shape

(145460, 13)

In [33]:
del x_temp

# Train test split

In [34]:
import numpy as np
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [36]:
print(X_train.shape)

(116368, 13)


# Feature scaling

### Standardization: (X - mean(X)) / std(X)

### Normalization = (x - min(x)) / (max(x) - min(x))

In [37]:
from sklearn import preprocessing

In [38]:
sc = preprocessing.StandardScaler(with_mean=False)

In [39]:
sc.fit(X_train)

StandardScaler(with_mean=False)

In [40]:
X_train = sc.transform(X_train)

In [41]:
print(X_train.shape)

(116368, 13)


In [42]:
X_test = sc.transform(X_test)

In [43]:
print(X_test.shape)

(29092, 13)


#### The data is ready!!

# Building KNN model

In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
knnClassifier = KNeighborsClassifier(n_neighbors=3)

In [46]:
knnClassifier.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=3)

In [47]:
y_pred = knnClassifier.predict(X_test)

In [48]:
from sklearn.metrics import accuracy_score

In [49]:
accuracy_score(y_pred,y_test)*100

81.0463357624089

In [50]:
len(y_pred)

29092

In [51]:
X_test.shape

(29092, 13)

In [52]:
X_test[0]

array([  4.30960243,   1.92127256,   2.7850948 ,   0.85185523,
         2.36561739,   3.87649437,   2.20117574, 151.27704546,
       152.32027943,   1.9640909 ,   2.15541801,   4.30029032,
         2.41793039])

In [53]:
knnClassifier.predict([X_test[0]])

array([0])

In [54]:
knnClassifier.predict_proba([X_test[0]])

array([[1., 0., 0.]])

In [137]:
# pip install mlxtend