# Data: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv

# Import data

In [1]:
import pandas as pd 
import seaborn as sns

In [2]:
data = pd.read_csv('weatherAUS.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'weatherAUS.csv'

In [86]:
data.tail()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
142188,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,...,27.0,1024.7,1021.2,,,9.4,20.9,No,0.0,No
142189,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,24.0,1024.6,1020.3,,,10.1,22.4,No,0.0,No
142190,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,21.0,1023.5,1019.1,,,10.9,24.5,No,0.0,No
142191,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,24.0,1021.0,1016.8,,,12.5,26.1,No,0.0,No
142192,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,0.0,No


In [87]:
X = data.iloc[:,:-1]

In [88]:
X.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2


In [89]:
y = data.iloc[:,-1]

In [90]:
y.head()

0    No
1    No
2    No
3    No
4    No
Name: RainTomorrow, dtype: object

# Handling missing data - Numeric type

In [91]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [92]:
#X.dtypes

In [93]:
numerical_cols = list(np.where((X.dtypes == np.int64) | (X.dtypes == np.float64))[0])

In [94]:
imp_mean.fit(X.iloc[:,numerical_cols])

SimpleImputer()

In [95]:
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols])

### Handling missing string data

In [96]:
string_cols = list(np.where((X.dtypes == np.object))[0])

In [97]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [98]:
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [99]:
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

# One Hot encoder method

In [100]:
def OneHotEncoderMethod(indices, data):
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(),indices )], remainder='passthrough')
    return columnTransformer.fit_transform(data)

# Label encoding method

In [101]:
def LabelEncoderMethod(series):
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    return le.fit_transform(series) 

# Label encoding target feature

In [102]:
y = LabelEncoderMethod(y)

# Encoding selection for X

In [103]:
def EncodingSelection(X, threshold=10):
    # Step 01 : Select the string col
    string_cols = list(np.where((X.dtypes == np.object))[0])
    one_hot_encoding_indices = []
    
    # Step 02: The number of categoty is 2 and more than threshold, label encode
    for col in string_cols:
        lenght = len(pd.unique(X[X.columns[col]]))
        if lenght == 2 or lenght > threshold:
            X[X.columns[col]] = LabelEncoderMethod(X[X.columns[col]])
        else:
            one_hot_encoding_indices.append(col)
            
    # Step 03: One hot encode otherwise 
    X = OneHotEncoderMethod(one_hot_encoding_indices, X)
    return X

In [104]:
X = EncodingSelection(X)

In [105]:
X.shape

(142193, 23)

# Feature selection

In [106]:
from sklearn.feature_selection import SelectKBest, chi2

In [107]:
kbest = SelectKBest(score_func=chi2, k=10)

In [108]:
from sklearn import preprocessing
MMS = preprocessing.MinMaxScaler()

In [109]:
x_temp = MMS.fit_transform(X)

In [110]:
x_temp = kbest.fit(x_temp,y)

In [111]:
best_features = np.argsort(x_temp.scores_)[-13:]

In [112]:
features_to_delete = best_features = np.argsort(x_temp.scores_)[:-13]

In [113]:
X = np.delete(X, features_to_delete, axis=1)

In [114]:
X.shape

(142193, 13)

In [115]:
del x_temp

# Train test split

In [116]:
import numpy as np
from sklearn.model_selection import train_test_split

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [118]:
print(X_train.shape)

(113754, 13)


# Feature scaling

### Standardization: (X - mean(X)) / std(X)

### Normalization = (x - min(x)) / (max(x) - min(x))

In [119]:
from sklearn import preprocessing

In [120]:
sc = preprocessing.StandardScaler(with_mean=False)

In [121]:
sc.fit(X_train)

StandardScaler(with_mean=False)

In [122]:
X_train = sc.transform(X_train)

In [123]:
print(X_train.shape)

(113754, 13)


In [124]:
X_test = sc.transform(X_test)

In [125]:
print(X_test.shape)

(28439, 13)


#### The data is ready!!

# Building KNN model

In [126]:
from sklearn.neighbors import KNeighborsClassifier

In [127]:
knnClassifier = KNeighborsClassifier(n_neighbors=3)

In [128]:
knnClassifier.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=3)

In [129]:
y_pred = knnClassifier.predict(X_test)

In [130]:
from sklearn.metrics import accuracy_score

In [131]:
accuracy_score(y_pred,y_test)*100

90.2774359154682

In [132]:
len(y_pred)

28439

In [133]:
X_test.shape

(28439, 13)

In [134]:
X_test[0]

array([3.78623150e+00, 2.56512193e+00, 3.25743087e+00, 3.12388536e+00,
       5.02076192e+00, 2.63139553e+00, 1.51473353e+02, 1.52574941e+02,
       3.51349210e+00, 2.37443526e+00, 3.78880591e+00, 2.41122226e+00,
       2.35788487e-02])

In [135]:
knnClassifier.predict([X_test[0]])

array([1])

In [136]:
knnClassifier.predict_proba([X_test[0]])

array([[0.33333333, 0.66666667]])

In [137]:
# pip install mlxtend