### Import the data and get a look at the datatypes

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# Source CSV is fetched over HTTP from GitHub, so this cell needs network access.
WEATHER_CSV_URL = 'https://raw.githubusercontent.com/Erivera96/MachineLearningExercises/master/weatherAUS.csv'
rainfall = pd.read_csv(WEATHER_CSV_URL)

In [0]:
# Display the describe() summary of the freshly loaded frame as this cell's output.
rainfall.describe()

In [0]:
# DataFrame.shape is (number of rows, number of columns); unpack it so the
# message labels each count correctly.
n_rows, n_cols = rainfall.shape
print("The rainfall dataset has ", n_rows, " rows and ", n_cols, " columns.")

# Keep the per-column dtypes around; they are printed here for inspection.
rainfall_dtypes = rainfall.dtypes
print("And has the following datatypes:\n", rainfall_dtypes)

### After looking at the datatypes, we decide what we don't want to keep: Date, which doesn't contribute much information other than distinguishing days, and RISK_MM, which the Kaggle dataset page advises dropping.

In [0]:
# Discard the two columns we decided not to keep, then report the new shape.
rainfall_data = rainfall.drop(columns=['Date', 'RISK_MM'])
print(rainfall_data.shape)

# 'RainTomorrow' is the target; every remaining column is a feature.
labels = rainfall_data['RainTomorrow']
features = rainfall_data.drop(columns='RainTomorrow')

###After removing things, we split the features from the labels

In [0]:
# Show the shape of the feature table first, then the label vector.
for part in (features, labels):
    print(part.shape)

### We now want to separate the numerical data from the categorical data so we can work on each separately.

In [0]:
features_dtypes = features.dtypes
feat_names = features.columns  # column labels only

# Columns whose dtype is 'object' are treated as categorical; everything else
# is treated as numerical. Both masks are boolean vectors over the columns.
bool_cat = features_dtypes == 'object'
bool_num = ~bool_cat

numerical_names = feat_names[bool_num]
categorical_names = feat_names[bool_cat]

num_feat = features[numerical_names]
cat_feat = features[categorical_names]

print("There are ", features_dtypes.size, " features, where ", bool_num.sum(), " are numerical and ", bool_cat.sum(), " are categorical")

####How many of those are nan for each category?

In [0]:
# NOTE: despite its name this holds features.shape[0], i.e. the number of
# records (rows); it is the denominator for the percentages below.
number_of_features = features.shape[0]

# Same report for both groups: one line per column with its percentage of NaNs.
for heading, frame in (("Percent N/A for Categorical:", cat_feat),
                       ("\nPercent N/A for Numerical:", num_feat)):
    print(heading)
    for feat in frame:
        missing_count = np.sum(frame[feat].isna())
        print(feat, '\t\t', missing_count/number_of_features*100)

In [0]:
# Numerical features: Sunshine and the other columns listed here have so many
# empty records that filling them with a mean or median would not be accurate,
# so these columns are removed entirely instead of being imputed.
mostly_empty_columns = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
num_feat = num_feat.drop(columns=mostly_empty_columns)


In [0]:
# Categorical features: so few values are NaN that we simply drop every record
# that has a NaN in ANY categorical column.

# True for each record with at least one missing categorical value. Kept as a
# plain NumPy boolean array so it can mask frames positionally.
has_missing_cat = cat_feat.isna().any(axis=1).to_numpy()

# Report how much data the filter removes. Previously this percentage was a
# bare expression in the middle of the cell, so it was computed but never shown.
print("Percent of records with a missing categorical value:",
      np.sum(has_missing_cat)/features.shape[0]*100)

# Invert: master_bool is True for the records we KEEP.
master_bool = np.logical_not(has_missing_cat)

In [0]:
# Apply the same keep-mask to the labels and both feature tables so that their
# rows stay aligned with each other.
labels = labels.loc[master_bool]
num_feat = num_feat.loc[master_bool]
cat_feat = cat_feat.loc[master_bool]

In [0]:
# Double check that we removed them: per-column count of remaining NaNs.
# Uses DataFrame.sum() directly; np.sum(DataFrame) passes axis=None, which
# recent pandas versions deprecate in favour of a scalar reduction.
cat_feat.isna().sum()

###Now, we take care of NANS in numerical but first we want to split our training and testing data

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

#### Remove NANS in the numerical data using an imputer

In [0]:
seed = 37  # shared random_state so every split in the notebook is reproducible

# 80/20 split of the numerical features.
train_num, test_num = train_test_split(num_feat, test_size=0.2, random_state=seed)

# Fit the mean imputer on the training rows only, then use that fitted imputer
# to fill NaNs in both the training and the test rows.
simputer = SimpleImputer(strategy='mean').fit(train_num)

train_num_X, test_num_X = simputer.transform(train_num), simputer.transform(test_num)

#### OneHot encode all the categorical features

In [0]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases removed the old name, so try the new keyword first and
# fall back to the old one. Either way we get a dense array.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)

# NOTE(review): the encoder is fitted on all rows before the split, so the
# category vocabulary also comes from the test rows. Confirm this is intended.
cat_feat_X = onehot.fit_transform(cat_feat)

# Same seed and test_size as the other splits so rows stay aligned.
train_cat_X, test_cat_X = train_test_split(cat_feat_X, random_state=seed, test_size=0.2)

### Now combine the features: Train vs Test

In [0]:
# Place the one-hot categorical columns first and the imputed numerical columns
# after them, using the same column order for train and test.
train_X = np.concatenate([train_cat_X, train_num_X], axis=1)
test_X = np.concatenate([test_cat_X, test_num_X], axis=1)

### Now we split the labels

In [0]:
# Encode the target numerically: 1 where the label is 'Yes', 0 otherwise.
y = labels.eq('Yes').astype(int)

# Same seed and test_size as the feature splits.
train_y, test_y = train_test_split(y, test_size=0.2, random_state=seed)

### Let's make sure our matrices' shapes match

In [0]:
# Show the shapes for a visual check.
print(train_X.shape)
print(test_X.shape)
print(train_y.shape)
print(test_y.shape)

# Actually enforce the check instead of relying on eyeballing the printout:
# features and labels must have the same number of rows in each split, and
# train and test must have the same number of feature columns.
assert train_X.shape[0] == train_y.shape[0], "train features/labels row count mismatch"
assert test_X.shape[0] == test_y.shape[0], "test features/labels row count mismatch"
assert train_X.shape[1] == test_X.shape[1], "train/test feature column count mismatch"

###OK now we're cooking. We want to scale our data before we get to the meat of it

In [0]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training matrix only, then apply that fitted scaler to
# both the training and the test matrix.
mms = MinMaxScaler().fit(train_X)

train_X, test_X = mms.transform(train_X), mms.transform(test_X)

### Now we do a logistic regression to find out if it will rain tomorrow

In [0]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (saga solver, fixed seed) on the training split.
logreg = LogisticRegression(random_state=seed, solver='saga')
logreg.fit(train_X, train_y)

# Score on the held-out test split; as the last expression it is the cell output.
logreg.score(test_X, test_y)