In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the weather observations from weatherAUS.csv (path is relative to the
# working directory) and preview the first rows.
df = pd.read_csv('weatherAUS.csv')
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Summary of the raw frame (column dtypes and non-null counts).
df.info()

In [None]:
# Encode the Yes/No rain flags as 1/0 (missing values stay NaN).
# The result is assigned back to the column: calling
# `df[col].replace(..., inplace=True)` only mutates a temporary Series, which
# raises a FutureWarning in pandas 2.2+ and leaves `df` unchanged under
# Copy-on-Write.
# `pd.to_numeric` keeps the columns numeric even when `replace` returns an
# object-dtype Series; it raises if a label other than "Yes"/"No" is present.
yes_no_to_int = {"No": 0, "Yes": 1}
for flag_col in ("RainToday", "RainTomorrow"):
    df[flag_col] = pd.to_numeric(df[flag_col].replace(yes_no_to_int))

In [None]:
## Check whether the target classes are balanced

fig, ax = plt.subplots(figsize=(8, 5))
df["RainTomorrow"].value_counts().plot.bar(ax=ax)
plt.show()

In [None]:
# Handle the class imbalance by resampling the RainTomorrow == 1 rows

from sklearn.utils import resample

no = df.loc[df["RainTomorrow"] == 0]
yes = df.loc[df["RainTomorrow"] == 1]
# Draw rows from `yes` with replacement until there are as many as in `no`;
# the fixed seed keeps the draw reproducible.
yes_oversampled = resample(yes, replace=True, n_samples=no.shape[0], random_state=123)
oversampled = pd.concat([no, yes_oversampled], axis=0)

In [None]:
# Class counts of the target after resampling
fig, ax = plt.subplots(figsize=(8, 5))
oversampled["RainTomorrow"].value_counts().plot.bar(ax=ax)
plt.show()

In [None]:
# Missing-value pattern: heatmap of the boolean null mask of `oversampled`

sns.heatmap(oversampled.isna(), cmap="PuBu", cbar=False)

In [None]:
# Missing values per column: absolute count and fraction of rows

null_mask = oversampled.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
# Note: 'Percent' holds a fraction between 0 and 1, not a value out of 100.
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing.head(4)

In [None]:
# List the columns that have object dtype

oversampled.select_dtypes(include="object").columns

In [None]:
# Impute categorical values with the mode (most frequent value) of each column

for cat_col in ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
    most_frequent = oversampled[cat_col].mode()[0]
    oversampled[cat_col] = oversampled[cat_col].fillna(most_frequent)

In [None]:
# Convert categorical (object-dtype) columns to integer codes

from sklearn.preprocessing import LabelEncoder

lencoders = {}
object_columns = oversampled.select_dtypes(include=["O"]).columns
for col in object_columns:
    encoder = LabelEncoder()
    lencoders[col] = encoder  # keep the encoder of every column for later lookup
    oversampled[col] = encoder.fit_transform(oversampled[col])

In [None]:
# Install a global filter that ignores warnings of every category.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Impute the remaining missing values with scikit-learn's IterativeImputer
from sklearn.experimental import enable_iterative_imputer  # must precede the IterativeImputer import
from sklearn.impute import IterativeImputer

mice_imputer = IterativeImputer()
# Work on a copy so `oversampled` keeps its missing values.
MiceImputed = oversampled.copy()
MiceImputed.iloc[:, :] = mice_imputer.fit_transform(oversampled)

In [None]:
# Outlier treatment: per-column quartiles and interquartile range

Q1, Q3 = MiceImputed.quantile(0.25), MiceImputed.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Drop every row that has at least one value outside the fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of its column.
# The lower fence is measured from Q1 (not Q3), and `.any(axis=1)` wraps the
# combined comparison so that the mask holds exactly one boolean per row.
# NOTE: this cell overwrites `MiceImputed`; re-running it filters the already
# filtered frame again using the previously computed Q1/Q3/IQR.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_mask = ((MiceImputed < lower_fence) | (MiceImputed > upper_fence)).any(axis=1)
MiceImputed = MiceImputed[~outlier_mask]
MiceImputed.shape

In [None]:
# Correlation heatmap (lower triangle only)

corr = MiceImputed.corr()
# True cells are hidden: mask the upper triangle together with the diagonal.
mask = np.triu(np.ones(corr.shape, dtype=bool))
cmap = sns.diverging_palette(250, 25, as_cmap=True)
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    annot=True,
    linewidths=5,
    cbar_kws={'shrink': .9},
    ax=ax,
)

In [None]:
# Pairwise scatter plots of the selected columns, coloured by RainTomorrow
pair_vars = ['MaxTemp', 'MinTemp', 'Pressure9am', 'Pressure3pm',
             'Temp9am', 'Temp3pm', 'Evaporation']
sns.pairplot(MiceImputed, vars=pair_vars, hue="RainTomorrow")

In [None]:
# Rescale every column with MinMaxScaler

from sklearn.preprocessing import StandardScaler,MinMaxScaler

r_scaler = MinMaxScaler().fit(MiceImputed)
scaled_values = r_scaler.transform(MiceImputed)
# Re-wrap the scaled array so the row labels and column names are kept.
modified_data = pd.DataFrame(scaled_values,
                             index=MiceImputed.index,
                             columns=MiceImputed.columns)

In [None]:
# Feature selection: score the scaled features against the target with chi2
# and keep the 10 best

from sklearn.feature_selection import SelectKBest,chi2

y = modified_data['RainTomorrow']
x = modified_data.loc[:, modified_data.columns != 'RainTomorrow']
selector = SelectKBest(chi2, k=10).fit(x, y)
x_new = selector.transform(x)
# Names of the columns that were kept.
print(x.columns[selector.get_support(indices=True)])

In [None]:
# Prepare the inputs for model training: split the frame into the target
# column and the remaining predictor columns

target = MiceImputed['RainTomorrow']
features = MiceImputed.drop(columns=['RainTomorrow'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split reproducible.
x_train,x_test,y_train,y_test = train_test_split(
    features,target,test_size=0.25,random_state=12345)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Plotting the ROC curve

def plot_roc_curve(fpr,tpr):
    """Plot a ROC curve together with a diagonal reference line.

    Parameters
    ----------
    fpr : array-like
        False-positive rates, used as x coordinates.
    tpr : array-like
        True-positive rates, used as y coordinates.

    Returns
    -------
    None
        The curve is drawn through the pyplot interface and shown immediately.
    """
    plt.plot(fpr, tpr, color='orange', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # Label the axes and title the figure so it can be read on its own.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [None]:
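## Models to train and compare (not implemented in this section):
## Linear regression (NOTE: the target is binary, so logistic regression is presumably intended - confirm)
## Decision tree
## Neural network
## Random forest
## LightGBM
## XGBoost
## CatBoost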