# Importing Data
https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('weatherAUS.csv')

In [3]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,01-12-2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,02-12-2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,03-12-2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,04-12-2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,05-12-2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


# Separating Features and Target Variable

In [4]:
X = data.iloc[:,:-1] 

In [5]:
X
print(X.dtypes)

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
dtype: object


In [6]:
Y = data.iloc[:,-1]

In [7]:
Y

0          No
1          No
2          No
3          No
4          No
         ... 
145455     No
145456     No
145457     No
145458     No
145459    Yes
Name: RainTomorrow, Length: 145460, dtype: object

# Handling Missing data

In [8]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values = np.nan , strategy = 'mean')

In [9]:
# numerical_cols = [2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20]   # creating a list of columns whose value is numeric
# numerical_cols

In [10]:
# General ways to split columns whose value is numeric 

numerical_cols = list(np.where((X.dtypes == np.int64) | (X.dtypes == np.float64))[0])
#                   list(np.where((X.dtypes == np.int64) | (X.dtypes == np.float64)))

numerical_cols

[2, 3, 4, 5, 6, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [11]:
imp_mean.fit(X.iloc[:,numerical_cols]) # now, fitting mean value in place of nan values based on the column which have numeric values

SimpleImputer()

In [12]:
X.iloc[:,numerical_cols] = imp_mean.transform(X.iloc[:,numerical_cols]) # updating/transforming the actual data i.e, X

In [13]:
print(X)

              Date Location  MinTemp    MaxTemp  Rainfall  Evaporation  \
0       01-12-2008   Albury     13.4  22.900000       0.6     5.468232   
1       02-12-2008   Albury      7.4  25.100000       0.0     5.468232   
2       03-12-2008   Albury     12.9  25.700000       0.0     5.468232   
3       04-12-2008   Albury      9.2  28.000000       0.0     5.468232   
4       05-12-2008   Albury     17.5  32.300000       1.0     5.468232   
...            ...      ...      ...        ...       ...          ...   
145455  21-06-2017    Uluru      2.8  23.400000       0.0     5.468232   
145456  22-06-2017    Uluru      3.6  25.300000       0.0     5.468232   
145457  23-06-2017    Uluru      5.4  26.900000       0.0     5.468232   
145458  24-06-2017    Uluru      7.8  27.000000       0.0     5.468232   
145459  25-06-2017    Uluru     14.9  23.221348       0.0     5.468232   

        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... WindSpeed3pm  \
0       7.611178           W       

# Handling Missing String data

In [14]:
X.drop('Date',axis =1, inplace =True )
print(X)

       Location  MinTemp    MaxTemp  Rainfall  Evaporation  Sunshine  \
0        Albury     13.4  22.900000       0.6     5.468232  7.611178   
1        Albury      7.4  25.100000       0.0     5.468232  7.611178   
2        Albury     12.9  25.700000       0.0     5.468232  7.611178   
3        Albury      9.2  28.000000       0.0     5.468232  7.611178   
4        Albury     17.5  32.300000       1.0     5.468232  7.611178   
...         ...      ...        ...       ...          ...       ...   
145455    Uluru      2.8  23.400000       0.0     5.468232  7.611178   
145456    Uluru      3.6  25.300000       0.0     5.468232  7.611178   
145457    Uluru      5.4  26.900000       0.0     5.468232  7.611178   
145458    Uluru      7.8  27.000000       0.0     5.468232  7.611178   
145459    Uluru     14.9  23.221348       0.0     5.468232  7.611178   

       WindGustDir  WindGustSpeed WindDir9am WindDir3pm  ...  WindSpeed3pm  \
0                W       44.00000          W        WNW  

In [15]:
imp_mean = SimpleImputer(missing_values = np.nan , strategy = 'most_frequent')

In [16]:
# string_cols = [1,7,9,10,21] # manually creating list of columns which has String Data

In [17]:
# General way of creating list of columns which has String Data
string_cols = list((np.where(X.dtypes ==object))[0])
string_cols  

[0, 6, 8, 9, 20]

In [18]:
imp_mean.fit(X.iloc[:,string_cols])

SimpleImputer(strategy='most_frequent')

In [19]:
X.iloc[:,string_cols] = imp_mean.transform(X.iloc[:,string_cols])

In [20]:
# imp_mean = SimpleImputer(missing_values = np.nan , strategy = 'most_frequent')
# imp_mean.fit(Y)
# Y = imp_mean.transform(Y)

In [21]:
# print(X)
pd.isnull(X).sum()


Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
dtype: int64

In [22]:
# X.drop('Date',axis =1, inplace =True )
# print(X)   
#Already i did.

# Label encoding : (commonly used for Target Variable/categorical value)

In [23]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [24]:
print(Y)

0          No
1          No
2          No
3          No
4          No
         ... 
145455     No
145456     No
145457     No
145458     No
145459    Yes
Name: RainTomorrow, Length: 145460, dtype: object


In [25]:
le.fit(Y)

LabelEncoder()

In [26]:
Y = le.transform(Y)

In [27]:
print(len(Y))

145460


In [28]:
# print(data['RainTomorrow'].unique())
# print(data['RainTomorrow'].nunique())
# print(data['RainTomorrow'].value_counts())
# print(data['RainTomorrow'].isnull())
# print(data['RainTomorrow'].notnull())



# One Hot Encoding

In [29]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [30]:
ColumnTransformer = ColumnTransformer([('encoder', OneHotEncoder(),[0,6,8,9,20])],remainder = 'passthrough')

In [31]:
X = ColumnTransformer.fit_transform(X)

In [32]:
print(X.shape)  # priviously column is 23 but after OneHotEncoding it gets increases into 115

(145460, 115)


# OverSampling

In [33]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler 

In [34]:
from collections import Counter

In [35]:
Counter(Y)

Counter({0: 110315, 1: 35145})

In [36]:
rus = RandomOverSampler(random_state = 42)
X,Y = rus.fit_resample(X, Y)

# Train test split

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size =0.2, random_state =1)

In [39]:
print(X_train.shape)

(176504, 115)


# Feature Scaling

In [40]:
from sklearn import preprocessing

In [41]:
sc = preprocessing.StandardScaler(with_mean=False)

In [42]:
sc.fit(X_train)

StandardScaler(with_mean=False)

In [43]:
X_train = sc.transform(X_train)

In [44]:
print(X_train.shape)

(176504, 115)


In [45]:
X_test = sc.transform(X_test)

In [46]:
print(X_test.shape)

(44126, 115)
