In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw data from a CSV file located next to the notebook.
weather_df = pd.read_csv('./weatherAUS.csv')
# Bare last expression: the notebook renders the loaded frame.
weather_df

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
weather_df.info()

In [None]:
# Number of missing values per column, largest first.
weather_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Separate the rows whose target (`RainTomorrow`) is missing from the rest.
missing_target_mask = weather_df['RainTomorrow'].isnull()
df_not_RainTomorrow_data = weather_df.loc[missing_target_mask]
# Despite its name, `df_no_RainTomorrow` holds the rows that DO have a target:
# it is `weather_df` minus the rows selected above.
df_no_RainTomorrow = weather_df.loc[~missing_target_mask]
# Remaining missing values per column once those rows are removed.
df_no_RainTomorrow.isnull().sum()

In [None]:
# Second name for the same object (an alias, not a copy).
weather_df_RainTomorrow = df_no_RainTomorrow

**Rows with a missing target value (`RainTomorrow`) are not useful for training, so we drop them.**

In [None]:
# Drop every row where either `RainToday` or `RainTomorrow` is null.
required_cols = ['RainToday', 'RainTomorrow']
weather_df = weather_df.dropna(subset=required_cols)
weather_df

## **exploring datasets**

In [None]:
import plotly.express as px;
import matplotlib;
import matplotlib.pyplot as plt;
import seaborn as sns;

# Global plot styling applied to the seaborn/matplotlib figures below.
sns.set_style("darkgrid")
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # RGBA hex with zero alpha
})

In [None]:
# How many distinct locations there are, then the row count for each one.
n_locations = weather_df['Location'].nunique()
print(n_locations)
weather_df.value_counts('Location')

In [None]:
# Row count per location, coloured by the `RainToday` value.
px.histogram(
    weather_df,
    x='Location',
    color='RainToday',
    title='Location vs Rainy days',
)

In [None]:
# Distribution of `Temp3pm`, coloured by the `RainToday` value.
px.histogram(
    weather_df,
    x='Temp3pm',
    color='RainToday',
    title='Temp3pm vs Rainy days',
)

In [None]:
# Distribution of `Temp3pm`, coloured by the target `RainTomorrow`.
# The title names the target so this figure is distinguishable from the
# otherwise identical histogram coloured by `RainToday`.
px.histogram(
    weather_df,
    x='Temp3pm',
    color='RainTomorrow',
    title='Temp3pm vs Rain Tomorrow',
)

In [None]:
# Count of each `RainTomorrow` value, coloured by the `RainToday` value.
# `RainTomorrow` does not have an equal number of Yes and No rows; a dataset
# like this is known as an imbalanced dataset.
px.histogram(
    weather_df,
    x='RainTomorrow',
    color='RainToday',
    title='RainTomorrow vs Rainy days',
)


In [None]:
# Scatter of MinTemp against MaxTemp on a random subset of 2000 rows.
# The fixed `random_state` makes the sampled rows, and therefore the figure,
# identical on every run; without it each execution draws different points.
px.scatter(
    weather_df.sample(2000, random_state=42),
    x='MinTemp',
    y='MaxTemp',
    color='RainTomorrow',
    color_discrete_sequence=['blue', 'red'],
    title='MinTemp vs MaxTemp',
)

In [None]:
from sklearn.model_selection import train_test_split
# Random 60/20/20 split.
# Step 1: hold out 20% of the rows as the test set.
train_val_df, test_df = train_test_split(weather_df, test_size=0.2, random_state=42)
# Step 2: take 25% of the remaining 80% (20% of the original) for validation,
# which leaves 60% of the original rows for training.
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)

In [None]:
# NOTE(review): `new_input` is not assigned anywhere above this cell in the
# notebook as shown, so on a fresh kernel (Restart & Run All) this line would
# raise NameError — confirm where it is meant to be defined.
# Wraps the existing `new_input` object in a one-element list and builds a
# DataFrame from it, rebinding the same name.
new_input = pd.DataFrame([new_input])
new_input

In [None]:
# Append `new_input` to `weather_df`, renumbering the index from 0.
# NOTE(review): this cell is not idempotent — every re-run appends the row(s)
# again. The year-based split further down is taken from `weather_df`, so the
# appended row(s) flow into the downstream frames — confirm this is intended.
weather_df = pd.concat([weather_df, new_input], ignore_index=True)
weather_df

In [None]:
# Number of rows per calendar year, most frequent first.
# Count directly on the derived year Series: `DataFrame.value_counts` documents
# its `subset` argument as column labels, not a Series of values.
pd.to_datetime(weather_df['Date']).dt.year.value_counts()

In [None]:
# Bar chart of how many rows fall in each calendar year.
obs_year = pd.to_datetime(weather_df['Date']).dt.year
plt.title('Distribution of dates')
sns.countplot(x=obs_year)

In [None]:
# Chronological split on the year parsed from `Date`:
#   before 2015 -> training, 2015 -> validation, after 2015 -> test.
# These assignments replace the randomly split frames created earlier.
year = pd.to_datetime(weather_df['Date']).dt.year
train_df = weather_df.loc[year < 2015]
val_df = weather_df.loc[year == 2015]
test_df = weather_df.loc[year > 2015]
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)


In [None]:
# Render the training split for inspection.
train_df

In [None]:
# Render the validation split for inspection.
val_df

In [None]:
# Render the test split for inspection.
test_df

In [None]:
# Feature columns: every column except the first and the last one
# (presumably `Date` and the target — verify against the CSV header).
input_cols = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'
# Only the last expression of a cell is rendered, so the feature list is
# printed explicitly; a bare `input_cols` in mid-cell shows nothing.
print(input_cols)
target_col

In [None]:
# Independent copies of the training features and target, so later in-place
# preprocessing does not touch `train_df`.
train_inputs, train_target = train_df[input_cols].copy(), train_df[target_col].copy()


In [None]:
# Independent copies of the validation features and target.
val_inputs, val_target = val_df[input_cols].copy(), val_df[target_col].copy()

In [None]:
# Independent copies of the test features and target.
test_inputs, test_target = test_df[input_cols].copy(), test_df[target_col].copy()

In [None]:
# dtype of every column in the training split.
train_df.dtypes

In [None]:
# Partition the feature columns by dtype: numeric versus everything else.
numeric_cols = list(train_inputs.select_dtypes(include='number').columns)
print(numeric_cols)
categorical_cols = list(train_inputs.select_dtypes(exclude='number').columns)
categorical_cols

In [None]:
# Summary statistics of the training features.
train_inputs.describe()

In [None]:
# Number of distinct values in each categorical training column.
train_inputs[categorical_cols].nunique()

**Machine learning models can't work with missing numerical data. The process of filling missing values is called imputation.**

In [None]:
# replace missing values with mean values
from sklearn.impute import SimpleImputer
# Imputer configured with the 'mean' strategy; nothing is learned until `fit` is called.
imputer = SimpleImputer(strategy='mean')
# Missing-value count per column of the full frame.
weather_df.isnull().sum()

In [None]:
# Missing-value count per column of the training features.
train_inputs.isnull().sum()

In [None]:
# Summary statistics of the full frame.
weather_df.describe()

In [None]:
# Learn the per-column means from the training split only. Fitting on the
# whole of `weather_df` would let validation/test rows influence the imputed
# values (data leakage). `train_df` is used rather than `train_inputs` because
# it is never modified in place, so re-running this cell gives the same means.
imputer.fit(train_df[numeric_cols])
# The learned fill value for each numeric column, in `numeric_cols` order.
list(imputer.statistics_)

In [None]:
# Fill the missing numeric values of every split with the fitted imputer.
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

# Only the last expression of a cell is rendered, so the checks for the first
# two splits are printed explicitly instead of being silently discarded.
print(train_inputs.isnull().sum())
print(val_inputs.isnull().sum())
# The missing values of each numeric column are now filled in with its mean.
test_inputs.isnull().sum()

## **scaling numeric features**

In [None]:
# Summary statistics of the numeric columns in the full frame.
weather_df[numeric_cols].describe()

**Let's use `MinMaxScaler` from `sklearn.preprocessing` to scale values to the $(0,1)$ range.**

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn each numeric column's min and max from the training split only, so the
# validation/test rows do not leak into the scaling parameters. `train_df` is
# never modified in place, which keeps this cell repeatable; MinMaxScaler
# disregards NaN values while fitting.
scaler = MinMaxScaler()
scaler.fit(train_df[numeric_cols])


In [None]:
# Per-column minimum, then maximum, learned by the scaler.
for learned_bounds in (scaler.data_min_, scaler.data_max_):
    print(list(learned_bounds))


In [None]:
# Rescale the numeric columns of every split with the fitted scaler.
# The frames are overwritten in place, so running this cell a second time
# would scale already-scaled values.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])


### **encoding categorical data**

In [None]:
# Number of distinct values in each categorical column of the full frame.
weather_df[categorical_cols].nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder for the categorical columns.
# `handle_unknown='ignore'` makes a category that was not present at fit time
# encode as all zeros instead of raising, which matters because the fitted
# encoder is saved and reused on new inputs. Categories seen during `fit`
# are encoded exactly as before.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(weather_df[categorical_cols])
# The categories learned for each column, in `categorical_cols` order.
encoder.categories_


In [None]:
# Names of the one-hot output columns generated for the categorical features.
feature_names = encoder.get_feature_names_out(categorical_cols)
encoded_cols = list(feature_names)
print(encoded_cols)

In [None]:
# One-hot encode the categorical training columns and attach them to `train_inputs`.
encoded_cols = encoder.get_feature_names_out(categorical_cols)  # output column names
encoded_arr = encoder.transform(train_inputs[categorical_cols])
train_dense = encoded_arr.toarray()  # densify once and reuse below
print(train_dense)
print(list(encoded_cols))

# Reuse the training index so the assignment below aligns row by row.
encoded_df = pd.DataFrame(train_dense, index=train_inputs.index, columns=encoded_cols)
print(encoded_df)

train_inputs[encoded_cols] = encoded_df
train_inputs


In [None]:
# One-hot encode the categorical columns of the validation and test splits.
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Validation split.
encoded_arr = encoder.transform(val_inputs[categorical_cols])
encoded_df = pd.DataFrame(encoded_arr.toarray(), index=val_inputs.index, columns=encoded_cols)
val_inputs[encoded_cols] = encoded_df

# Test split.
test_onehot = encoder.transform(test_inputs[categorical_cols]).toarray()
test_inputs[encoded_cols] = pd.DataFrame(test_onehot, index=test_inputs.index, columns=encoded_cols)
test_inputs

In [None]:
# Shape of each inputs/targets pair, to confirm the row counts line up.
for label, part in (
    ('train_inputs:', train_inputs),
    ('train_targets:', train_target),
    ('val_inputs:', val_inputs),
    ('val_targets:', val_target),
    ('test_inputs:', test_inputs),
    ('test_targets:', test_target),
):
    print(label, part.shape)

## Saving Processed Data to Disk

It can be useful to save processed data to disk, especially for really large datasets, to avoid repeating the preprocessing steps every time you start the Jupyter notebook. The parquet format is a fast and efficient format for saving and loading Pandas dataframes.

In [None]:
# Write the processed feature frames to parquet files in the working directory.
# Requires the pyarrow package: pip install pyarrow
for frame, path in (
    (train_inputs, 'train_inputs.parquet'),
    (val_inputs, 'val_inputs.parquet'),
    (test_inputs, 'test_inputs.parquet'),
):
    frame.to_parquet(path, engine='pyarrow')

In [None]:
# The targets are Series; convert each to a single-column DataFrame before
# writing it to its parquet file.
for target, path in (
    (train_target, 'train_target.parquet'),
    (val_target, 'val_target.parquet'),
    (test_target, 'test_target.parquet'),
):
    target.to_frame().to_parquet(path, engine='pyarrow')

In [None]:
# Read the saved training features back from the parquet file to check the round trip.
pd.read_parquet('./train_inputs.parquet')

In [None]:
# Read the saved training target back from its parquet file.
pd.read_parquet('./train_target.parquet')

In [None]:
# Read the saved test features back from their parquet file.
pd.read_parquet('./test_inputs.parquet')

In [None]:
# Read the saved test target back from its parquet file.
pd.read_parquet('./test_target.parquet')

In [None]:
# Read the saved validation features back from their parquet file.
pd.read_parquet('./val_inputs.parquet')

In [None]:
# Read the saved validation target back from its parquet file.
pd.read_parquet('./val_target.parquet')

In [116]:
# Bundle the fitted preprocessing objects and the column lists they were
# built with, so they can be saved and reloaded together.
aussie_rain = dict(
    imputer=imputer,
    scaler=scaler,
    encoder=encoder,
    input_cols=input_cols,
    target_col=target_col,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    encoded_cols=encoded_cols,
)

In [118]:
import joblib
# Persist the preprocessing bundle to disk; the call returns the list of
# file names written, which the notebook renders as the cell output.
joblib.dump(aussie_rain, filename='aussie_rain.joblib')


['aussie_rain.joblib']