In [None]:
#LOGISTIC REGRESSION
#DOWNLOADING DATASETS

In [None]:
!pip install opendatasets --upgrade --quiet

In [None]:
import opendatasets as od # Importing the 'opendatasets' library and giving it the alias 'od'

In [None]:
od.version()

In [None]:
dataset_url = 'https://www.kaggle.com/jsphyg/weather-dataset-rattle-package'

In [None]:
od.download(dataset_url) # Downloading the dataset from the provided URL

In [None]:
od.download(dataset_url)

In [None]:
import os # Importing the built-in 'os' module to interact with the operating system

In [None]:
data_dir = './weather-dataset-rattle-package' # Defining the directory path where the weather dataset will be stored or accessed

In [None]:
os.listdir(data_dir) # Listing all files and directories in the specified data directory

In [None]:
train_csv = data_dir + '/weatherAUS.csv' # Creating the full path to the CSV file containing the weather dataset

In [None]:
!pip install pandas --quiet
!pip install plotly matplotlib seaborn --quiet

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

In [None]:
raw_df = pd.read_csv(train_csv)
raw_df

In [None]:
raw_df.info()

In [None]:
# Rows without a RainToday / RainTomorrow label cannot be used, so discard them.
raw_df = raw_df.dropna(subset=['RainToday', 'RainTomorrow'])
raw_df.info()

In [None]:
### Exploratory Data Analysis and Visualization


# Global plot appearance: seaborn dark grid plus matplotlib defaults.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,                  # default font size for every plot
    'figure.figsize': (10, 6),        # width x height in inches
    'figure.facecolor': '#00000000',  # fully transparent figure background
})

In [None]:
px.histogram(raw_df, x='Location', title='Location vs. Rainy Days', color='RainToday') # Creating an interactive histogram using Plotly to show the number of rainy and non-rainy days by location

In [None]:
px.histogram(raw_df, 
             x='Temp3pm', 
             title='Temperature at 3 pm vs. Rain Tomorrow', 
             color='RainTomorrow')

In [None]:
# Histogram of RainTomorrow, coloured by RainToday.
# Original author's observation: predicting "no rain tomorrow" when it did not
# rain today is the easier case (about 92k instances).
px.histogram(raw_df, 
             x='RainTomorrow', 
             color='RainToday', 
             title='Rain Tomorrow vs. Rain Today')

In [None]:
# Plot a fixed random subset (seeded) so the figure is identical on every run.
px.scatter(raw_df.sample(2000, random_state=42),
           title='Min Temp. vs Max Temp.',
           x='MinTemp',
           y='MaxTemp',
           color='RainToday')

In [None]:
# Plot a fixed random subset (seeded) so the figure is identical on every run.
px.scatter(raw_df.sample(2000, random_state=42),
           title='Temp (3 pm) vs. Humidity (3 pm)',
           x='Temp3pm',
           y='Humidity3pm',
           color='RainTomorrow')

In [None]:
# Working with a Sample

In [None]:
use_sample = False # Setting a flag to indicate whether to use a sample of the dataset or the full dataset #False: Means the full dataset will be used instead of a smaller sample.

In [None]:
sample_fraction = 0.1

In [None]:
# Optionally shrink the dataset to `sample_fraction` of its rows for faster experiments.
# The sample is seeded so that every later split, fit and metric is reproducible.
if use_sample:
    raw_df = raw_df.sample(frac=sample_fraction, random_state=42).copy()

In [None]:
#Training, Validation and Test Sets

In [None]:
!pip install scikit-learn --upgrade --quiet

In [None]:
from sklearn.model_selection import train_test_split # Importing the train_test_split function from scikit-learn to split data into training and testing sets


In [None]:
# Random split, step 1: hold out 20% of all rows as the test set.
# NOTE: train_df, val_df and test_df are reassigned further down by the
# year-based split, which is what the rest of the notebook uses.
train_val_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
# Step 2: split the remaining 80% into 75% training / 25% validation,
# i.e. 60% / 20% of the full dataset.
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [None]:
# Report (rows, columns) for each of the three splits.
for shape_label, split_df in (('train_df.shape :', train_df),
                              ('val_df.shape :', val_df),
                              ('test_df.shape :', test_df)):
    print(shape_label, split_df.shape)

In [None]:
plt.title('No. of Rows per Year')
sns.countplot(x=pd.to_datetime(raw_df.Date).dt.year); # Creating a count plot to show the number of data entries (rows) per year in the dataset

In [None]:
# Chronological split: the model is trained on the past and evaluated on later years.
year = pd.to_datetime(raw_df['Date']).dt.year

train_df = raw_df.loc[year.lt(2015)]   # everything before 2015
val_df = raw_df.loc[year.eq(2015)]     # the year 2015 only
test_df = raw_df.loc[year.gt(2015)]    # everything after 2015

In [None]:
# Report (rows, columns) for each of the three splits.
for shape_label, split_df in (('train_df.shape :', train_df),
                              ('val_df.shape :', val_df),
                              ('test_df.shape :', test_df)):
    print(shape_label, split_df.shape)

In [None]:
train_df

In [None]:
val_df

In [None]:
test_df

In [None]:
#Identifying Input and Target Columns

In [None]:
# The last column is the label; the first column is skipped as well, so the
# inputs are every column strictly between the first and the last.
target_col = 'RainTomorrow'
input_cols = train_df.columns[1:-1].tolist()

In [None]:
print(input_cols)

In [None]:
target_col

In [None]:
# Separate the training frame into model inputs and the target column.
# .copy() ensures later modifications of train_inputs / train_targets do not
# affect train_df.
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [None]:
val_inputs = val_df[input_cols].copy() # Extracting the input features and target variable from the validation DataFrame
val_targets = val_df[target_col].copy()

In [None]:
test_inputs = test_df[input_cols].copy() # Extracting the input features and target variable from the test DataFrame
test_targets = test_df[target_col].copy()

In [None]:
train_inputs

In [None]:
train_targets

In [None]:
# Partition the input columns by type.
# Numeric columns are selected explicitly; every remaining (non-numeric) input
# column is treated as categorical. Using `exclude=np.number` rather than
# `include='object'` keeps this working when pandas stores text columns with a
# dedicated string dtype instead of `object`.
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes(exclude=np.number).columns.tolist()
numeric_cols

In [None]:
categorical_cols

In [None]:
train_inputs[numeric_cols].describe()

In [None]:
train_inputs[categorical_cols].nunique()

In [None]:
#IMPUTING MISSING VALUES


In [None]:
raw_df[numeric_cols].isna().sum()

In [None]:
train_inputs[numeric_cols].isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(strategy = 'mean')

In [None]:
# Learn the column means from the training split only: fitting on raw_df would
# let validation/test rows influence the preprocessing (data leakage).
imputer.fit(train_inputs[numeric_cols])

In [None]:
list(imputer.statistics_)

In [None]:
# Replace missing numeric values in every split using the fitted imputer.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = imputer.transform(split_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols].isna().sum()

In [None]:
#Scaling Numeric Features

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
?MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
# Learn each column's min/max from the training split only, so that the scaling
# does not depend on validation/test rows (data leakage). Values in the other
# splits may therefore fall slightly outside [0, 1] after scaling.
scaler.fit(train_inputs[numeric_cols])

In [None]:
print('Minimum:')
list(scaler.data_min_) # Printing the minimum values learned by the scaler for each feature

In [None]:
print('Maximum:')
list(scaler.data_max_)

In [None]:
# Rescale the numeric columns of every split with the fitted scaler.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols].describe()

In [None]:
train_inputs[numeric_cols]

In [None]:
#Encoding Categorical Data

In [None]:
raw_df[categorical_cols].nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# `sparse` was renamed to `sparse_output` (scikit-learn 1.2) and the old name was
# later removed; this notebook installs the latest scikit-learn, so the new
# keyword is required. Dense output lets the result be assigned to DataFrame
# columns; unseen categories are encoded as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
# Fit on the training split only (no leakage from validation/test) and on the
# same representation that is transformed later: missing values are replaced by
# 'unknown' before encoding, so 'unknown' must be a category the encoder knows.
# Fitting on the unfilled raw_df would instead learn a NaN category that never
# occurs at transform time and would encode 'unknown' as all zeros.
encoder.fit(train_inputs[categorical_cols].fillna('unknown'))

In [None]:
encoder.categories_

In [None]:
encoded_cols = list(encoder.get_feature_names_out(categorical_cols)) # Getting the names of the encoded features from the encoder and converting them to a list
print(encoded_cols)

In [None]:
# Step 1: replace missing categorical values with the placeholder 'unknown' in every split
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[categorical_cols] = split_inputs[categorical_cols].fillna('unknown')

In [None]:
# Append the one-hot encoded columns to every split.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[encoded_cols] = encoder.transform(split_inputs[categorical_cols])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
test_inputs

In [None]:
#Saving Processed Data to Disk

In [None]:
# Shapes of the processed inputs and targets, split by split.
for shape_label, split_data in (('train_inputs:', train_inputs),
                                ('train_targets:', train_targets),
                                ('val_inputs:', val_inputs),
                                ('val_targets:', val_targets),
                                ('test_inputs:', test_inputs),
                                ('test_targets:', test_targets)):
    print(shape_label, split_data.shape)

In [None]:
!pip install pyarrow --quiet

In [None]:
# Persist the processed input features of each split as a Parquet file.
for parquet_path, split_inputs in (('train_inputs.parquet', train_inputs),
                                   ('val_inputs.parquet', val_inputs),
                                   ('test_inputs.parquet', test_inputs)):
    split_inputs.to_parquet(parquet_path)

In [None]:
%%time
# Targets are Series; wrap each one in a single-column frame before writing it to Parquet.
for parquet_path, split_targets in (('train_targets.parquet', train_targets),
                                    ('val_targets.parquet', val_targets),
                                    ('test_targets.parquet', test_targets)):
    split_targets.to_frame().to_parquet(parquet_path)

In [None]:
#reading parquet back

In [None]:
%%time

# Reload the processed input features of each split.
train_inputs, val_inputs, test_inputs = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('train_inputs.parquet', 'val_inputs.parquet', 'test_inputs.parquet')
)

# Targets were stored as one-column frames; select the target column to get Series back.
train_targets, val_targets, test_targets = (
    pd.read_parquet(parquet_path)[target_col]
    for parquet_path in ('train_targets.parquet', 'val_targets.parquet', 'test_targets.parquet')
)

In [None]:
# Shapes of the reloaded inputs and targets, split by split.
for shape_label, split_data in (('train_inputs:', train_inputs),
                                ('train_targets:', train_targets),
                                ('val_inputs:', val_inputs),
                                ('val_targets:', val_targets),
                                ('test_inputs:', test_inputs),
                                ('test_targets:', test_targets)):
    print(shape_label, split_data.shape)

In [None]:
val_inputs

In [None]:
val_targets

In [None]:
#Training a Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression(solver='liblinear')

In [None]:
#?LogisticRegression

In [None]:
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)

In [None]:
print(numeric_cols + encoded_cols)

In [None]:
print(model.coef_.tolist())

In [None]:
print(model.intercept_)

In [None]:
n = len(model.coef_.tolist())

In [None]:
# One row per model feature with its learned coefficient
# (first row of coef_, which is the only row for a binary classifier).
feature_names = numeric_cols + encoded_cols
feature_weights = model.coef_[0]
weights_df = pd.DataFrame({'feature': feature_names, 'weight': feature_weights})

In [None]:
weights_df

In [None]:
# The figure must be created in the same cell as the plot: with the inline
# backend a figure is rendered and closed when its cell finishes, so a figsize
# set in a separate cell never applies to the bar plot.
plt.figure(figsize=(10, 50))
sns.barplot(data=weights_df, x='weight', y='feature');

In [None]:
sns.barplot(data=weights_df.sort_values('weight', ascending=False).head(10), x='weight', y ='feature')

In [None]:
#Making Predictions and Evaluating the Model

In [None]:
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [None]:
train_preds = model.predict(X_train)

In [None]:
train_preds

In [None]:
train_targets

In [None]:
train_probs = model.predict_proba(X_train)
train_probs #probabilistic prediction using predict_proba

In [None]:
model.classes_

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(train_targets, train_preds) #The model achieves an accuracy of 85.1% on the training set

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(train_targets, train_preds, normalize='true')

In [None]:
def predict_and_plot(inputs, targets, name=''):
    """Predict with the notebook-level ``model``, print accuracy and plot a confusion matrix.

    Args:
        inputs: Feature matrix accepted by ``model.predict``.
        targets: True labels aligned with ``inputs``.
        name: Text prepended to the plot title (e.g. ``'Training'``).

    Returns:
        The predictions returned by ``model.predict(inputs)``.
    """
    preds = model.predict(inputs)

    # Fraction of correct predictions, printed as a percentage with two decimals.
    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    # Pin the label order to model.classes_ so rows/columns of the matrix and
    # the tick labels below are guaranteed to agree. Rows are normalised, so
    # each row shows the distribution of predictions for one true class.
    class_labels = list(model.classes_)
    cf = confusion_matrix(targets, preds, labels=class_labels, normalize='true')

    plt.figure()
    sns.heatmap(cf, annot=True, xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name))

    return preds

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Training')

In [None]:
# Evaluate on the validation split (title typo 'Validatiaon' corrected).
val_preds = predict_and_plot(X_val, val_targets, 'Validation')

In [None]:
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
#The accuracy of the model on the test and validation set are above 84%, which suggests that our model generalizes well to data it hasn't seen before.

In [None]:
#Let's create two models: one that guesses randomly and another that always return "No". Both of these models completely ignore the inputs given to them.

In [None]:
def random_guess(inputs, seed=None):
    """Baseline "model" that ignores the inputs and guesses "No"/"Yes" uniformly at random.

    Args:
        inputs: Any sized collection; only ``len(inputs)`` is used.
        seed: Optional seed. When given, guesses are drawn from a dedicated
            generator seeded with it, so the result is reproducible. When
            omitted, NumPy's global random state is used, as before.

    Returns:
        NumPy array of ``len(inputs)`` strings, each "No" or "Yes".
    """
    if seed is None:
        return np.random.choice(["No", "Yes"], len(inputs))
    return np.random.default_rng(seed).choice(["No", "Yes"], len(inputs))

In [None]:
def all_no(inputs):
    """Baseline "model" that ignores the inputs and answers "No" for every row."""
    row_count = len(inputs)
    return np.repeat("No", row_count)

In [None]:
accuracy_score(test_targets, random_guess(X_test))

In [None]:
accuracy_score(test_targets, all_no(X_test)) #Our random model achieves an accuracy of 50% and our "always No" model achieves an accuracy of 77%.

In [None]:
#Making Predictions on a Single Input

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Katherine',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
new_input_df = pd.DataFrame([new_input]) #the first step is to convert the dictionary into a Pandas dataframe,


In [None]:
new_input_df

In [None]:
# Apply the same preprocessing steps, in the same order, as for the training data.
# Missing categorical values were replaced with 'unknown' before encoding there,
# so the same replacement is done here.
new_input_df[categorical_cols] = new_input_df[categorical_cols].fillna('unknown')
new_input_df[numeric_cols] = imputer.transform(new_input_df[numeric_cols])
new_input_df[numeric_cols] = scaler.transform(new_input_df[numeric_cols])
new_input_df[encoded_cols] = encoder.transform(new_input_df[categorical_cols])

In [None]:
X_new_input = new_input_df[numeric_cols + encoded_cols]
X_new_input

In [None]:
prediction = model.predict(X_new_input)[0]
prediction

In [None]:
prob = model.predict_proba(X_new_input)[0]


In [None]:
prob #Looks like our model isn't too confident about its prediction!

In [None]:
def predict_input(single_input):
    """Predict the target for a single observation.

    Builds a one-row DataFrame and applies the notebook-level fitted
    ``imputer``, ``scaler`` and ``encoder`` before querying ``model``.

    Args:
        single_input: Mapping of column name to value. It must provide every
            column listed in ``numeric_cols`` and ``categorical_cols``;
            values may be missing (NaN).

    Returns:
        Tuple ``(pred, prob)``: the predicted class label and the probability
        the model assigns to that label.
    """
    input_df = pd.DataFrame([single_input])
    # Mirror the training pipeline: missing categorical values were replaced
    # with 'unknown' before encoding, so do the same for new inputs.
    input_df[categorical_cols] = input_df[categorical_cols].fillna('unknown')
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])
    X_input = input_df[numeric_cols + encoded_cols]
    pred = model.predict(X_input)[0]
    # predict_proba columns follow model.classes_; pick the one for the predicted label.
    prob = model.predict_proba(X_input)[0][list(model.classes_).index(pred)]
    return pred, prob

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
predict_input(new_input)  #Try changing the values in new_input and observe how the predictions and probabilities change. Try different values of location, temperature, humidity, pressure etc

In [None]:
#Saving and Loading Trained Models

In [None]:
import joblib

In [None]:
aussie_rain = {
    'model': model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols
}

In [None]:
joblib.dump(aussie_rain, 'aussie_rain.joblib') #We can now save this to a file using `joblib.dump`

In [None]:
aussie_rain2 = joblib.load('aussie_rain.joblib')# load


In [None]:
test_preds2 = aussie_rain2['model'].predict(X_test)
accuracy_score(test_targets, test_preds2)