# Clean up exercise 11/09 2024 by Ahmad Alkaseb

Status of each exercise:
* 25: DONE
* 26: DONE
* 27: DONE

In [3]:
# Import the needed libraries
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [4]:
# Read the weather dataset from 'weather.csv' (path is relative to the
# notebook's working directory) into a pandas DataFrame.
df = pd.read_csv('weather.csv')

In [5]:
# One-hot encode the categorical (text) columns into dummy indicator columns.
# drop_first=True drops the first level of each encoded column (k-1 dummies
# for k levels), so the dummy columns are not perfectly collinear.
df_dummies = pd.get_dummies(df, drop_first=True)

In [9]:
# Shuffle the rows of the encoded data; the fixed random_state (42) makes the
# resulting row order reproducible across runs.
df_shuffled = shuffle(df_dummies, random_state=42)

In [11]:
# Separate the target from the predictors
DV = 'Rain'  # name of the dependent variable (target) column
y = df_shuffled[DV]  # target: the DV column on its own
X = df_shuffled.drop(columns=[DV])  # features: every column except the DV

In [13]:
# Hold out 33% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Instantiate a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [17]:
# Train the model: .fit takes two arguments, the training features and the training labels.
model.fit(X_train, y_train)

In [19]:
# Pull the fitted parameters off the model: the intercept term and the
# per-feature coefficients.
intercept, coefficients = model.intercept_, model.coef_

In [21]:
# Take the first row of the coefficient array as a Python list
coef_list = list(coefficients[0])

In [23]:
# Pair every feature name with its fitted coefficient in a two-column table:
#   Feature     - column name taken from the training data
#   Coefficient - fitted coefficient for that column
coef_df = pd.DataFrame({
    'Feature': X_train.columns.tolist(),
    'Coefficient': coef_list,
})

In [25]:
# Display the coefficient table (a bare last expression is rendered as the cell output)
coef_df

Unnamed: 0,Feature,Coefficient
0,Temperature_c,6.145762
1,Humidity,-0.028203
2,Wind_Speed_kmh,-0.06838
3,Wind_Bearing_degrees,-0.002544
4,Visibility_km,0.057437
5,Pressure_millibars,0.000571
6,Description_Normal,0.191576
7,Description_Warm,0.094052


In [27]:
# Generate predicted probabilities for the test set.
# model.predict_proba(X_test) returns one column of probabilities per class,
# ordered as in model.classes_.
# [:,1] keeps the second column, i.e. the probability of the second class
# (label 1, assuming the target is coded 0/1 -- confirm with model.classes_).
predicted_prob = model.predict_proba(X_test)[:,1]

In [29]:
# Predict the class label for each row of the test set
predicted_class = model.predict(X_test)

In [31]:
# Evaluate performance with a confusion matrix (rows = actual class, columns = predicted class).
# The row/column labels are hard-coded for a binary target, so the DataFrame
# constructor raises if the matrix is not 2x2.
cm = pd.DataFrame(
    confusion_matrix(y_test, predicted_class),
    index=['Actual No', 'Actual Yes'],
    columns=['Predicted No', 'Predicted Yes'],
)
cm['Total'] = cm.sum(axis=1)  # row totals: number of test rows per actual class

# Column totals as a one-row frame labelled 'Total'. The label is kept by the
# concat below (no ignore_index), so nothing has to be renamed afterwards.
column_totals = cm.sum(axis=0).to_frame(name='Total').T
cm = pd.concat([cm, column_totals])

In [33]:
# Display the confusion matrix with totals (a bare last expression is rendered as the cell output)
cm

Unnamed: 0,Predicted No,Predicted Yes,Total
Actual No,377,6,383
Actual Yes,9,2908,2917
Total,386,2914,3300


In [35]:
# Print per-class precision, recall and F1 for the test-set predictions.
# classification_report returns formatted text, so print() keeps its layout.
print(classification_report(y_test, predicted_class))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       383
           1       1.00      1.00      1.00      2917

    accuracy                           1.00      3300
   macro avg       0.99      0.99      0.99      3300
weighted avg       1.00      1.00      1.00      3300



In [43]:
# Candidate hyperparameter values for the grid search (see the LogisticRegression documentation):
#   penalty - regularisation type to try
#   C       - ten evenly spaced values from 1 to 10
grid = {
    'penalty': ['l1', 'l2'],
    'C': np.linspace(start=1, stop=10, num=10),
}

In [45]:
# Wrap a liblinear logistic regression in a grid search: 5-fold cross-validation
# over `grid`, selecting the parameter combination with the best F1 score.
# random_state pins liblinear's internal data shuffling so the search is
# reproducible, in line with the fixed seeds used for the shuffle and the split.
# NOTE: this rebinds `model`, replacing the LogisticRegression fitted above.
model = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    grid,
    scoring='f1',
    cv=5,
)

In [47]:
# Fit the grid search on the training data, evaluating every parameter
# combination in the grid with cross-validation.
model.fit(X_train, y_train)



In [51]:
# Print the parameter combination that scored best in the grid search
best_parameters = model.best_params_
print(best_parameters)

{'C': 2.0, 'penalty': 'l1'}
