In [6]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [50]:
# data packages
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import StandardScaler

# splits and hyper paramater tuning
from sklearn.model_selection import train_test_split, GridSearchCV

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# metrics for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [49]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 
  
pd.concat([X, y], axis=1).head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [66]:
unique_elements, counts = np.unique(y, return_counts=True)

# Print the results
for element, count in zip(unique_elements, counts):
    print(f"Element {element}: {count} occurrences")

Element 0: 2788 occurrences
Element 1: 1813 occurrences


In [52]:
# Standardize data for logistic regression

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the data and transform it
X_scaled = scaler.fit_transform(X)

# Convert the numpy array back to a DataFrame with the same columns as the original
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [61]:
# Ensure y is a 1D array
y = y.ravel()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=47)

# Initialize the Logistic Regression model
logreg = LogisticRegression()

# Define the hyperparameter grid for GridSearchCV
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1],
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'max_iter': [2000, 3000, 5000]
    },
    {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1],
        'solver': ['saga'],
        'max_iter': [2000, 3000, 5000]
    }
]

# Initialize GridSearchCV with the logistic regression model and the parameter grid
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1  # Use all available CPU cores
)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

In [62]:
# Get tuned paramaters
print(f'{grid_search.best_params_=}')
print(f'{grid_search.best_score_=}')

grid_search.best_params_={'C': 0.1, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'newton-cg'}
grid_search.best_score_=np.float64(0.9182065217391304)


In [63]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=47)

# logistic regression with tuned parameters
logreg = LogisticRegression(C=0.001, max_iter=2000, solver='saga', penalty='l2')

# fit the data to the model
logreg.fit(X_train, y_train)

# get predictions on testing and training sets
y_pred = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)

# test overfitting
print(f'Training precision: {precision_score(y_train, y_pred_train)}')
print(f'Testing precision: {precision_score(y_test, y_pred)}')

Training precision: 0.9256342957130359
Testing precision: 0.9379310344827586


In [78]:
coefficients = logreg.coef_[0]
features = X.columns 

plt.figure(figsize=(10, 10))
plt.barh(features, coefficients, color='b')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Logistic Regression Coefficients')
plt.show()

NameError: name 'plt' is not defined