### Loading the Preprocessed Dataset

In [1]:
import numpy as np

# Function to load the dataset
def load_sparse_dataset(file_path):
    X = []
    y = []
    
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split()
            label = int(parts[0])  # The first element is the label (+1 or -1)
            features = np.zeros(8)  # Since we have 8 features in this dataset
            
            # Extract each feature's value
            for part in parts[1:]:
                index, value = part.split(':')
                features[int(index) - 1] = float(value)
            
            X.append(features)
            y.append(label)
    
    return np.array(X), np.array(y)

# Load the dataset
data_path = '/Users/chayonimeu/Documents/GitHub/DLF_Perceptron-to-predict-diabetes/diabetes_scale.txt'
X, y = load_sparse_dataset(data_path)

# Convert labels from {-1, 1} to {0, 1} if needed, as many algorithms expect binary labels in {0, 1}
y = np.where(y == -1, 0, 1)

print("Data loaded successfully!")

Data loaded successfully!


### Preprocessing the Data

In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training a Single-Layer Perceptron

In [3]:
from sklearn.linear_model import Perceptron

# Initialize the Perceptron model
perceptron_model = Perceptron(max_iter=1000, eta0=1.0, random_state=42)

# Train the model
perceptron_model.fit(X_train, y_train)

print("Model trained successfully!")

Model trained successfully!


### Evaluating the Model

In [4]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = perceptron_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Display the classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.7272727272727273
Confusion Matrix:
[[17 38]
 [ 4 95]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.31      0.45        55
           1       0.71      0.96      0.82        99

    accuracy                           0.73       154
   macro avg       0.76      0.63      0.63       154
weighted avg       0.75      0.73      0.69       154



### Using SMOTE to Handle Class Imbalance

In [6]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train the perceptron model on resampled data
perceptron_model_smote = Perceptron(max_iter=1000, eta0=1.0, random_state=42)
perceptron_model_smote.fit(X_resampled, y_resampled)

# Predict on the test set
y_pred_smote = perceptron_model_smote.predict(X_test)

# Evaluate the model
accuracy_smote = accuracy_score(y_test, y_pred_smote)
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote)
class_report_smote = classification_report(y_test, y_pred_smote)

# Display the results
print(f"Accuracy after SMOTE: {accuracy_smote}")
print("Confusion Matrix after SMOTE:")
print(conf_matrix_smote)
print("Classification Report after SMOTE:")
print(class_report_smote)

Accuracy after SMOTE: 0.7662337662337663
Confusion Matrix after SMOTE:
[[39 16]
 [20 79]]
Classification Report after SMOTE:
              precision    recall  f1-score   support

           0       0.66      0.71      0.68        55
           1       0.83      0.80      0.81        99

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154



### Using Class Weights to Handle Class Imbalance

In [7]:
# Initialize and train the perceptron model with class weights
perceptron_model_weighted = Perceptron(max_iter=1000, eta0=1.0, random_state=42, class_weight='balanced')
perceptron_model_weighted.fit(X_train, y_train)

# Predict on the test set
y_pred_weighted = perceptron_model_weighted.predict(X_test)

# Evaluate the model
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
conf_matrix_weighted = confusion_matrix(y_test, y_pred_weighted)
class_report_weighted = classification_report(y_test, y_pred_weighted)

# Display the results
print(f"Accuracy with Class Weights: {accuracy_weighted}")
print("Confusion Matrix with Class Weights:")
print(conf_matrix_weighted)
print("Classification Report with Class Weights:")
print(class_report_weighted)

Accuracy with Class Weights: 0.7077922077922078
Confusion Matrix with Class Weights:
[[27 28]
 [17 82]]
Classification Report with Class Weights:
              precision    recall  f1-score   support

           0       0.61      0.49      0.55        55
           1       0.75      0.83      0.78        99

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154



### Hyperparameter Tuning

In [8]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'eta0': [0.1, 0.5, 1.0, 5.0],  # Different learning rates
    'max_iter': [500, 1000, 2000]   # Different iterations
}

# Initialize Perceptron with grid search
perceptron_model_tuned = Perceptron(random_state=42)

# Perform grid search
grid_search = GridSearchCV(perceptron_model_tuned, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict on the test set with the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
class_report_best = classification_report(y_test, y_pred_best)

# Display the results
print(f"Best Parameters: {best_params}")
print(f"Accuracy after Hyperparameter Tuning: {accuracy_best}")
print("Confusion Matrix after Tuning:")
print(conf_matrix_best)
print("Classification Report after Tuning:")
print(class_report_best)

Best Parameters: {'eta0': 0.1, 'max_iter': 500}
Accuracy after Hyperparameter Tuning: 0.7402597402597403
Confusion Matrix after Tuning:
[[18 37]
 [ 3 96]]
Classification Report after Tuning:
              precision    recall  f1-score   support

           0       0.86      0.33      0.47        55
           1       0.72      0.97      0.83        99

    accuracy                           0.74       154
   macro avg       0.79      0.65      0.65       154
weighted avg       0.77      0.74      0.70       154



### Evaluate with Cross-Validation

In [9]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cross_val_scores = cross_val_score(perceptron_model_smote, X_train, y_train, cv=5)

# Display cross-validation results
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Average Cross-Validation Score: {cross_val_scores.mean()}")

Cross-Validation Scores: [0.66666667 0.77235772 0.67479675 0.73170732 0.40983607]
Average Cross-Validation Score: 0.6510729041716647


### Libraries

In [1]:
from sklearn.datasets import load_svmlight_file
import numpy as np
import pandas as pd

from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Load the Dataset

In [2]:
# Path to the dataset
data_path = '/Users/chayonimeu/Documents/GitHub/DLF_Perceptron-to-predict-diabetes/diabetes_scale.txt'

# Load the dataset (X: features, y: labels)
X, y = load_svmlight_file(data_path)

# Convert to dense format
X_dense = X.toarray()

# Display the first 5 examples (features and labels)
print("Features (first 5):\n", X_dense[:5])
print("Labels (first 5):\n", y[:5])

Features (first 5):
 [[-0.294118    0.487437    0.180328   -0.292929   -1.          0.00149028
  -0.53117    -0.0333333 ]
 [-0.882353   -0.145729    0.0819672  -0.414141   -1.         -0.207153
  -0.766866   -0.666667  ]
 [-0.0588235   0.839196    0.0491803  -1.         -1.         -0.305514
  -0.492741   -0.633333  ]
 [-0.882353   -0.105528    0.0819672  -0.535354   -0.777778   -0.162444
  -0.923997   -1.        ]
 [-1.          0.376884   -0.344262   -0.292929   -0.602837    0.28465
   0.887276   -0.6       ]]
Labels (first 5):
 [-1.  1. -1.  1. -1.]


#### Common Data Preprocessing

In [3]:
# Convert X_dense and y into a Pandas DataFrame for easier analysis
df_features = pd.DataFrame(X_dense)
df_labels = pd.DataFrame(y, columns=['Label'])

# Concatenate features and labels for easy reference
df = pd.concat([df_features, df_labels], axis=1)

# 1. Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# 2. Check data types
print("\nData types of the features:\n", df.dtypes)

# 3. Ensure feature scaling (checking min and max values)
print("\nFeature value ranges (min, max):\n", df_features.describe().loc[['min', 'max']])

# 4. Check for class imbalance
class_counts = df_labels['Label'].value_counts()
print("\nClass distribution:\n", class_counts)

Missing values in each column:
 0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
Label    0
dtype: int64

Data types of the features:
 0        float64
1        float64
2        float64
3        float64
4        float64
5        float64
6        float64
7        float64
Label    float64
dtype: object

Feature value ranges (min, max):
        0    1    2    3    4    5    6    7
min -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
max  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0

Class distribution:
 Label
 1.0    500
-1.0    268
Name: count, dtype: int64


In [4]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)

# Initialize the Perceptron model with class weights to balance the classes
perceptron = Perceptron(class_weight='balanced')

# Train the model
perceptron.fit(X_train, y_train)

# Predict on the test set
y_pred = perceptron.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6948051948051948


In [5]:
perceptron = Perceptron(eta0=0.01, max_iter=1000, class_weight='balanced')

In [6]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(perceptron, X_dense, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", np.mean(cv_scores))

Cross-validation scores: [0.74675325 0.74025974 0.62987013 0.75816993 0.34640523]
Average cross-validation score: 0.644291656056362


In [7]:
from sklearn.linear_model import LogisticRegression

# Initialize and train a logistic regression model
logistic_reg = LogisticRegression(class_weight='balanced')
logistic_reg.fit(X_train, y_train)

# Evaluate the accuracy
y_pred_log = logistic_reg.predict(X_test)
accuracy_log = accuracy_score(y_test, y_pred_log)
print("Logistic Regression Accuracy:", accuracy_log)

Logistic Regression Accuracy: 0.7077922077922078


In [8]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[41 14]
 [33 66]]


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import numpy as np

# Assuming your X_dense and y are already loaded from the dataset

# Step 1: Handle Class Imbalance with SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_dense, y)

# Step 2: Split the Dataset into Training and Test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 3: Initialize and Train the Perceptron Model
perceptron = Perceptron(class_weight='balanced', max_iter=1000, eta0=0.01)
perceptron.fit(X_train, y_train)

# Step 4: Evaluate the Model

# Predict on the test set
y_pred = perceptron.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Classification Report (Precision, Recall, F1-score)
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# Cross-Validation (5-fold CV) on the resampled dataset
cv_scores = cross_val_score(perceptron, X_resampled, y_resampled, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", np.mean(cv_scores))

Accuracy: 0.7
Confusion Matrix:
 [[55 46]
 [14 85]]
Classification Report:
               precision    recall  f1-score   support

        -1.0       0.80      0.54      0.65       101
         1.0       0.65      0.86      0.74        99

    accuracy                           0.70       200
   macro avg       0.72      0.70      0.69       200
weighted avg       0.72      0.70      0.69       200

Cross-validation scores: [0.585 0.685 0.52  0.62  0.57 ]
Average cross-validation score: 0.596
