## Data Preprocessing

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Define the path to your dataset
file_path = '/Users/chayonimeu/Documents/GitHub/DLF_Perceptron-to-predict-diabetes/diabetes_scale.txt'

# Load the dataset into a pandas DataFrame (space-separated)
data = pd.read_csv(file_path, delimiter=' ', header=None)

# Inspect the first few rows and the shape of the dataset to understand its structure
print(data.head())  # Display the first few rows
print(f'Dataset shape: {data.shape}')  # Check the dataset dimensions

   0             1            2            3            4            5  \
0 -1   1:-0.294118   2:0.487437   3:0.180328  4:-0.292929         5:-1   
1  1   1:-0.882353  2:-0.145729  3:0.0819672  4:-0.414141         5:-1   
2 -1  1:-0.0588235   2:0.839196  3:0.0491803         4:-1         5:-1   
3  1   1:-0.882353  2:-0.105528  3:0.0819672  4:-0.535354  5:-0.777778   
4 -1          1:-1   2:0.376884  3:-0.344262  4:-0.292929  5:-0.602837   

              6            7             8   9  
0  6:0.00149028   7:-0.53117  8:-0.0333333 NaN  
1   6:-0.207153  7:-0.766866   8:-0.666667 NaN  
2   6:-0.305514  7:-0.492741   8:-0.633333 NaN  
3   6:-0.162444  7:-0.923997          8:-1 NaN  
4     6:0.28465   7:0.887276        8:-0.6 NaN  
Dataset shape: (768, 10)


### Cleaning the Dataset

In [2]:
# Function to clean the feature columns by removing the index prefixes like '1:', '2:', etc.
def clean_data(data):
    cleaned_data = data.applymap(lambda x: float(str(x).split(':')[-1]) if isinstance(x, str) and ':' in x else x)
    return cleaned_data

# Apply the cleaning function to the entire dataset, except the first column (labels)
cleaned_data = clean_data(data.iloc[:, 1:])

# Reattach the label column (first column) to the cleaned dataset
cleaned_data.insert(0, 'Label', data.iloc[:, 0])

# Inspect the cleaned dataset to ensure no data is lost
print(cleaned_data.head())
print(f'Cleaned data shape: {cleaned_data.shape}')

   Label         1         2         3         4         5         6  \
0     -1 -0.294118  0.487437  0.180328 -0.292929 -1.000000  0.001490   
1      1 -0.882353 -0.145729  0.081967 -0.414141 -1.000000 -0.207153   
2     -1 -0.058824  0.839196  0.049180 -1.000000 -1.000000 -0.305514   
3      1 -0.882353 -0.105528  0.081967 -0.535354 -0.777778 -0.162444   
4     -1 -1.000000  0.376884 -0.344262 -0.292929 -0.602837  0.284650   

          7         8   9  
0 -0.531170 -0.033333 NaN  
1 -0.766866 -0.666667 NaN  
2 -0.492741 -0.633333 NaN  
3 -0.923997 -1.000000 NaN  
4  0.887276 -0.600000 NaN  
Cleaned data shape: (768, 10)


  cleaned_data = data.applymap(lambda x: float(str(x).split(':')[-1]) if isinstance(x, str) and ':' in x else x)


In [4]:
# Remove the last column (column 9) that contains only NaN values
cleaned_data = cleaned_data.drop(columns=[9])

# Verify the result to ensure only the last column has been removed
print(cleaned_data.head())
print(f'Cleaned data shape: {cleaned_data.shape}')  # Should now have 9 columns (1 Label + 8 Features)

   Label         1         2         3         4         5         6  \
0     -1 -0.294118  0.487437  0.180328 -0.292929 -1.000000  0.001490   
1      1 -0.882353 -0.145729  0.081967 -0.414141 -1.000000 -0.207153   
2     -1 -0.058824  0.839196  0.049180 -1.000000 -1.000000 -0.305514   
3      1 -0.882353 -0.105528  0.081967 -0.535354 -0.777778 -0.162444   
4     -1 -1.000000  0.376884 -0.344262 -0.292929 -0.602837  0.284650   

          7         8  
0 -0.531170 -0.033333  
1 -0.766866 -0.666667  
2 -0.492741 -0.633333  
3 -0.923997 -1.000000  
4  0.887276 -0.600000  
Cleaned data shape: (768, 9)


### Splitting the Cleaned Dataset

In [5]:
from sklearn.model_selection import train_test_split

# Separate features (X) and labels (y)
X_cleaned = cleaned_data.iloc[:, 1:]  # Features (all columns except the first one)
y_cleaned = cleaned_data['Label']     # Labels (the first column)

# Split the cleaned data into training and test sets (80% training, 20% test)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

# Check the dimensions of the cleaned training and test sets
print(f'Cleaned Training set size: {X_train_cleaned.shape}')
print(f'Cleaned Test set size: {X_test_cleaned.shape}')

Cleaned Training set size: (614, 8)
Cleaned Test set size: (154, 8)


## Implementing a Baseline Linear Model

In [6]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the logistic regression model
logistic_model = LogisticRegression()

# Train the logistic regression model on the training set
logistic_model.fit(X_train_cleaned, y_train_cleaned)

# Predict on the test set
y_pred_baseline = logistic_model.predict(X_test_cleaned)

# Evaluate the baseline model
accuracy_baseline = accuracy_score(y_test_cleaned, y_pred_baseline)
print(f'Baseline Logistic Regression Accuracy: {accuracy_baseline * 100:.2f}%')

# Display classification report for more detailed metrics
print("Classification Report for Baseline Logistic Regression:")
print(classification_report(y_test_cleaned, y_pred_baseline))

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Using SMOTE to Handle Class Imbalance

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train the perceptron model on resampled data
perceptron_model_smote = Perceptron(max_iter=1000, eta0=1.0, random_state=42)
perceptron_model_smote.fit(X_resampled, y_resampled)

# Predict on the test set
y_pred_smote = perceptron_model_smote.predict(X_test)

# Evaluate the model
accuracy_smote = accuracy_score(y_test, y_pred_smote)
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote)
class_report_smote = classification_report(y_test, y_pred_smote)

# Display the results
print(f"Accuracy after SMOTE: {accuracy_smote}")
print("Confusion Matrix after SMOTE:")
print(conf_matrix_smote)
print("Classification Report after SMOTE:")
print(class_report_smote)

### Using Class Weights to Handle Class Imbalance

In [None]:
# Initialize and train the perceptron model with class weights
perceptron_model_weighted = Perceptron(max_iter=1000, eta0=1.0, random_state=42, class_weight='balanced')
perceptron_model_weighted.fit(X_train, y_train)

# Predict on the test set
y_pred_weighted = perceptron_model_weighted.predict(X_test)

# Evaluate the model
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
conf_matrix_weighted = confusion_matrix(y_test, y_pred_weighted)
class_report_weighted = classification_report(y_test, y_pred_weighted)

# Display the results
print(f"Accuracy with Class Weights: {accuracy_weighted}")
print("Confusion Matrix with Class Weights:")
print(conf_matrix_weighted)
print("Classification Report with Class Weights:")
print(class_report_weighted)

### Hyperparameter Tuning