In [4]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [7]:
df_train = pd.read_csv('git_extraction/DF_Radiomics_noduls_with_diagnose_train_data.csv')
df_test = pd.read_csv('git_extraction/DF_Radiomics_noduls_with_diagnose_test_data.csv')

df_train.head()

Unnamed: 0,Patient,Node,Labels,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_firstorder_10Percentile,original_firstorder_90Percentile,...,diagnostics_Mask-original_BoundingBox_2,diagnostics_Mask-original_BoundingBox_3,diagnostics_Mask-original_BoundingBox_4,diagnostics_Mask-original_BoundingBox_5,diagnostics_Mask-original_CenterOfMassIndex_0,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMassIndex_2,diagnostics_Mask-original_CenterOfMass_0,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Mask-original_CenterOfMass_2
0,LIDC-IDRI-0137,Node_N1,3,-671.885608,-2048.0,3071.0,26,1,175.5,850.5,...,30,4,6,2,332.692308,389.538462,30.307692,53.215868,83.626926,-321.730769
1,LIDC-IDRI-0377,Node_N1,2,-882.321409,-3024.0,3071.0,2402,1,-307.0,61.0,...,169,29,24,9,382.402998,308.854288,173.03955,92.739302,28.898399,-68.460564
2,LIDC-IDRI-0167,Node_N1,1,-664.766231,-2048.0,3071.0,56,1,-444.5,-66.5,...,50,6,9,2,70.267857,174.964286,50.321429,-136.23778,-53.812866,-234.696429
3,LIDC-IDRI-0272,Node_N1,3,-824.358062,-2048.0,3071.0,51,1,-447.0,102.0,...,81,6,7,2,209.313725,390.941176,81.568627,-47.673652,80.722794,-109.078431
4,LIDC-IDRI-0234,Node_N1,1,-708.012378,-2048.0,3029.0,251,1,-569.0,82.0,...,41,11,14,3,367.756972,310.848606,41.689243,65.179121,43.765426,-236.276892


In [26]:


# split features and labels
X_train = df_train.drop(['Patient', 'Node', 'Labels'], axis=1)
y_train = df_train['Labels']

X_test = df_test.drop(['Patient', 'Node', 'Labels'], axis=1)
y_test = df_test['Labels']

# Create and fit the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predictions
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


#use the classification report to evaluate the model

print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           1       0.50      0.10      0.17        20
           2       0.27      0.19      0.22        16
           3       0.49      0.88      0.63        26

    accuracy                           0.45        62
   macro avg       0.42      0.39      0.34        62
weighted avg       0.44      0.45      0.38        62



In [28]:
# Apply PCA
pca = PCA(n_components=8)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


# Create and fit the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_pca, y_train)

# Predictions
y_pred = logistic_model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

#use the classification report to evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.50      0.10      0.17        20
           2       0.00      0.00      0.00        16
           3       0.47      1.00      0.64        26

    accuracy                           0.45        62
   macro avg       0.32      0.37      0.27        62
weighted avg       0.36      0.45      0.32        62



In [29]:
# Standardize the data before PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

best_n_components = 0
best_accuracy = 0
best_classification_report = None

# Iterate over different numbers of PCA components
for n_components in range(1, X_train_scaled.shape[1] + 1):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train_pca, y_train)

    y_pred = logistic_model.predict(X_test_pca)

    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > best_accuracy:
        best_n_components = n_components
        best_accuracy = accuracy
        best_classification_report = classification_report(y_test, y_pred)

print(f"Best Number of PCA Components: {best_n_components}")
print(f"Best Accuracy: {best_accuracy}")
print("Best Classification Report:\n", best_classification_report)

Best Number of PCA Components: 40
Best Accuracy: 0.6451612903225806
Best Classification Report:
               precision    recall  f1-score   support

           1       0.67      0.40      0.50        20
           2       0.69      0.69      0.69        16
           3       0.62      0.81      0.70        26

    accuracy                           0.65        62
   macro avg       0.66      0.63      0.63        62
weighted avg       0.65      0.65      0.63        62

