In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [3]:
df=pd.read_excel('Appendicitis1.xlsx')

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Assume 'X' contains your features and 'y' contains your target variable
X = df.drop(columns=['Management', 'Severity', 'Diagnosis','BMI','Length_of_Stay','Diagnosis_Presumptive','Alvarado_Score','Paedriatic_Appendicitis_Score'])  # Features
y = df['Diagnosis']  # Target variable

# Convert categorical variables to numerical using one-hot encoding or label encoding
# Make sure all features are categorical for chi-square test
X = pd.get_dummies(X)  # One-hot encoding for simplicity, replace with appropriate encoding method

# Select top k features using chi-square test
k = 10  # Number of features to select
chi2_selector = SelectKBest(chi2, k=k)
X_kbest = chi2_selector.fit_transform(X, y)

# Get the indices of the selected features
selected_indices = chi2_selector.get_support(indices=True)

# Get the names of the selected features
selected_features = X.columns[selected_indices]

# Print the names of the selected features
print("Selected features:")
print(selected_features)

Selected features:
Index(['Sex', 'Weight', 'Migratory_Pain', 'Contralateral_Rebound_Tenderness',
       'Nausea', 'Loss_of_Appetite', 'WBC_Count', 'Neutrophil_Percentage',
       'Neutrophilia', 'CRP'],
      dtype='object')


In [5]:
# Create a new DataFrame with selected features
df1 = df.loc[:, selected_features].copy()

# Add 'Age' and 'Height' columns to the selected DataFrame
df1['Lower_Right_Abd_Pain'] = df['Lower_Right_Abd_Pain']
df1['Body_Temperature'] = df['Body_Temperature']
df1['Age'] = df['Age']
df1['Height'] = df['Height']
df1['Diagnosis'] = df['Diagnosis']

# Display the first few rows of the new DataFrame
print(df1.head())


   Sex  Weight  Migratory_Pain  Contralateral_Rebound_Tenderness  Nausea  \
0    0    37.0               0                                 1       0   
1    1    69.5               1                                 1       0   
2    0    62.0               0                                 1       0   
3    0    56.0               1                                 0       1   
4    0    45.0               0                                 1       1   

   Loss_of_Appetite  WBC_Count  Neutrophil_Percentage  Neutrophilia  CRP  \
0                 1       7700                   68.2             0  0.0   
1                 1       8100                   64.8             0  3.0   
2                 0      13200                   74.8             0  3.0   
3                 1      11400                   63.0             0  0.0   
4                 1       8100                   44.0             0  0.0   

   Lower_Right_Abd_Pain  Body_Temperature  Age  Height  Diagnosis  
0                 

In [6]:
X_resampled=X
y_resampled=y

In [7]:
from sklearn.linear_model import LogisticRegression
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 70.74829931972789
              precision    recall  f1-score   support

           0       0.68      0.85      0.75        78
           1       0.76      0.55      0.64        69

    accuracy                           0.71       147
   macro avg       0.72      0.70      0.70       147
weighted avg       0.72      0.71      0.70       147



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[66 12]
 [31 38]]


In [9]:
from sklearn.svm import SVC
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize SVM model
svm_model = SVC(kernel='linear', random_state=42)  # Linear kernel for linear SVM

# Train the model
svm_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.6802721088435374
              precision    recall  f1-score   support

           0       0.66      0.83      0.73        78
           1       0.73      0.51      0.60        69

    accuracy                           0.68       147
   macro avg       0.69      0.67      0.67       147
weighted avg       0.69      0.68      0.67       147



In [10]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[65 13]
 [34 35]]


In [11]:
from sklearn.naive_bayes import GaussianNB

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize Naive Bayes model
naive_bayes_model = GaussianNB()

# Train the model
naive_bayes_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = naive_bayes_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 74.14965986394559
              precision    recall  f1-score   support

           0       0.78      0.72      0.75        78
           1       0.71      0.77      0.74        69

    accuracy                           0.74       147
   macro avg       0.74      0.74      0.74       147
weighted avg       0.74      0.74      0.74       147



In [12]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[56 22]
 [16 53]]


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=0)

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = gb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 75.51020408163265
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        84
           1       0.75      0.63      0.69        63

    accuracy                           0.76       147
   macro avg       0.76      0.74      0.74       147
weighted avg       0.76      0.76      0.75       147



In [14]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[71 13]
 [23 40]]


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                test_size=0.2, random_state=42)

# Initialize KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  

# Train the model
knn_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 65.3061224489796
              precision    recall  f1-score   support

           0       0.64      0.81      0.71        78
           1       0.69      0.48      0.56        69

    accuracy                           0.65       147
   macro avg       0.66      0.64      0.64       147
weighted avg       0.66      0.65      0.64       147



In [16]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[63 15]
 [36 33]]


In [17]:
from catboost import CatBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.2, random_state=0)

# Initialize CatBoost model
catboost_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1,
                                    random_state=42)

# Train the model
catboost_model.fit(X_train, y_train)  # Assuming 'Sex' is a categorical feature

# Predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)


0:	learn: 0.6575583	total: 153ms	remaining: 2m 33s
1:	learn: 0.6329301	total: 165ms	remaining: 1m 22s
2:	learn: 0.6084609	total: 175ms	remaining: 58.3s
3:	learn: 0.5839499	total: 182ms	remaining: 45.4s
4:	learn: 0.5675471	total: 189ms	remaining: 37.6s
5:	learn: 0.5563059	total: 195ms	remaining: 32.4s
6:	learn: 0.5409333	total: 202ms	remaining: 28.6s
7:	learn: 0.5259026	total: 213ms	remaining: 26.4s
8:	learn: 0.5156608	total: 220ms	remaining: 24.2s
9:	learn: 0.5057049	total: 227ms	remaining: 22.5s
10:	learn: 0.4962197	total: 234ms	remaining: 21s
11:	learn: 0.4881067	total: 241ms	remaining: 19.8s
12:	learn: 0.4796864	total: 247ms	remaining: 18.8s
13:	learn: 0.4689543	total: 255ms	remaining: 17.9s
14:	learn: 0.4597312	total: 261ms	remaining: 17.1s
15:	learn: 0.4532660	total: 268ms	remaining: 16.5s
16:	learn: 0.4488978	total: 275ms	remaining: 15.9s
17:	learn: 0.4395756	total: 281ms	remaining: 15.3s
18:	learn: 0.4332939	total: 288ms	remaining: 14.9s
19:	learn: 0.4266154	total: 295ms	remaini

In [18]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[68 16]
 [27 36]]


In [19]:
from sklearn.ensemble import AdaBoostClassifier

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize the AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)

# Train the model
adaboost.fit(X_train, y_train)

# Predictions on the test set
y_pred = adaboost.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 72.10884353741497
              precision    recall  f1-score   support

           0       0.71      0.81      0.75        78
           1       0.74      0.62      0.68        69

    accuracy                           0.72       147
   macro avg       0.72      0.72      0.72       147
weighted avg       0.72      0.72      0.72       147



In [20]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[63 15]
 [26 43]]


In [21]:
from sklearn.tree import DecisionTreeClassifier
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                test_size=0.2, random_state=42)

# Initialize the Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the model
decision_tree.fit(X_train, y_train)

# Predictions on the test set
y_pred = decision_tree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy*100)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Decision Tree Accuracy: 64.62585034013605
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67        78
           1       0.62      0.62      0.62        69

    accuracy                           0.65       147
   macro avg       0.64      0.64      0.64       147
weighted avg       0.65      0.65      0.65       147



In [22]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[52 26]
 [26 43]]


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest.fit(X_train, y_train)

# Predictions on the test set
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy*100)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Random Forest Accuracy: 70.06802721088435
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.82      0.74        78
           1       0.74      0.57      0.64        69

    accuracy                           0.70       147
   macro avg       0.71      0.69      0.69       147
weighted avg       0.71      0.70      0.69       147



In [24]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[64 14]
 [30 39]]


In [25]:
from xgboost import XGBClassifier
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                            test_size=0.2, random_state=42)

# Initialize XGBoost classifier
model = XGBClassifier()

# Train the classifier
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 66.66666666666666
              precision    recall  f1-score   support

           0       0.66      0.78      0.71        78
           1       0.69      0.54      0.60        69

    accuracy                           0.67       147
   macro avg       0.67      0.66      0.66       147
weighted avg       0.67      0.67      0.66       147



In [26]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[61 17]
 [32 37]]


In [27]:
import lightgbm as lgb

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                    test_size=0.2, random_state=42)

# Define LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': len(y.unique()),  # Number of classes
    'metric': 'multi_error'  # Error rate for multiclass classification
}

# Convert dataset to LightGBM format
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Train the LightGBM model
model = lgb.train(params, train_set=train_data, num_boost_round=100, valid_sets=[test_data])

# Make predictions
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)  # Get the index of the maximum value along the row axis

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred_class))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 989
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 26
[LightGBM] [Info] Start training from score -0.509115
[LightGBM] [Info] Start training from score -0.918863
Accuracy: 70.74829931972789
              precision    recall  f1-score   support

           0       0.68      0.83      0.75        78
           1       0.75      0.57      0.64        69

    accuracy                           0.71       147
   macro avg       0.72      0.70      0.70       147
weighted avg       0.72      0.71      0.70       147



In [28]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[65 13]
 [30 39]]
