In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

In [3]:
df=pd.read_excel('Appendicitis1.xlsx')

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Assume 'X' contains your features and 'y' contains your target variable
X = df.drop(columns=['Management', 'Severity', 'Diagnosis','BMI','Length_of_Stay',
                     'Diagnosis_Presumptive','Alvarado_Score','Paedriatic_Appendicitis_Score'])  # Features
y = df['Diagnosis']  # Target variable
X = pd.get_dummies(X)  

# Select top k features using chi-square test
k = 10  # Number of features to select
chi2_selector = SelectKBest(chi2, k=k)
X_kbest = chi2_selector.fit_transform(X, y)

# Get the indices of the selected features
selected_indices = chi2_selector.get_support(indices=True)
# Get the names of the selected features
selected_features = X.columns[selected_indices]

# Print the names of the selected features
print("Selected features:")
print(selected_features)

Selected features:
Index(['Sex', 'Weight', 'Migratory_Pain', 'Contralateral_Rebound_Tenderness',
       'Nausea', 'Loss_of_Appetite', 'WBC_Count', 'Neutrophil_Percentage',
       'Neutrophilia', 'CRP'],
      dtype='object')


In [5]:
# Create a new DataFrame with selected features
df1 = df.loc[:, selected_features].copy()

# Add 'Age' and 'Height' columns to the selected DataFrame
df1['Lower_Right_Abd_Pain'] = df['Lower_Right_Abd_Pain']
df1['Body_Temperature'] = df['Body_Temperature']
df1['Age'] = df['Age']
df1['Height'] = df['Height']
df1['Diagnosis'] = df['Diagnosis']

# Display the first few rows of the new DataFrame
print(df1.head())


   Sex  Weight  Migratory_Pain  Contralateral_Rebound_Tenderness  Nausea  \
0    0    37.0               0                                 1       0   
1    1    69.5               1                                 1       0   
2    0    62.0               0                                 1       0   
3    0    56.0               1                                 0       1   
4    0    45.0               0                                 1       1   

   Loss_of_Appetite  WBC_Count  Neutrophil_Percentage  Neutrophilia  CRP  \
0                 1       7700                   68.2             0  0.0   
1                 1       8100                   64.8             0  3.0   
2                 0      13200                   74.8             0  3.0   
3                 1      11400                   63.0             0  0.0   
4                 1       8100                   44.0             0  0.0   

   Lower_Right_Abd_Pain  Body_Temperature  Age  Height  Diagnosis  
0                 

In [6]:
'''from imblearn.over_sampling import SMOTE
import pandas as pd

# Assume 'X' contains your features and 'y' contains your target variable
X = df2.drop(columns=['Diagnosis'])  # Features
y = df2['Diagnosis']  # Target variable

# Initialize SMOTE
smote = SMOTE(random_state=0)

# Perform SMOTE oversampling
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print the shape of the resampled data
print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)'''

'from imblearn.over_sampling import SMOTE\nimport pandas as pd\n\n# Assume \'X\' contains your features and \'y\' contains your target variable\nX = df2.drop(columns=[\'Diagnosis\'])  # Features\ny = df2[\'Diagnosis\']  # Target variable\n\n# Initialize SMOTE\nsmote = SMOTE(random_state=0)\n\n# Perform SMOTE oversampling\nX_resampled, y_resampled = smote.fit_resample(X, y)\n\n# Print the shape of the resampled data\nprint("Shape of X_resampled:", X_resampled.shape)\nprint("Shape of y_resampled:", y_resampled.shape)'

In [7]:
# Count occurrences of class 0 (non-appendicitis)
count_non_appendicitis = sum(y == 1)

# Count occurrences of class 1 (appendicitis)
count_appendicitis = sum(y == 0)

print("Number of non-appendicitis samples:", count_non_appendicitis)
print("Number of appendicitis samples:", count_appendicitis)

Number of non-appendicitis samples: 302
Number of appendicitis samples: 429


In [7]:
from imblearn.over_sampling import SMOTE

X = df1.drop(columns=['Diagnosis'])  
y = df1['Diagnosis']  

# Count the number of samples in each class
non_appendicitis_count = np.sum(y == 1)
appendicitis_count = np.sum(y == 0)
# Desired number of rows
desired_rows = 1100

# Calculate the number of synthetic samples needed for each class
non_appendicitis_synthetic = max(desired_rows - non_appendicitis_count, 0)
appendicitis_synthetic = max(desired_rows - appendicitis_count, 0)

# Apply SMOTE with the adjusted sampling strategy
smote = SMOTE(sampling_strategy={1: non_appendicitis_synthetic, 0: appendicitis_synthetic},
              random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the shape of the resampled dataset
print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)

Shape of X_resampled: (1469, 14)
Shape of y_resampled: (1469,)


In [8]:
# Count occurrences of class 0 (non-appendicitis)
count_non_appendicitis = sum(y_resampled == 1)

# Count occurrences of class 1 (appendicitis)
count_appendicitis = sum(y_resampled == 0)

print("Number of non-appendicitis samples:", count_non_appendicitis)
print("Number of appendicitis samples:", count_appendicitis)

Number of non-appendicitis samples: 798
Number of appendicitis samples: 671


In [9]:
from sklearn.linear_model import LogisticRegression
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                            y_resampled, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 72.10884353741497
              precision    recall  f1-score   support

           0       0.65      0.70      0.67       121
           1       0.78      0.73      0.76       173

    accuracy                           0.72       294
   macro avg       0.71      0.72      0.72       294
weighted avg       0.73      0.72      0.72       294



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 85  36]
 [ 46 127]]


In [11]:
from sklearn.svm import SVC
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize SVM model
svm_model = SVC(kernel='linear', random_state=42)  # Linear kernel for linear SVM

# Train the model
svm_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.7448979591836735
              precision    recall  f1-score   support

           0       0.67      0.74      0.70       121
           1       0.80      0.75      0.78       173

    accuracy                           0.74       294
   macro avg       0.74      0.74      0.74       294
weighted avg       0.75      0.74      0.75       294



In [12]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 89  32]
 [ 43 130]]


In [14]:
from sklearn.naive_bayes import GaussianNB

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize Naive Bayes model
naive_bayes_model = GaussianNB()

# Train the model
naive_bayes_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = naive_bayes_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 73.12925170068027
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       121
           1       0.76      0.80      0.78       173

    accuracy                           0.73       294
   macro avg       0.72      0.72      0.72       294
weighted avg       0.73      0.73      0.73       294



In [15]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 76  45]
 [ 34 139]]


In [16]:
from sklearn.ensemble import GradientBoostingClassifier
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=0)

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = gb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 76.87074829931973
              precision    recall  f1-score   support

           0       0.74      0.73      0.73       128
           1       0.79      0.80      0.80       166

    accuracy                           0.77       294
   macro avg       0.76      0.76      0.76       294
weighted avg       0.77      0.77      0.77       294



In [17]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 93  35]
 [ 33 133]]


In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                test_size=0.2, random_state=42)

# Initialize KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  
# Train the model
knn_model.fit(X_train, y_train)
# Predictions on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 78.2312925170068
              precision    recall  f1-score   support

           0       0.74      0.72      0.73       121
           1       0.81      0.83      0.82       173

    accuracy                           0.78       294
   macro avg       0.78      0.77      0.77       294
weighted avg       0.78      0.78      0.78       294



In [19]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 87  34]
 [ 30 143]]


In [20]:
from catboost import CatBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.2, random_state=0)

# Initialize CatBoost model
catboost_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1,
                                    random_state=42)

# Train the model
catboost_model.fit(X_train, y_train)  # Assuming 'Sex' is a categorical feature

# Predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)


0:	learn: 0.6539504	total: 601ms	remaining: 10m
1:	learn: 0.6108811	total: 614ms	remaining: 5m 6s
2:	learn: 0.5830414	total: 636ms	remaining: 3m 31s
3:	learn: 0.5601214	total: 664ms	remaining: 2m 45s
4:	learn: 0.5410271	total: 688ms	remaining: 2m 16s
5:	learn: 0.5237624	total: 768ms	remaining: 2m 7s
6:	learn: 0.5077534	total: 786ms	remaining: 1m 51s
7:	learn: 0.4953984	total: 868ms	remaining: 1m 47s
8:	learn: 0.4830125	total: 892ms	remaining: 1m 38s
9:	learn: 0.4724396	total: 931ms	remaining: 1m 32s
10:	learn: 0.4645508	total: 966ms	remaining: 1m 26s
11:	learn: 0.4566939	total: 988ms	remaining: 1m 21s
12:	learn: 0.4488684	total: 1.02s	remaining: 1m 17s
13:	learn: 0.4413223	total: 1.03s	remaining: 1m 12s
14:	learn: 0.4345914	total: 1.04s	remaining: 1m 8s
15:	learn: 0.4262999	total: 1.05s	remaining: 1m 4s
16:	learn: 0.4208980	total: 1.07s	remaining: 1m 1s
17:	learn: 0.4142568	total: 1.08s	remaining: 59.1s
18:	learn: 0.4102758	total: 1.11s	remaining: 57.5s
19:	learn: 0.4052570	total: 1.14

In [21]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 99  29]
 [ 24 142]]


In [22]:
from sklearn.ensemble import AdaBoostClassifier

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize the AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)

# Train the model
adaboost.fit(X_train, y_train)

# Predictions on the test set
y_pred = adaboost.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 75.51020408163265
              precision    recall  f1-score   support

           0       0.70      0.71      0.70       121
           1       0.80      0.79      0.79       173

    accuracy                           0.76       294
   macro avg       0.75      0.75      0.75       294
weighted avg       0.76      0.76      0.76       294



In [23]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 86  35]
 [ 37 136]]


In [24]:
from sklearn.tree import DecisionTreeClassifier
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                test_size=0.2, random_state=42)

# Initialize the Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
# Train the model
decision_tree.fit(X_train, y_train)
# Predictions on the test set
y_pred = decision_tree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy*100)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Decision Tree Accuracy: 78.91156462585033
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.78      0.75       121
           1       0.84      0.80      0.82       173

    accuracy                           0.79       294
   macro avg       0.78      0.79      0.78       294
weighted avg       0.79      0.79      0.79       294



In [25]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 94  27]
 [ 35 138]]


In [26]:
from sklearn.ensemble import RandomForestClassifier
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest.fit(X_train, y_train)
# Predictions on the test set
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy*100)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Random Forest Accuracy: 81.63265306122449
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       121
           1       0.85      0.84      0.84       173

    accuracy                           0.82       294
   macro avg       0.81      0.81      0.81       294
weighted avg       0.82      0.82      0.82       294



In [27]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 95  26]
 [ 28 145]]


In [28]:
from xgboost import XGBClassifier
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                            test_size=0.2, random_state=42)

# Initialize XGBoost classifier
model = XGBClassifier()
# Train the classifier
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)
# Classification report
print(classification_report(y_test, y_pred))


Accuracy: 85.37414965986395
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       121
           1       0.87      0.88      0.88       173

    accuracy                           0.85       294
   macro avg       0.85      0.85      0.85       294
weighted avg       0.85      0.85      0.85       294



In [29]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 98  23]
 [ 20 153]]


In [30]:
import lightgbm as lgb
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                    test_size=0.2, random_state=42)
# Define LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': len(y.unique()),  # Number of classes
    'metric': 'multi_error'  # Error rate for multiclass classification
}
# Convert dataset to LightGBM format
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Train the LightGBM model
model = lgb.train(params, train_set=train_data, num_boost_round=100, valid_sets=[test_data])
# Make predictions
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)  # Get the index of the maximum value along the row axis

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy*100)

# Classification report
print(classification_report(y_test, y_pred_class))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 1175, number of used features: 14
[LightGBM] [Info] Start training from score -0.759105
[LightGBM] [Info] Start training from score -0.631272
Accuracy: 86.39455782312925
              precision    recall  f1-score   support

           0       0.86      0.80      0.83       121
           1       0.87      0.91      0.89       173

    accuracy                           0.86       294
   macro avg       0.86      0.85      0.86       294
weighted avg       0.86      0.86      0.86       294



In [31]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 97  24]
 [ 16 157]]
