In [1]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Uploading training data into the dataframe named as df
train_data = pd.read_csv("C:/Users/arora/OneDrive/Desktop/CIND 820/Literature Review/Dataset/Training.csv")
test_data = pd.read_csv("C:/Users/arora/OneDrive/Desktop/CIND 820/Literature Review/Dataset/Testing (1).csv")

In [3]:
#checking the dimensions of the dataset (rows,column)
train_data.shape

(4920, 134)

In [4]:
test_data.shape

(42, 133)

# Data Cleaning

In [5]:
#drop the column which has NaN values in the dataset (Unnamed: 133)
train_data.dropna(how="all",axis=1,inplace=True)

In [6]:
#checking the length of columns
len(train_data.columns)

133

# Feature Selection

In [7]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


# Separate the features (X) and target variable (y)
X = test_data.drop('prognosis', axis=1)  
y = test_data['prognosis']

# Create an instance of the model you want to use for feature selection
model = LogisticRegression()

# Create an instance of the Recursive Feature Elimination (RFE) selector
selector = RFE(model, n_features_to_select=20)  

# Fit the selector on the data
selector.fit(X, y)

# Get the selected features
selected_features = X.columns[selector.support_]

# Print the selected features
print("Selected Features:")
print(selected_features)


Selected Features:
Index(['itching', 'skin_rash', 'chills', 'joint_pain', 'vomiting', 'fatigue',
       'lethargy', 'cough', 'high_fever', 'sweating', 'headache',
       'yellowish_skin', 'nausea', 'loss_of_appetite', 'abdominal_pain',
       'diarrhoea', 'yellowing_of_eyes', 'chest_pain', 'excessive_hunger',
       'irritability'],
      dtype='object')


# Feature Importance

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Separate the features (X) and target variable (y)
X = train_data.drop('prognosis', axis=1)  
y = train_data['prognosis']

# Create an instance of the Random Forest classifier
model = RandomForestClassifier()

# Train the Random Forest model
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort the features by importance in descending order
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("Feature Importances:")
print(feature_importances)

Feature Importances:
                   Feature  Importance
97             muscle_pain    0.020256
0                  itching    0.016930
106         family_history    0.014914
43       yellowing_of_eyes    0.014585
98       altered_sensorium    0.014428
..                     ...         ...
69   swollen_blood_vessels    0.002070
15             weight_gain    0.001532
55              congestion    0.001302
70     puffy_face_and_eyes    0.001025
45          fluid_overload    0.000000

[132 rows x 2 columns]


# Train Test Split

In [10]:
# Split the training data into features (X) and target variable (y)
X_train = train_data.drop('prognosis', axis=1)  
y_train = train_data['prognosis']

# Split the testing data into features (X) and target variable (y)
X_test = test_data.drop('prognosis', axis=1)  
y_test = test_data['prognosis']

#  Matthews Correlation Coefficient

In [11]:
from sklearn.metrics import matthews_corrcoef
# Make predictions on the testing data
predicted_labels = model.predict(X_test)

# Obtain the ground truth labels for the testing data
ground_truth_labels = y_test

# Calculate the Matthews correlation coefficient
mcc = matthews_corrcoef(ground_truth_labels, predicted_labels)

# Print the MCC score
print("Matthews Correlation Coefficient (MCC):", mcc)

Matthews Correlation Coefficient (MCC): 0.9761627906976744


# Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=1000, random_state=42,n_jobs=-1)
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)

In [14]:
from sklearn.metrics import classification_report

# Classification Model Validation
Confusion_Mat = pd.crosstab(y_test, y_pred)
print(Confusion_Mat) # R, C format (Actual = testY, Predicted = Test_Pred)

# Validation on Testset
print(classification_report(y_test, y_pred)) # Actual, Predicted

col_0                                    (vertigo) Paroymsal  Positional Vertigo  \
prognosis                                                                          
(vertigo) Paroymsal  Positional Vertigo                                        1   
AIDS                                                                           0   
Acne                                                                           0   
Alcoholic hepatitis                                                            0   
Allergy                                                                        0   
Arthritis                                                                      0   
Bronchial Asthma                                                               0   
Cervical spondylosis                                                           0   
Chicken pox                                                                    0   
Chronic cholestasis                                                         

# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
modellog = LogisticRegression()
modellog.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.9761904761904762


# Support Vector Classifier (SVC)

In [16]:
from sklearn.svm import SVC
modelsvc = SVC()
modelsvc.fit(X_train, y_train)
y_pred = modelsvc.predict(X_test)
from sklearn.metrics import accuracy_score 
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  1.0


# K-Nearest Neighbors (KNN)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
modelknn = KNeighborsClassifier(n_neighbors=3)
modelknn.fit(X_train, y_train)
y_pred = modelknn.predict(X_test)
from sklearn.metrics import accuracy_score #0.97
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  1.0


In [18]:
X_test.shape

(42, 132)

# Gaussian Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [20]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


# Cross validation

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import svm


# X represents the features (symptoms) and y represents the target (prognosis)
# Assign the features (symptoms) to X and the target (prognosis) to y
X = train_data.iloc[:, :-1]  # Assuming the features are in columns except the last one
y = train_data.iloc[:, -1]   # Assuming the target variable is in the last column

# Create an instance of the classifier you want to evaluate
clf = svm.SVC(kernel='linear', C=1, random_state=42)

# Create an instance of StratifiedKFold with the desired number of folds
k = 5
skf = StratifiedKFold(n_splits=k)

# Perform stratified k-fold cross-validation
accuracy_scores = cross_val_score(clf, X, y, cv=skf)

# Print the accuracy scores for each fold
for fold, accuracy in enumerate(accuracy_scores):
    print(f"Fold {fold+1}: Accuracy = {accuracy}")

# Print the mean and standard deviation of the accuracy scores
print(f"\nMean Accuracy: {accuracy_scores.mean()}")
print(f"Standard Deviation: {accuracy_scores.std()}")


Fold 1: Accuracy = 1.0
Fold 2: Accuracy = 1.0
Fold 3: Accuracy = 1.0
Fold 4: Accuracy = 1.0
Fold 5: Accuracy = 1.0

Mean Accuracy: 1.0
Standard Deviation: 0.0


# Hyperparameter Tuning

In [21]:

from sklearn.model_selection import GridSearchCV


# Define the model
model = RandomForestClassifier()

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

# Evaluate the model on the testing set
accuracy = best_model.score(X_test, y_test)

# Print the best hyperparameters and accuracy
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9761904761904762
