In [1]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Uploading training data into the dataframe named as df
train_data = pd.read_csv("C:/Users/arora/OneDrive/Desktop/CIND 820/Literature Review/Dataset/Training.csv")
test_data = pd.read_csv("C:/Users/arora/OneDrive/Desktop/CIND 820/Literature Review/Dataset/Testing (1).csv")

In [3]:
#checking the dimensions of the dataset (rows,column)
train_data.shape

(4920, 134)

In [4]:
test_data.shape

(42, 133)

# Data Cleaning

In [5]:
#drop the column which has NaN values in the dataset (Unnamed: 133)
train_data.dropna(how="all",axis=1,inplace=True)

In [6]:
# Check the number of rows before removing duplicates
print("Number of rows before removing duplicates:", len(train_data))

# Remove duplicate rows
train_data.drop_duplicates(inplace=True)

# Check the number of rows after removing duplicates
print("Number of rows after removing duplicates:", len(train_data))

Number of rows before removing duplicates: 4920
Number of rows after removing duplicates: 304


In [7]:
# Check the number of rows before removing duplicates
print("Number of rows before removing duplicates:", len(test_data))

# Remove duplicate rows
test_data.drop_duplicates(inplace=True)

# Check the number of rows after removing duplicates
print("Number of rows after removing duplicates:", len(test_data))

Number of rows before removing duplicates: 42
Number of rows after removing duplicates: 42


# Feature Selection

In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


# Separate the features (X) and target variable (y)
X = test_data.drop('prognosis', axis=1)  # Replace 'target_column' with the actual column name
y = test_data['prognosis']

# Create an instance of the model you want to use for feature selection
model = LogisticRegression()

# Create an instance of the Recursive Feature Elimination (RFE) selector
selector = RFE(model, n_features_to_select=20)  # Specify the number of features to select

# Fit the selector on the data
selector.fit(X, y)

# Get the selected features
selected_features = X.columns[selector.support_]

# Print the selected features
print("Selected Features:")
print(selected_features)


Selected Features:
Index(['itching', 'skin_rash', 'chills', 'joint_pain', 'vomiting', 'fatigue',
       'lethargy', 'cough', 'high_fever', 'sweating', 'headache',
       'yellowish_skin', 'nausea', 'loss_of_appetite', 'abdominal_pain',
       'diarrhoea', 'yellowing_of_eyes', 'chest_pain', 'excessive_hunger',
       'irritability'],
      dtype='object')


# Feature Importance

In [9]:
from sklearn.ensemble import RandomForestClassifier



# Separate the features (X) and target variable (y)
X = train_data.drop('prognosis', axis=1)  # Replace 'target_column' with the actual column name
y = train_data['prognosis']

# Create an instance of the Random Forest classifier
model = RandomForestClassifier()

# Train the Random Forest model
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort the features by importance in descending order
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("Feature Importances:")
print(feature_importances)


Feature Importances:
                 Feature  Importance
97           muscle_pain    0.023552
25            high_fever    0.019990
0                itching    0.019555
11              vomiting    0.019414
43     yellowing_of_eyes    0.019103
..                   ...         ...
126  silver_like_dusting    0.002859
16               anxiety    0.002592
51     throat_irritation    0.002194
70   puffy_face_and_eyes    0.001039
45        fluid_overload    0.000000

[132 rows x 2 columns]


In [10]:
# Split the training data into features (X) and target variable (y)
X_train = train_data.drop('prognosis', axis=1)  # Replace 'target_column' with the actual column name
y_train = train_data['prognosis']

# Split the testing data into features (X) and target variable (y)
X_test = test_data.drop('prognosis', axis=1)  # Replace 'target_column' with the actual column name
y_test = test_data['prognosis']

# Model Building

# Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

# Matthews Correlation Coefficient

In [12]:
from sklearn.metrics import matthews_corrcoef
# Make predictions on the testing data
predicted_labels = model.predict(X_test)

# Obtain the ground truth labels for the testing data
ground_truth_labels = y_test

# Calculate the Matthews correlation coefficient
mcc = matthews_corrcoef(ground_truth_labels, predicted_labels)

# Print the MCC score
print("Matthews Correlation Coefficient (MCC):", mcc)

Matthews Correlation Coefficient (MCC): 0.9761627906976744


# Logistic Regression

In [13]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
modellog = LogisticRegression()
modellog.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.9761904761904762


In [14]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

# Support Vector Classifier

In [15]:
# Support Vector Classifier (SVC) 
from sklearn.svm import SVC
modelsvc = SVC()
modelsvc.fit(X_train, y_train)
y_pred = modelsvc.predict(X_test)
from sklearn.metrics import accuracy_score 
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  1.0


In [23]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

# K-Nearest Neighbors (KNN) classifier

In [17]:
# K-Nearest Neighbors (KNN) classifier
from sklearn.neighbors import KNeighborsClassifier
modelknn = KNeighborsClassifier(n_neighbors=3)
modelknn.fit(X_train, y_train)
y_pred = modelknn.predict(X_test)
from sklearn.metrics import accuracy_score #0.97
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  1.0


In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

# Gaussian Naive Bayes classifier

In [19]:
# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [20]:
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

# Cross validation (Stratified K-Fold)

In [21]:
# Cross validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import svm


# X represents the features (symptoms) and y represents the target (prognosis)
# Assign the features (symptoms) to X and the target (prognosis) to y
X = train_data.iloc[:, :-1]  # Assuming the features are in columns except the last one
y = train_data.iloc[:, -1]   # Assuming the target variable is in the last column

# Create an instance of the classifier you want to evaluate
clf = svm.SVC(kernel='linear', C=1, random_state=42)

# Create an instance of StratifiedKFold with the desired number of folds
k = 5
skf = StratifiedKFold(n_splits=k)

# Perform stratified k-fold cross-validation
accuracy_scores = cross_val_score(clf, X, y, cv=skf)

# Print the accuracy scores for each fold
for fold, accuracy in enumerate(accuracy_scores):
    print(f"Fold {fold+1}: Accuracy = {accuracy}")

# Print the mean and standard deviation of the accuracy scores
print(f"\nMean Accuracy: {accuracy_scores.mean()}")
print(f"Standard Deviation: {accuracy_scores.std()}")


Fold 1: Accuracy = 1.0
Fold 2: Accuracy = 1.0
Fold 3: Accuracy = 1.0
Fold 4: Accuracy = 1.0
Fold 5: Accuracy = 1.0

Mean Accuracy: 1.0
Standard Deviation: 0.0


# Hyperparameter Tuning

In [22]:
from sklearn.model_selection import GridSearchCV


# Define the model
model = RandomForestClassifier()

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

# Evaluate the model on the testing set
accuracy = best_model.score(X_test, y_test)

# Print the best hyperparameters and accuracy
print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9761904761904762
