## IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

## IMPORTING AND ANALYSING THE DATASET

In [None]:
# Load the training and testing tables from CSV files in the working directory.
trainData, testData = (
    pd.read_csv(csv_path) for csv_path in ('Training.csv', 'Testing.csv')
)

In [None]:
# Discard the last column of the training frame by position.
# NOTE(review): presumably a trailing junk column in Training.csv - confirm.
# Re-running this cell removes one more column each time.
trainData = trainData.iloc[:, :-1]

# Keep every remaining column name except the final one in train_names,
# and print the final one.
*train_names, last_column = trainData.columns
print(last_column)

# Column dtypes / non-null summary for both frames.
for frame in (trainData, testData):
    print(frame.info())

In [None]:
# (rows, columns) of the training frame, then of the testing frame.
for frame in (trainData, testData):
    print(frame.shape)

In [None]:
trainData.isnull().sum()  # number of missing values in each training column

In [None]:
from collections import Counter

# Tally how many training rows carry each 'prognosis' value and list the tallies.
group_counts = Counter(trainData['prognosis'])
for item, count in group_counts.items():
    print(item, ": ", count)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the 'prognosis' values with integer codes.
le = LabelEncoder()
# Learn the value -> code mapping from the training labels and apply it.
trainData['prognosis'] = le.fit_transform(trainData['prognosis'])
# Reuse the mapping learned above (no refit) so both frames share one encoding.
testData['prognosis'] = le.transform(testData['prognosis'])

## SPLITTING TRAINING SET AND TEST SET

In [None]:
# Features are every column except 'prognosis'; the target is that column.
X_train, y_train = trainData.drop(columns=['prognosis']), trainData['prognosis']
X_test, y_test = testData.drop(columns=['prognosis']), testData['prognosis']

In [None]:
# Shapes of the train features/target followed by the test features/target.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [None]:
# Show the encoded 'prognosis' values of the test set.
print(y_test)

# TRAINING MODEL

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed seed, fitted on the training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

### K-Nearest Neighbours (KNN)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# classifier.fit(X_train, y_train)

### SVM

In [None]:
# from sklearn.svm import SVC
# classifier = SVC(kernel = 'linear', random_state = 0)
# classifier.fit(X_train, y_train)

### Kernel SVM

In [None]:
# from sklearn.svm import SVC
# classifier = SVC(kernel = 'rbf', random_state = 0)
# classifier.fit(X_train, y_train)

### Naive Bayes

In [None]:
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)

### Decision Tree

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
# classifier.fit(X_train, y_train)

### Random Forest

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# classifier.fit(X_train, y_train)

### XGBOOST

In [None]:
# from xgboost import XGBClassifier
# classifier = XGBClassifier()
# classifier.fit(X_train, y_train)

## PREDICTING TEST SET

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the encoded class of every test row.
y_pred = classifier.predict(X_test)

# Print each prediction next to the true encoded label.  The values are paired
# by position with zip: y_test is a pandas Series, so y_test[i] would be a
# lookup by index label and only lines up when the index is exactly 0..n-1.
for predicted, actual in zip(y_pred, y_test):
    print(predicted, ":  ", actual)

In [None]:
# Confusion matrix of true (rows) versus predicted (columns) encoded labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Evaluate the classifier on the test set and print each summary score.
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

# Number of test rows whose predicted label differs from the true label.
n_errors = (y_pred != y_test).sum()
print(f"No of incorrect: {n_errors}")

# Fraction of test rows predicted correctly.
acc = accuracy_score(y_test, y_pred)
print("The accuracy is {}".format(acc))

# Precision, recall and F1 are macro-averaged (unweighted mean of the per-class
# scores).  With one label per row, micro-averaged precision, recall and F1 are
# all equal to the accuracy, so they would only repeat the number printed above.

# Per class: of the rows predicted as that class, the fraction that truly are.
prec = precision_score(y_test, y_pred, average='macro')
print("The precision is {}".format(prec))

# Per class: of the rows that truly are that class, the fraction predicted so.
rec = recall_score(y_test, y_pred, average='macro')
print("The recall is {}".format(rec))

# Per class: harmonic mean of that class's precision and recall.
f1 = f1_score(y_test, y_pred, average='macro')
print("The F1-Score is {}".format(f1))

MCC = matthews_corrcoef(y_test, y_pred)
print("The Matthews correlation coefficient is {}".format(MCC))

In [None]:
# Axis labels must follow the encoder's class order: the integer codes in
# y_test / y_pred index into le.classes_, and the confusion matrix orders its
# rows and columns by those codes.  (group_counts lists the class names in
# order of first appearance in the training data, which need not match.)
LABELS = list(le.classes_)

# Request one row/column per known class so the matrix always lines up with
# LABELS, even when a class is absent from both y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.arange(len(LABELS)))

plt.figure(figsize=(10, 10))
sb.heatmap(conf_matrix, xticklabels=LABELS,
           yticklabels=LABELS, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Score the classifier on the training data with 3-fold cross-validation.
accuracies = cross_val_score(classifier, X_train, y_train, cv=3)

# Report the best, worst, mean and spread of the per-fold scores as percentages.
fold_summary = (
    ("Max accuracy: {:.5f} %", max(accuracies)),
    ("Min accuracy: {:.5f} %", min(accuracies)),
    ("Accuracy: {:.5f} %", accuracies.mean()),
    ("Standard Deviation: {:.5f} %", accuracies.std()),
)
for template, value in fold_summary:
    print(template.format(value*100))

## PREDICTING SINGLE VALUE

In [None]:
# List the column names collected in train_names; the single-value prediction
# below matches symptom names against this list.
print(train_names)

In [None]:
print()
# Symptoms reported for the single case to classify.  Each entry should be one
# of the feature column names held in train_names.
l1 = ["lack_of_concentration", "watering_from_eyes"]

# Report names that match no feature column instead of dropping them silently.
unknown = [name for name in l1 if name not in train_names]
if unknown:
    print("Unknown symptoms ignored: ", unknown)

# Indicator row over the feature columns: 1 where the column is a reported
# symptom, 0 elsewhere.  Set membership makes the result independent of the
# order in which the symptoms are listed in l1.
selected = set(l1)
l2 = [1 if name in selected else 0 for name in train_names]

# Single-row frame that carries the column names, because the classifier was
# fitted on a DataFrame rather than on a bare list.
l = pd.DataFrame([l2], columns=train_names)

print("Symtops: ", l1)
print("Predicted Disease: ")
# Map the predicted integer code back to the original disease name.
print(le.inverse_transform((classifier.predict(l))))