In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Path of the input CSV, relative to the notebook's working directory.
DATA_PATH = "Datasets/cleandata.csv"
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'Datasets/cleandata.csv'

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Split the column labels by dtype: 'object' columns are treated as
# categorical, every other dtype as numerical.
is_object_column = df.dtypes == 'object'
categorical_feature = df.columns[is_object_column]
numerical_feature = df.columns[~is_object_column]

In [None]:
# Labels of the columns whose dtype is 'object'.
categorical_feature

In [None]:
# Labels of the columns whose dtype is anything other than 'object'.
numerical_feature

In [None]:
# Print the distinct values found in every categorical column.
for column_name in categorical_feature:
    distinct_values = df[column_name].unique()
    print(f"{column_name} :  {distinct_values}")

In [None]:
# Distinct labels of the 'Drug' column (later used as the target) before encoding.
df['Drug'].unique()

In [None]:
# Category orderings. A label's position in its list equals the integer code
# the mapping cell assigns to it, and the same lists are handed to the
# OrdinalEncoder in the pipeline cell.
sex = [
    'F',  # 0
    'M',  # 1
]
BP = [
    'LOW',     # 0
    'NORMAL',  # 1
    'HIGH',    # 2
]
cholestrol = [
    'NORMAL',  # 0
    'HIGH',    # 1
]
Drug = [
    'drugX',  # 0
    'DrugY',  # 1
    'drugA',  # 2
    'drugB',  # 3
    'drugC',  # 4
]

In [None]:
# Integer code assigned to every category label, per column.
label_codes = {
    'Sex': {'F': 0, 'M': 1},
    'BP': {'LOW': 0, 'NORMAL': 1, 'HIGH': 2},
    'Cholesterol': {'NORMAL': 0, 'HIGH': 1},
    'Drug': {'drugX': 0, 'DrugY': 1, 'drugA': 2, 'drugB': 3, 'drugC': 4},
}

for column_name, codes in label_codes.items():
    # Keep the cell idempotent: Series.map() turns every value that is missing
    # from the dict into NaN, so mapping an already-encoded (numeric) column a
    # second time would wipe it out.
    if pd.api.types.is_numeric_dtype(df[column_name]):
        continue

    # Fail loudly on labels the mapping does not know instead of letting them
    # become NaN and surface later as an obscure estimator error.
    unknown_labels = set(df[column_name].dropna().unique()) - set(codes)
    if unknown_labels:
        raise ValueError(
            f"Column {column_name!r} contains labels without a code: {sorted(unknown_labels)}"
        )

    df[column_name] = df[column_name].map(codes)

In [None]:
# Feature matrix: every column except the 'Drug' target.
x = df.drop(columns='Drug')

In [None]:
# Target as a 1-D Series. A one-column DataFrame has shape (n, 1), which makes
# scikit-learn estimators emit a column-vector conversion warning on every fit.
y = df['Drug']

In [None]:
# Display the feature matrix.
x

In [None]:
# Display the target.
y

In [None]:
# Pipelines

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent label, then
# encode each label as its position in the matching category list.
# NOTE(review): the category lists include `Drug`, so the target column is
# encoded alongside the features here - confirm this is intended.
category_orders = [sex, BP, cholestrol, Drug]
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoding', OrdinalEncoder(categories=category_orders)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
# NOTE(review): no cell shown in this notebook fits or applies `preprocessor`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_feature),
        ('cat_pipeline', cat_pipeline, categorical_feature),
    ]
)

In [None]:
# Show the numeric pipeline (imputer -> scaler).
num_pipeline

In [None]:
# Show the categorical pipeline (imputer -> ordinal encoder).
cat_pipeline

In [None]:
# Preview the frame again; Sex, BP, Cholesterol and Drug were replaced by
# integer codes in the mapping cell.
df.head()

In [None]:
# Distinct target codes after encoding; a NaN here would mean a label was
# missing from the mapping dict.
df['Drug'].unique()

In [None]:
# First rows of the feature matrix.
x.head()

In [None]:
# First rows of the target.
y.head()

# Train/test split and standardisation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Split first, then learn the scaling statistics from the training rows only.
# Fitting the scaler on the full data set would leak the test rows' mean and
# variance into training. The split itself is unchanged: the same rows are
# selected for a given random_state / stratify regardless of feature values.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.20, stratify=y, random_state=42
)

scalar = StandardScaler()
x_train = scalar.fit_transform(x_train_raw)  # fit + transform on training rows
x_test = scalar.transform(x_test_raw)        # reuse the training statistics

# Whole feature matrix scaled with the training statistics; kept because the
# name existed before this change.
x_standard = scalar.transform(x)

In [None]:
# stratify=y keeps the class proportions of the target the same in the training and test splits

In [None]:
# Shapes of the training and test feature matrices (80/20 split).
x_train.shape, x_test.shape

In [None]:
# Shapes of the training and test targets; row counts should match the cell above.
y_train.shape, y_test.shape

# Testing Datasets

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a single decision tree on the unscaled features
# (default scoring, i.e. the classifier's accuracy). random_state pins the
# tree's internal randomness so the fold scores are repeatable across runs.
score = cross_val_score(DecisionTreeClassifier(random_state=42), x, y, cv=10)

In [None]:
# Per-fold scores of the 10-fold cross-validation.
score

In [None]:
# Average score over the ten folds.
score.mean()

In [None]:
from sklearn.ensemble import BaggingClassifier

# Ensemble of 100 decision trees, each trained on a draw of 80% of the
# training rows; oob_score=True additionally scores the ensemble on the rows
# each tree did not see.
base_tree = DecisionTreeClassifier()
bagging_model = BaggingClassifier(
    estimator=base_tree,
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)
bagging_model.fit(x_train, y_train)

In [None]:
# Out-of-bag score of the fitted bagging ensemble (available because oob_score=True).
bagging_model.oob_score_

# Multi-Model Training

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Candidate estimators for the model comparison. Every estimator that draws
# random numbers gets an explicit random_state so that a fresh
# "Restart & Run All" reproduces the same scores.
support_vector = SVC(kernel="sigmoid", gamma=1)
Kneighbars = KNeighborsClassifier()
# Not part of the comparison dict below. MultinomialNB rejects negative
# inputs, which standardised features contain.
multi_nominal = MultinomialNB()
decision_tree = DecisionTreeClassifier(
    max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42
)
logistic = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
random_forest = RandomForestClassifier(random_state=42)
ada_boost = AdaBoostClassifier(n_estimators=80, random_state=42)
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=80,
    max_samples=0.8,
    oob_score=True,
    random_state=42,
)
Gradient_boost = GradientBoostingClassifier(n_estimators=80, random_state=42)
# xg_boost = XGBClassifier(n_estimators = 100, random_state= 42)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
from sklearn.pipeline import make_pipeline
# Level-0 learners of the stacked ensemble. The estimators that use random
# numbers are seeded so the stacked model is reproducible between runs.
base_learner = [
    # probability=True enables predict_proba on the SVC; its internal
    # calibration is randomised, hence the seed.
    ('support_vector', SVC(probability=True, random_state=42)),
    ('logistic', LogisticRegression()),
    ('random_forest', RandomForestClassifier(random_state=42)),
    ('Kneighbars', KNeighborsClassifier()),
]
# A decision tree combines the base learners' outputs into the final prediction.
stacking = StackingClassifier(
    estimators=base_learner,
    final_estimator=DecisionTreeClassifier(random_state=42),
)

In [None]:
# Display name -> estimator, in the order the models are evaluated and reported.
classification = dict([
    ('Support Vector Classifier', support_vector),
    ('K-Neighbors Classifier', Kneighbars),
    ('Decision Tree Classifier', decision_tree),
    ('Logistic Regression', logistic),
    ('Random Forest Classifier', random_forest),
    ('AdaBoost Classifier', ada_boost),
    ('Bagging Classifier', bagging),
    ('Stacking', stacking),
    ('Gradient Boosting Classifier', Gradient_boost),
])

In [None]:

def train_classifier(classification, X_train, y_train, X_test, y_test):
    """Cross-validate, fit and evaluate a single classifier.

    Parameters
    ----------
    classification : estimator
        Unfitted scikit-learn classifier. It is fitted in place on the
        training data, so the caller can keep using it afterwards.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Hold-out features and labels used for the final metrics.

    Returns
    -------
    tuple
        ``(accuracy, precision, matrix, accuracy_cv, precision_cv)`` where
        ``accuracy`` and macro-averaged ``precision`` are measured on the
        hold-out set, ``matrix`` is the hold-out confusion matrix, and
        ``accuracy_cv`` / ``precision_cv`` are arrays holding the 5-fold
        cross-validation scores on the training data.
    """
    # Local import: the notebook's import cells only bring in cross_val_score.
    from sklearn.model_selection import cross_validate

    # Estimators expect 1-D labels; flattening an (n, 1) frame here avoids a
    # column-vector conversion warning on every fit.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    # One cross-validation pass computes both metrics. Two separate
    # cross_val_score calls would fit the model on every fold twice.
    cv_results = cross_validate(
        classification,
        X_train,
        y_train,
        cv=5,
        scoring=('accuracy', 'precision_macro'),
    )
    accuracy_cv = cv_results['test_accuracy']
    precision_cv = cv_results['test_precision_macro']

    # Final fit on the whole training set, evaluated on the hold-out set.
    classification.fit(X_train, y_train)
    y_pred = classification.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    matrix = confusion_matrix(y_test, y_pred)
    return accuracy, precision, matrix, accuracy_cv, precision_cv

In [None]:
# One entry per model, in the iteration order of `classification`.
accuracy_scores = []
precision_scores = []
accuracy_cv_scores = []
precision_cv_scores = []

for name, cls in classification.items():
    curr_accuracy, curr_precision, matrix, accuracy_cv, precision_cv = train_classifier(
        cls, x_train, y_train, x_test, y_test
    )

    print("Model name : ", name)
    print("Accuracy : ", curr_accuracy)
    print("Precision : ", curr_precision)
    print("Confusion-Matrix : ", matrix)  # label was misspelled "Confusin-Matrix"
    print("Accuracy cross validation : ", accuracy_cv)
    print("Precision cross validation : ", precision_cv, "\n")

    # Hold-out metrics are stored as-is; the per-fold arrays are reduced to
    # their mean for the summary table.
    accuracy_scores.append(curr_accuracy)
    precision_scores.append(curr_precision)
    accuracy_cv_scores.append(accuracy_cv.mean())
    precision_cv_scores.append(precision_cv.mean())

# Overall Result

In [None]:

# Summary table: one row per model, hold-out metrics next to the mean
# cross-validation metrics.
summary_columns = {
    'Algorithm': classification.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    "Accuracy CV": accuracy_cv_scores,
    "Precision CV": precision_cv_scores,
}
result_dataframe = pd.DataFrame(summary_columns)

In [None]:
# Show the model comparison table.
result_dataframe

In [None]:
# Row label of the model with the highest hold-out (test set) accuracy.
# NOTE(review): selecting on test accuracy makes that score an optimistic
# estimate; consider selecting on the 'Accuracy CV' column instead.
max_accuracy_idx  = result_dataframe['Accuracy'].idxmax()

In [None]:
# Index of the best-scoring row.
max_accuracy_idx

In [None]:
# Display name (a key of `classification`) of the best-scoring model.
based_moel_name = result_dataframe.loc[max_accuracy_idx,'Algorithm']

In [None]:
# Name of the best-scoring model.
based_moel_name

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix

In [None]:
# Evaluate the model selected from the comparison table rather than a
# hard-coded estimator, so this cell follows whichever model scored best.
best_model = classification[based_moel_name]

# np.ravel flattens an (n, 1) label frame to 1-D and is a no-op on 1-D labels.
best_model.fit(x_train, np.ravel(y_train))
y_pred = best_model.predict(x_test)

# Macro averaging weights every drug class equally regardless of its size.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
F1_score = f1_score(y_test, y_pred, average="macro")

In [None]:
# Report the hold-out metrics of the evaluated model, one per line.
metric_lines = (
    ("Accuracy   :", accuracy),
    ("Precision  :", precision),
    ("Recall     :", recall),
    ("F1-score   :", F1_score),
)
for label, value in metric_lines:
    print(label, value)

In [None]:
# Per-class precision, recall and F1 on the hold-out set. The stray
# plt.show() that followed was removed: no figure is created in this cell.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Integer class codes and their drug names, in the order used when the
# 'Drug' column was encoded (drugX=0 ... drugC=4).
drug_codes = [0, 1, 2, 3, 4]
drug_names = ['drugX', 'DrugY', 'drugA', 'drugB', 'drugC']

# labels= fixes the matrix at 5x5 even if a class is absent from the test
# split or from the predictions, so the tick labels always line up.
cm = confusion_matrix(y_test, y_pred, labels=drug_codes)

# Plot the confusion matrix. scikit-learn puts the true classes on the rows
# and the predicted classes on the columns, so 'Actual' is the y-axis and
# 'Prediction' the x-axis.
sns.heatmap(cm,
            annot=True,
            fmt='g',
            xticklabels=drug_names,
            yticklabels=drug_names)
plt.ylabel('Actual', fontsize=13)
plt.xlabel('Prediction', fontsize=13)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

In [None]:
# Encoded feature values of one sample, assumed to follow the column order of `x`.
input_data = (47,1,0,1,10.114)
input_data_array = np.asarray(input_data)

# One-row frame carrying x's column names, so the scaler sees the same
# feature names it was fitted with.
input_frame = pd.DataFrame([input_data], columns=x.columns)

# The models were trained on standardised features, so the sample has to go
# through the same fitted scaler before prediction. Raw values would be on a
# completely different scale.
input_data_reshaped = scalar.transform(input_frame)
print(input_data_reshaped)

In [None]:
# Every estimator from the comparison, in the same order as the results table.
ml = [
    support_vector, Kneighbars, decision_tree,
    logistic, random_forest, ada_boost,
    bagging, stacking, Gradient_boost,
]

# Print each model's predicted class code for the single sample.
for model in ml:
    predicted = model.predict(input_data_reshaped)
    print(predicted)

In [None]:
# Predict the encoded drug class of the single sample with the stacking ensemble.
prediction = stacking.predict(input_data_reshaped)
print(prediction)

In [None]:
# Translate the predicted class code into a drug name. Codes 0-3 are looked
# up explicitly and anything else falls through to "DrugC".
drug_name_by_code = {
    0: "DrugX",
    1: "DrugY",
    2: "DrugA",
    3: "DrugB",
}
# .item() unwraps the one-element prediction array into a plain scalar.
predicted_code = prediction.item()
print(drug_name_by_code.get(predicted_code, "DrugC"))

In [None]:
# 28	F	NORMAL	HIGH	7.798	drugX
# 61	F	LOW	HIGH	18.043	DrugY
# 47	M	LOW	HIGH	10.114	drugC