In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.decomposition import PCA

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.feature_selection import VarianceThreshold

from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw dermatology table from the working directory and display it.
df = pd.read_csv(filepath_or_buffer='dermatology.csv')
df

In [None]:
# Count the missing entries in every column before any imputation.
df.isnull().sum()

In [None]:
# Inspect the raw values of the age column.
df["age"].values

In [None]:
# Fill the missing ages with the most frequent age value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The imputer expects a 2-D input, hence the single-column reshape.
age_column = np.array(df.age).reshape(-1, 1)

# Reuse df's index so that assigning this frame back to `df.age` aligns row by
# row even when `df` does not carry a default RangeIndex.
age_imp = pd.DataFrame(imp.fit_transform(age_column), index=df.index)

In [None]:
# Replace the age column with its imputed counterpart.
df["age"] = age_imp

In [None]:
# Re-count the missing entries per column after the age imputation.
df.isnull().sum()

In [None]:
# Number of non-null entries per column for each target class.
df.groupby(by='class').count()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap=plt.cm.PuBu,
    ax=ax,
)
plt.show()

In [None]:
# Separate the target column from the feature columns.
y = df['class']
x = df.drop(columns=['class'])
x.shape

In [None]:
# Drop constant features (zero variance). VarianceThreshold is already
# imported in the import cell, so it is not re-imported here.
select = VarianceThreshold(threshold=0)
x = select.fit_transform(x)
x.shape

In [None]:
# Encode the class labels as consecutive integers starting at 0.
# LabelEncoder is already imported in the import cell, so it is not
# re-imported here.
le = LabelEncoder()
y = le.fit_transform(y)

Train/test split

In [None]:
# `y` was already label-encoded in the previous cell. Fitting a second
# LabelEncoder on the encoded values would leave `y` unchanged but replace
# `le` with an identity mapping, losing the link back to the original class
# labels. The fitted encoder is therefore only inspected here.
le.classes_

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible. train_test_split is already imported in the import cell.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=15,
)

Model 1: XGBoost

In [None]:
# Hyper-parameter grid explored for the XGBoost classifier.
# GridSearchCV is already imported in the import cell.
PARAMETERS = {
    "max_depth": [5, 10],
    "min_child_weight": [2, 10],
    "learning_rate": [0.01, 0.1],
    "n_estimators": [100, 200],
}

# Exhaustive search over the grid, scored by 3-fold cross-validated accuracy.
xgb_model = xgb.XGBClassifier()
model_gs = GridSearchCV(xgb_model, param_grid=PARAMETERS, cv=3, scoring="accuracy")
model_gs.fit(xtrain, ytrain)

print(model_gs.best_params_)

In [None]:
# Train XGBoost with fixed hyper-parameters.
# NOTE(review): presumably these are the values reported by the grid search
# above — confirm against `model_gs.best_params_`.
xgb_params = dict(
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=2,
    n_estimators=100,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(xtrain, ytrain)

In [None]:
# Evaluate the fitted XGBoost model on the held-out test split.
# accuracy_score and classification_report are already imported in the
# import cell.
ypred = xgb_model.predict(xtest)
print(accuracy_score(ytest, ypred))
print(classification_report(ytest, ypred))

In [None]:
# Keep as many principal components as are needed to explain 95% of the
# variance. PCA is fitted on the training split only and then applied to the
# test split.
# NOTE(review): the features are not standardised before PCA, so columns with
# a larger numeric range contribute more of the variance — confirm whether
# scaling was intended.
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(xtrain)
X_test_reduced = pca.transform(xtest)

# Fit a separate estimator (same hyper-parameters as `xgb_model`) so that
# `xgb_model` stays fitted on the original features and the earlier
# evaluation cell can still be re-run.
xgb_pca_model = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=100)
xgb_pca_model.fit(X_train_reduced, ytrain)

ypred_pca = xgb_pca_model.predict(X_test_reduced)
print(accuracy_score(ytest, ypred_pca))
print(classification_report(ytest, ypred_pca))

Applying PCA to this dataset before fitting the XGBoost model has a negative effect: it reduces the accuracy.


grid search:

In [None]:
# Candidate models and their hyper-parameter grids. Both lists are kept in the
# same order because they are paired position by position below.
classifier = [
    Perceptron(),
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    SGDClassifier(),
]

perceptron_param_grid = {'penalty': ['l2', 'elasticnet']}

rf_param_grid = {"max_features": [1, 10],
                 "min_samples_split": [50],
                 "bootstrap": [True],
                 "n_estimators": [100, 300]}

# NOTE(review): the default LogisticRegression solver does not accept "l1";
# those candidates are expected to fail during fitting — confirm whether a
# different solver was intended.
logreg_param_grid = {"C": np.logspace(-4, 4, 20),
                     "penalty": ["l1", "l2", "none"]}

knn_param_grid = {"n_neighbors": np.linspace(2, 20, 12, dtype=int).tolist(),
                  "weights": ["uniform", "distance"],
                  "metric": ["euclidean", "manhattan", "minkowski"],
                  "leaf_size": [1, 10, 30]}

sgdc_param_grid = {
    "loss": ["log"],
    "alpha": [0.0001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"]}

classifier_param = [perceptron_param_grid,
                    rf_param_grid,
                    logreg_param_grid,
                    knn_param_grid,
                    sgdc_param_grid]

# One entry per model, appended in the order of `classifier`.
cv_result = []
best_estimators = []
mean_squared_errors = []
roc_auc_scores = []  # kept for compatibility; never filled in this cell
recall_scores = []
precision_scores = []
f1_scores = []


for i, (estimator, param_grid) in enumerate(zip(classifier, classifier_param)):
    print("---------------------------------------------------------------------------")
    clf = GridSearchCV(estimator,
                       param_grid=param_grid,
                       cv=StratifiedKFold(n_splits=5),
                       scoring="accuracy",
                       n_jobs=-1, verbose=2)

    clf.fit(xtrain, ytrain)

    # Predict once on the held-out split and reuse the result for every metric.
    test_pred = clf.predict(xtest)

    # "Accuracy" is the best cross-validated score on the training split; the
    # remaining metrics are computed on the test split.
    cv_result.append(clf.best_score_)
    mean_squared_errors.append(mean_squared_error(ytest, test_pred))
    recall_scores.append(recall_score(ytest, test_pred, average='weighted'))
    precision_scores.append(precision_score(ytest, test_pred, average='weighted'))
    f1_scores.append(f1_score(ytest, test_pred, average='weighted'))
    best_estimators.append(clf.best_estimator_)

    print("Model: {}".format(estimator))
    print("Accuracy: %{}".format(round(cv_result[i]*100,2)))
    print("MSE: {}".format(mean_squared_errors[i]))
    print("Recall: {}".format(recall_scores[i]))
    print("Precision: {}".format(precision_scores[i]))
    print("F1-Score: {}".format(f1_scores[i]))
    print("Best Estimator: {}".format(clf.best_estimator_))

print("---------------------------------------------------------------------------")

sns.set_style("darkgrid")

# Row labels are derived from the estimators themselves so they cannot drift
# out of sync with the `classifier` list.
cv_results = pd.DataFrame({"Accuracy": cv_result,
                           "MSE": mean_squared_errors,
                           "Recall": recall_scores,
                           "Precision": precision_scores,
                           "F1-Score": f1_scores,
                           "Models": [type(model).__name__ for model in classifier]})

cv_results = cv_results.set_index("Models")

f, ax = plt.subplots(figsize=(14, 10))

sns.heatmap(cv_results, annot=True, cmap="Blues", fmt='.3f',
            ax=ax, linewidths=5, cbar=False,
            annot_kws={"size": 18})

plt.xticks(size=18)
plt.yticks(size=18, rotation=0)
plt.ylabel("Models")
plt.title("Grid Search Results", size=16)
plt.show()

In [None]:
# Best estimator found by the grid search for each model, in the order of `classifier`.
best_estimators

grid search with PCA

In [None]:
# Candidate models and their hyper-parameter grids. Both lists are kept in the
# same order because they are paired position by position below.
classifier = [
    Perceptron(),
    RandomForestClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    SGDClassifier(),
]

perceptron_param_grid = {'penalty': ['l2', 'elasticnet']}

rf_param_grid = {"max_features": [1, 10],
                 "min_samples_split": [50],
                 "bootstrap": [True],
                 "n_estimators": [100, 300]}

# NOTE(review): the default LogisticRegression solver does not accept "l1";
# those candidates are expected to fail during fitting — confirm whether a
# different solver was intended.
logreg_param_grid = {"C": np.logspace(-4, 4, 20),
                     "penalty": ["l1", "l2", "none"]}

knn_param_grid = {"n_neighbors": np.linspace(2, 20, 12, dtype=int).tolist(),
                  "weights": ["uniform", "distance"],
                  "metric": ["euclidean", "manhattan", "minkowski"],
                  "leaf_size": [1, 10, 30]}

sgdc_param_grid = {
    "loss": ["log"],
    "alpha": [0.0001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"]}

classifier_param = [perceptron_param_grid,
                    rf_param_grid,
                    logreg_param_grid,
                    knn_param_grid,
                    sgdc_param_grid]

# One entry per model, appended in the order of `classifier`.
cv_result = []
best_estimators = []
mean_squared_errors = []
roc_auc_scores = []  # kept for compatibility; never filled in this cell
recall_scores = []
precision_scores = []
f1_scores = []


for i, (estimator, param_grid) in enumerate(zip(classifier, classifier_param)):
    print("---------------------------------------------------------------------------")
    clf = GridSearchCV(estimator,
                       param_grid=param_grid,
                       cv=StratifiedKFold(n_splits=5),
                       scoring="accuracy",
                       n_jobs=-1, verbose=2)

    # Same search as before, but on the PCA-reduced features.
    clf.fit(X_train_reduced, ytrain)

    # Predict once on the held-out split and reuse the result for every metric.
    test_pred = clf.predict(X_test_reduced)

    # "Accuracy" is the best cross-validated score on the training split; the
    # remaining metrics are computed on the test split.
    cv_result.append(clf.best_score_)
    mean_squared_errors.append(mean_squared_error(ytest, test_pred))
    recall_scores.append(recall_score(ytest, test_pred, average='weighted'))
    precision_scores.append(precision_score(ytest, test_pred, average='weighted'))
    f1_scores.append(f1_score(ytest, test_pred, average='weighted'))
    best_estimators.append(clf.best_estimator_)

    print("Model: {}".format(estimator))
    print("Accuracy: %{}".format(round(cv_result[i]*100,2)))
    print("MSE: {}".format(mean_squared_errors[i]))
    print("Recall: {}".format(recall_scores[i]))
    print("Precision: {}".format(precision_scores[i]))
    print("F1-Score: {}".format(f1_scores[i]))
    print("Best Estimator: {}".format(clf.best_estimator_))

print("---------------------------------------------------------------------------")

sns.set_style("darkgrid")

# Row labels are derived from the estimators themselves so they cannot drift
# out of sync with the `classifier` list.
cv_results = pd.DataFrame({"Accuracy": cv_result,
                           "MSE": mean_squared_errors,
                           "Recall": recall_scores,
                           "Precision": precision_scores,
                           "F1-Score": f1_scores,
                           "Models": [type(model).__name__ for model in classifier]})

cv_results = cv_results.set_index("Models")

f, ax = plt.subplots(figsize=(14, 10))

sns.heatmap(cv_results, annot=True, cmap="Blues", fmt='.3f',
            ax=ax, linewidths=5, cbar=False,
            annot_kws={"size": 18})

plt.xticks(size=18)
plt.yticks(size=18, rotation=0)
plt.ylabel("Models")
plt.title("Grid Search Results with feature selection with PCA", size=16)
plt.show()

In [None]:
# Best estimator found by the PCA grid search for each model, in the order of `classifier`.
best_estimators

The highest accuracy on this dataset is achieved by the LogisticRegression algorithm, at about 98%,
and PCA does not help at all.

In [None]:
# Learning curve of the logistic-regression model: 5-fold cross-validated
# error for 40 training-set sizes between 1% and 100% of the training split.
# NOTE(review): RMSE is computed on the integer-encoded class labels, which
# treats the classes as ordinal — confirm this is the intended error measure.
train_sizes, train_scores, valid_scores = learning_curve(
    LogisticRegression(C=0.23357214690901212),
    xtrain,
    ytrain,
    train_sizes=np.linspace(0.01, 1.0, 40),
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# The scorer returns the negated RMSE, so flip the sign after averaging the folds.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
ax.plot(train_sizes, valid_errors, "b-", linewidth=3, label="valid")
ax.set_xlabel("Training set size")
ax.set_ylabel("RMSE")
ax.set_title("Learning curve: LogisticRegression")
# Without a legend the `label=` arguments above are never displayed.
ax.legend()
plt.show()

It shows that the validation (test) error decreases as the number of training instances increases, so our model performs well and is not overfitting.

In [None]:
# AdaBoost ensemble of 200 shallow decision trees, fitted on the original features.
base_tree = DecisionTreeClassifier(max_depth=3, min_samples_split=10)
ada_clf = AdaBoostClassifier(
    base_tree,
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)

ada_clf.fit(xtrain, ytrain)

In [None]:
# Test-set accuracy of the AdaBoost ensemble.
y_pred_ada = ada_clf.predict(xtest)
accuracy_score(y_true=ytest, y_pred=y_pred_ada)

Running the voting classifier on models trained on the dimensionality-reduced (PCA) data
results in low accuracy.

In [None]:
# Base estimators for the voting and stacking ensembles below.
# NOTE(review): the hyper-parameters look like they were copied from a
# grid-search output — confirm which run they came from.
clf1 = Perceptron(penalty='elasticnet')
clf2 = RandomForestClassifier(min_samples_split=50, max_features=1)
clf3 = LogisticRegression(C=10000.0)
clf4 = KNeighborsClassifier(n_neighbors=3, metric='manhattan', leaf_size=1)
clf5 = SGDClassifier(loss='log', penalty='none', alpha=0.1)

In [None]:
# Hard-voting ensemble: each base estimator casts one vote and the majority
# class wins. `estimators` is passed as a list of (name, estimator) pairs,
# which is the documented type; a tuple of pairs is not.
vc = VotingClassifier(
    estimators=[('per', clf1), ('rf', clf2), ('lr', clf3), ('knn', clf4), ('sgd', clf5)],
    voting='hard',
    n_jobs=-1,
)

# Fitted on the PCA-reduced training features.
vc.fit(X_train_reduced, ytrain)

In [None]:
# Test-set accuracy of the voting ensemble on the PCA-reduced features.
y_pred_vc = vc.predict(X_test_reduced)
accuracy_score(y_true=ytest, y_pred=y_pred_vc)

In [None]:
# Stacking ensemble over the same base estimators; no final estimator is
# passed, so the library default is used. StackingClassifier is imported in
# the import cell.
s = StackingClassifier(
    estimators=[('per', clf1), ('rf', clf2), ('lr', clf3), ('knn', clf4), ('sgd', clf5)]
)

# Fitted on the PCA-reduced training features.
s.fit(X_train_reduced, ytrain)

In [None]:
# Test-set accuracy of the stacking ensemble on the PCA-reduced features.
y_pred_s = s.predict(X_test_reduced)
accuracy_score(y_true=ytest, y_pred=y_pred_s)