#### Pipeline

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_score, StratifiedKFold

In [2]:
def raman_ml_pipeline(X, y):
    """Benchmark XGBoost, logistic regression, random forest and SVM on one dataset.

    Every model is scored in two ways:

    * ``accuracy`` -- fitted on a 70 % training split and scored on the
      remaining 30 % hold-out split (``random_state=7``; the split is not
      stratified).
    * ``cvs_mean`` -- mean accuracy of a shuffled 5-fold ``StratifiedKFold``
      cross-validation run on the full ``X``/``y``.

    Parameters
    ----------
    X
        Feature table passed unchanged to ``train_test_split`` and
        ``cross_val_score``.
    y
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    pandas.DataFrame
        Two rows (``'accuracy'``, ``'cvs_mean'``) and four columns
        (``'xgboost'``, ``'logreg'``, ``'random forest'``, ``'SVM'``).
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=.3, random_state=7)

    # Insertion order of this mapping defines the column order of the result.
    models = {
        'xgboost': XGBClassifier(),
        'logreg': LogisticRegression(random_state=17, solver='lbfgs', max_iter=1000),
        'random forest': RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=7),
        'SVM': svm.SVC(gamma='scale'),
    }

    # One splitter shared by all models so they are compared on identical folds.
    skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    holdout_scores = {}
    cv_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        holdout_scores[name] = accuracy_score(y_holdout, model.predict(X_holdout))
        # cross_val_score is given the full data set, not just the training split.
        cv_scores[name] = cross_val_score(
            model, X, y, scoring='accuracy', cv=skf).mean()

    # Build the two-row table in one step; DataFrame.append (used previously)
    # no longer exists in pandas >= 2.0.
    return pd.DataFrame([holdout_scores, cv_scores], index=['accuracy', 'cvs_mean'])

In [24]:
# Read the four CSV files (one per measurement site) from the
# relative "datasets/" directory into separate DataFrames.
df_earLobe, df_innerArm, df_thumbNail, df_vein = map(
    pd.read_csv,
    (
        "datasets/earLobe.csv",
        "datasets/innerArm.csv",
        "datasets/thumbNail.csv",
        "datasets/vein.csv",
    ),
)

In [25]:
# Move the 'patientID' column out of every frame into its own Series.
# pop() mutates the frame in place, so re-running this cell without
# reloading the CSVs raises KeyError.
patientID_earLobe, patientID_innerArm, patientID_thumbNail, patientID_vein = (
    frame.pop('patientID')
    for frame in (df_earLobe, df_innerArm, df_thumbNail, df_vein)
)

In [26]:
# Remove the row with index label 0 from every frame, in place.
# NOTE(review): presumably row 0 is not a patient sample -- confirm against the CSVs.
# Not idempotent: a second run raises KeyError because label 0 is already gone.
for frame in (df_earLobe, df_innerArm, df_thumbNail, df_vein):
    frame.drop(index=0, inplace=True)

In [27]:
# Separate the 'has_DM2' target column (y_*) from the remaining
# feature columns (X_*) for each measurement site.
y_earLobe = df_earLobe['has_DM2']
X_earLobe = df_earLobe.drop(columns='has_DM2')

y_innerArm = df_innerArm['has_DM2']
X_innerArm = df_innerArm.drop(columns='has_DM2')

y_thumbNail = df_thumbNail['has_DM2']
X_thumbNail = df_thumbNail.drop(columns='has_DM2')

y_vein = df_vein['has_DM2']
X_vein = df_vein.drop(columns='has_DM2')

In [28]:
# Hold-out and cross-validated accuracy of all four models on the ear-lobe data.
raman_ml_pipeline(X=X_earLobe, y=y_earLobe)

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.833333,0.666667,0.666667,0.666667
cvs_mean,0.613333,0.436667,0.536667,0.553333


In [29]:
# Hold-out and cross-validated accuracy of all four models on the inner-arm data.
raman_ml_pipeline(X=X_innerArm, y=y_innerArm)

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.666667,0.833333,0.5,0.333333
cvs_mean,0.436667,0.486667,0.386667,0.543333


In [30]:
# Hold-out and cross-validated accuracy of all four models on the thumbnail data.
raman_ml_pipeline(X=X_thumbNail, y=y_thumbNail)

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.5,0.333333,0.333333,0.5
cvs_mean,0.366667,0.216667,0.476667,0.553333


In [31]:
# Hold-out and cross-validated accuracy of all four models on the vein data.
raman_ml_pipeline(X=X_vein, y=y_vein)

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.666667,0.666667,0.666667,0.666667
cvs_mean,0.623333,0.503333,0.503333,0.593333
