# 분류 분석 데이터 (유방암 양성/악성 예측)

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import simplefilter
# Suppress FutureWarning messages so they do not clutter the cell outputs.
simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('wdbc.csv')
# Last expression of the cell: displays (n_rows, n_columns) as a sanity check.
df.shape

In [None]:
# Count missing cells: per-column NaN counts first, then their grand total.
# The total is the cell's displayed output.
nan_per_column = df.isna().sum()
nan_per_column.sum()

In [None]:
# Predictors: every column except the label ('diagnosis') and the 'ID' column.
X = df.drop(columns=['diagnosis', 'ID'])
# Binary target as a NumPy array: 1 where diagnosis == "M", 0 for anything else.
# NOTE(review): "M" presumably stands for malignant — confirm against the
# dataset description.
y = np.where(df['diagnosis'] == "M", 1, 0)
# Predictor names, kept for later reference.
xvar = X.columns

## Random Forest 방법을 이용한 feature selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest on all predictors; random_state is fixed so
# the importances are reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)  # fit() trains in place and returns the same estimator
# Displayed as the cell output: one importance value per column of X.
clf.feature_importances_

In [None]:
# Pair each predictor name with its random-forest importance.
# Rows stay in the same order as X.columns.
coef = pd.DataFrame(
    {
        "Features": X.columns,
        "Importance": clf.feature_importances_,
    }
)
# Display only: sort_values returns a sorted copy, `coef` itself is unchanged.
coef.sort_values(by='Importance', ascending=False)

In [None]:
# Keep only the predictors whose random-forest importance exceeds the cutoff.
#
# The selection goes through the feature *names* stored in `coef` instead of
# applying coef's boolean mask positionally to X.columns. A positional mask
# silently picks the wrong columns if `coef` is ever re-sorted or re-indexed;
# matching by name does not. Column order follows X.columns.
IMPORTANCE_CUTOFF = 0.07
selected_names = coef.loc[coef['Importance'] > IMPORTANCE_CUTOFF, 'Features']
xvar1 = X.columns[X.columns.isin(selected_names)]
X1 = X[xvar1]

## SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import roc_auc_score 
def data_split(x, y, test_size=0.5, random_state=1234):
    """Split features and labels into stratified train and test sets.

    Also prints the share of events in each split, so the effect of the
    stratification can be checked by eye.

    Args:
        x: Feature table, forwarded unchanged to ``train_test_split``.
        y: Labels; also used as the stratification key. The printed ratio
            is the mean of the labels times 100, so it is only a meaningful
            percentage for 0/1-coded labels.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.5.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 1234.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # np.mean is sum/len done in one vectorised call instead of a Python loop.
    train_pct = round(np.mean(Y_train) * 100, 2)
    test_pct = round(np.mean(Y_test) * 100, 2)
    print("Ratio of the event: Training dataset-", train_pct, "%, Test dataset -",
          test_pct, "%")
    return X_train, X_test, Y_train, Y_test

#### Train / Test data 생성

In [None]:
# Split the selected features and the labels with data_split's default settings.
X_train, X_test, y_train, y_test = data_split(X1, y)

#### SVM fitting

In [None]:
# Linear-kernel SVM: fit on the training split, then show test accuracy.
# probability=True enables predict_proba, which the ROC cell relies on.
model_svm = SVC(kernel='linear', probability=True, random_state=0)
model_svm.fit(X_train, y_train)
linear_svm = model_svm  # fit() returns the estimator itself
linear_pred = linear_svm.predict(X_test)
accuracy_score(y_test, linear_pred)

In [None]:
# RBF-kernel SVM: fit on the training split, then show test accuracy.
# probability=True enables predict_proba, which the ROC cell relies on.
model_svm = SVC(kernel='rbf', probability=True, random_state=0)
model_svm.fit(X_train, y_train)
rbf_svm = model_svm  # fit() returns the estimator itself
rbf_pred = rbf_svm.predict(X_test)
accuracy_score(y_test, rbf_pred)

In [None]:
# Polynomial-kernel SVM: fit on the training split, then show test accuracy.
# max_iter=1000 puts a hard cap on solver iterations; the original author
# flagged this fit as slow.
# NOTE(review): with the cap in place the solver may stop before converging —
# confirm the reported accuracy is from a converged model.
model_svm = SVC(kernel='poly', probability=True, random_state=0, max_iter=1000)
model_svm.fit(X_train, y_train)
poly_svm = model_svm  # fit() returns the estimator itself
poly_pred = poly_svm.predict(X_test)
accuracy_score(y_test, poly_pred)

#### ROC curve와 AUC

In [None]:
# ROC curves and AUC of the three fitted SVMs on the test split.
#
# Column 1 of predict_proba is the probability of label 1: scikit-learn orders
# the columns by sorted class label and the labels here are 0/1.
# Each model is scored once and the scores are reused for the curve and the
# AUC, instead of calling predict_proba twice per model.
prob_linear = linear_svm.predict_proba(X_test)[:, 1]
prob_rbf = rbf_svm.predict_proba(X_test)[:, 1]
prob_poly = poly_svm.predict_proba(X_test)[:, 1]

# False-positive rate (1 - specificity), true-positive rate (sensitivity)
# and the matching decision thresholds.
fpr1, tpr1, th1 = roc_curve(y_test, prob_linear)
fpr2, tpr2, th2 = roc_curve(y_test, prob_rbf)
fpr3, tpr3, th3 = roc_curve(y_test, prob_poly)

# AUROC
roc_auc1 = roc_auc_score(y_test, prob_linear)
roc_auc2 = roc_auc_score(y_test, prob_rbf)
roc_auc3 = roc_auc_score(y_test, prob_poly)

# Curve plotting
plt.figure()
plt.plot(fpr1, tpr1, label='Linear SVM (area = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, label='RBF SVM (area = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, label='Poly SVM (area = %0.2f)' % roc_auc3)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line (FPR == TPR)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()