#  분류 분석 데이터 (불량/정상 제품)

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import simplefilter
# Ignore every FutureWarning raised from here on (applies process-wide).
simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the product dataset, then show its (rows, columns) shape followed by
# the total number of missing cells in the whole frame, one per line.
df = pd.read_csv('product.csv')
print(df.shape, df.isnull().sum().sum(), sep='\n')

In [None]:
# Target: 1 where the 'Defect' column equals "NG", 0 for any other value.
y = np.where(df['Defect'] == "NG", 1, 0)
# Predictors: every column except the target.
X = df.drop(columns='Defect')
xvar = X.columns
xvar

## Train/Test 분할

In [None]:
from sklearn.model_selection import train_test_split
def data_split(x, y, *, test_size=0.5, random_state=1234):
    """Split features and labels into stratified training and test sets.

    The split is stratified on ``y``. After splitting, the share of the
    event in each partition is printed as a percentage.

    Args:
        x: Feature data, forwarded unchanged to ``train_test_split``.
        y: Labels aligned with ``x``; also used as the stratification key.
            The printed figure is ``sum(labels) / len(labels) * 100``, so it
            is an event rate only when the labels are coded 0/1.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.5.
        random_state: Seed forwarded to ``train_test_split``. Defaults to
            1234.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )

    def event_pct(labels):
        # Percentage of positive labels, rounded to two decimals.
        return round(sum(labels) / len(labels) * 100, 2)

    print("Ratio of the event: Training dataset-", event_pct(Y_train),
          "%, Test dataset -", event_pct(Y_test), "%")
    return X_train, X_test, Y_train, Y_test

In [None]:
# Stratified train/test split of X and y using data_split's default settings.
X_train, X_test, y_train, y_test=data_split (X, y)

## Bagging 방법

In [None]:
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble (100 estimators, fixed seed) trained on the training split.
model_bag = BaggingClassifier(random_state=0, n_estimators=100)
model_bag.fit(X_train, y_train)
result_bag = model_bag  # fit() returns the estimator it was called on
# Displayed as the cell output: class probabilities for the test split.
result_bag.predict_proba(X_test)

## Random Forest 방법

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (100 trees, fixed seed) trained on the training split.
result_rf = RandomForestClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)
model_rf = result_rf  # both names refer to the same fitted estimator
# Displayed as the cell output: class probabilities for the test split.
result_rf.predict_proba(X_test)

In [None]:
# Same forest, but max_features is switched to "log2"
# (log2 of the number of predictors considered per split).
rf2_params = dict(n_estimators=100, random_state=0, max_features="log2")
model_rf2 = RandomForestClassifier(**rf2_params)
result_rf2 = model_rf2.fit(X_train, y_train)
result_rf2.predict_proba(X_test)

## AdaBoost 방법

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over depth-2 decision trees (100 boosting rounds, fixed seed).
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, whereas the first positional slot is the base
# learner under both names.
model_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                               n_estimators=100, random_state=0)
result_ada = model_ada.fit(X_train, y_train)
# Displayed as the cell output: class probabilities for the test split.
result_ada.predict_proba(X_test)

## Gradient Boosting 방법

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting (100 stages, fixed seed) trained on the training split.
model_gb = GradientBoostingClassifier(random_state=0, n_estimators=100)
model_gb.fit(X_train, y_train)
result_gb = model_gb  # fit() returns the estimator it was called on
# Displayed as the cell output: class probabilities for the test split.
result_gb.predict_proba(X_test)

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import roc_curve 
from sklearn.metrics import roc_auc_score 
# SVM with a linear kernel; probability=True so predict_proba is available.
linear_svm = SVC(kernel='linear', probability=True, random_state=0).fit(X_train, y_train)
model_svm = linear_svm  # same fitted object under both names

In [None]:
# SVM with an RBF kernel; a new SVC instance, so the linear model is untouched.
rbf_svm = SVC(kernel='rbf', probability=True, random_state=0).fit(X_train, y_train)
model_svm = rbf_svm  # model_svm now points at the RBF model

## Accuracy Score 비교

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of each fitted classifier, in the same order as the
# "Methods" labels below.
(acc_bag, acc_rf, acc_rf2, acc_ada, acc_gb, acc_linearsvm, acc_rbfsvm) = (
    accuracy_score(y_test, clf.predict(X_test))
    for clf in (result_bag, result_rf, result_rf2, result_ada, result_gb,
                linear_svm, rbf_svm)
)
accuracy = pd.DataFrame({
    "Methods": ['Bagging', 'RandomForest', 'RandomForest2', 'AdaBoost',
                'GradientBoosting', 'Linear SVM', 'RBF SVM'],
    "Accuracy": [acc_bag, acc_rf, acc_rf2, acc_ada, acc_gb, acc_linearsvm,
                 acc_rbfsvm],
})
# Displayed as the cell output: best accuracy first.
accuracy.sort_values(by='Accuracy', ascending=False)

## ROC curve 와 AUC

In [None]:
# ROC curves and AUC on the test split for every fitted classifier.
# Column 1 of predict_proba is used as the score; with the target coded 0/1
# that is the predicted probability of label 1. Scores are computed once per
# model and reused for both the curve and the AUC.
test_scores = [
    clf.predict_proba(X_test)[:, 1]
    for clf in (result_bag, result_rf, result_rf2, result_ada, result_gb,
                linear_svm, rbf_svm)
]
# sensitivity & specificity
((fpr1, tpr1, th1), (fpr2, tpr2, th2), (fpr3, tpr3, th3), (fpr4, tpr4, th4),
 (fpr5, tpr5, th5), (fpr6, tpr6, th6), (fpr7, tpr7, th7)) = (
    roc_curve(y_test, score) for score in test_scores
)
# AUROC
(roc_auc1, roc_auc2, roc_auc3, roc_auc4, roc_auc5, roc_auc6, roc_auc7) = (
    roc_auc_score(y_test, score) for score in test_scores
)
# Curve plotting: each legend entry reports the AUC of its own model.
plt.figure()
plt.plot(fpr1, tpr1, label='Bagging (area = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, label='RF (area = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, label='RF2 (area = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, label='AdaBoost (area = %0.2f)' % roc_auc4)
plt.plot(fpr5, tpr5, label='GB (area = %0.2f)' % roc_auc5)
plt.plot(fpr6, tpr6, label='SVM linear (area = %0.2f)' % roc_auc6)
plt.plot(fpr7, tpr7, label='SVM rbf (area = %0.2f)' % roc_auc7)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

# SMOTE 의 효과

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter    # class 갯수 확인 

In [None]:
# Oversample the training split with SMOTE (fixed seed); the test split is
# left as it is.
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
# Print the class counts before and then after resampling.
for labels in (y_train, y_smote):
    counter = Counter(labels)
    print(counter)

## Bagging 방법

In [None]:
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble (100 estimators, fixed seed) refit on the SMOTE-resampled
# training data; this rebinds model_bag / result_bag.
model_bag = BaggingClassifier(random_state=0, n_estimators=100)
model_bag.fit(X_smote, y_smote)
result_bag = model_bag  # fit() returns the estimator it was called on
# Displayed as the cell output: class probabilities for the test split.
result_bag.predict_proba(X_test)

## Random Forest 방법

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (100 trees, fixed seed) refit on the SMOTE-resampled
# training data; this rebinds model_rf / result_rf.
result_rf = RandomForestClassifier(random_state=0, n_estimators=100).fit(X_smote, y_smote)
model_rf = result_rf  # both names refer to the same fitted estimator
# Displayed as the cell output: class probabilities for the test split.
result_rf.predict_proba(X_test)

In [None]:
# Forest with max_features switched to "log2" (log2 of the number of
# predictors), refit on the SMOTE-resampled training data.
rf2_params = dict(n_estimators=100, random_state=0, max_features="log2")
model_rf2 = RandomForestClassifier(**rf2_params)
result_rf2 = model_rf2.fit(X_smote, y_smote)
result_rf2.predict_proba(X_test)

## AdaBoost 방법

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over depth-2 decision trees (100 boosting rounds, fixed seed),
# refit on the SMOTE-resampled training data.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, whereas the first positional slot is the base
# learner under both names.
model_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                               n_estimators=100, random_state=0)
result_ada = model_ada.fit(X_smote, y_smote)
# Displayed as the cell output: class probabilities for the test split.
result_ada.predict_proba(X_test)

## Gradient Boosting 방법

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting (100 stages, fixed seed) refit on the SMOTE-resampled
# training data; this rebinds model_gb / result_gb.
model_gb = GradientBoostingClassifier(random_state=0, n_estimators=100)
model_gb.fit(X_smote, y_smote)
result_gb = model_gb  # fit() returns the estimator it was called on
# Displayed as the cell output: class probabilities for the test split.
result_gb.predict_proba(X_test)

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import roc_curve 
from sklearn.metrics import roc_auc_score 
# Linear-kernel SVM refit on the SMOTE-resampled training data;
# probability=True so predict_proba is available.
linear_svm = SVC(kernel='linear', probability=True, random_state=0).fit(X_smote, y_smote)
model_svm = linear_svm  # same fitted object under both names

In [None]:
# RBF-kernel SVM refit on the SMOTE-resampled training data; a new SVC
# instance, so the linear model is untouched.
rbf_svm = SVC(kernel='rbf', probability=True, random_state=0).fit(X_smote, y_smote)
model_svm = rbf_svm  # model_svm now points at the RBF model

## Accuracy Score 비교

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of each classifier after refitting on SMOTE-resampled
# data, in the same order as the "Methods" labels below.
(acc_bag, acc_rf, acc_rf2, acc_ada, acc_gb, acc_linearsvm, acc_rbfsvm) = (
    accuracy_score(y_test, clf.predict(X_test))
    for clf in (result_bag, result_rf, result_rf2, result_ada, result_gb,
                linear_svm, rbf_svm)
)
accuracy = pd.DataFrame({
    "Methods": ['Bagging', 'RandomForest', 'RandomForest2', 'AdaBoost',
                'GradientBoosting', 'Linear SVM', 'RBF SVM'],
    "Accuracy": [acc_bag, acc_rf, acc_rf2, acc_ada, acc_gb, acc_linearsvm,
                 acc_rbfsvm],
})
# Displayed as the cell output: best accuracy first.
accuracy.sort_values(by='Accuracy', ascending=False)

## ROC curve 와 AUC

In [None]:
# ROC curves and AUC on the test split for the classifiers refit on the
# SMOTE-resampled training data.
# Column 1 of predict_proba is used as the score; with the target coded 0/1
# that is the predicted probability of label 1. Scores are computed once per
# model and reused for both the curve and the AUC.
test_scores = [
    clf.predict_proba(X_test)[:, 1]
    for clf in (result_bag, result_rf, result_rf2, result_ada, result_gb,
                linear_svm, rbf_svm)
]
# sensitivity & specificity
((fpr1, tpr1, th1), (fpr2, tpr2, th2), (fpr3, tpr3, th3), (fpr4, tpr4, th4),
 (fpr5, tpr5, th5), (fpr6, tpr6, th6), (fpr7, tpr7, th7)) = (
    roc_curve(y_test, score) for score in test_scores
)
# AUROC
(roc_auc1, roc_auc2, roc_auc3, roc_auc4, roc_auc5, roc_auc6, roc_auc7) = (
    roc_auc_score(y_test, score) for score in test_scores
)
# Curve plotting: each legend entry reports the AUC of its own model.
plt.figure()
plt.plot(fpr1, tpr1, label='Bagging (area = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, label='RF (area = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, label='RF2 (area = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, label='AdaBoost (area = %0.2f)' % roc_auc4)
plt.plot(fpr5, tpr5, label='GB (area = %0.2f)' % roc_auc5)
plt.plot(fpr6, tpr6, label='SVM linear (area = %0.2f)' % roc_auc6)
plt.plot(fpr7, tpr7, label='SVM rbf (area = %0.2f)' % roc_auc7)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()