# 다범주 분류 분석 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the iris table from a CSV file in the working directory.
iris = pd.read_csv("iris.csv")
# Bare expression: displayed as the cell output when run in a notebook.
iris

In [None]:
# Scatter plot of sepal length against petal length, coloured by species.
sns.scatterplot(data=iris, x='Sepal_Length', y='Petal_Length', hue='Species', palette="Set2")
plt.show()

In [None]:
# The label is the Species column; the features are every remaining column.
y = iris['Species']
X = iris.drop(columns='Species')

## 로지스틱 회귀분석

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression: one binary classifier is fitted per class.
# LogisticRegression's `multi_class` argument is deprecated (scikit-learn 1.5+)
# and scheduled for removal, and this notebook silences FutureWarning, so the
# one-vs-rest scheme is requested through the OneVsRestClassifier wrapper.
model = OneVsRestClassifier(LogisticRegression())
model.fit(X, y)
# In-sample predictions (the model is evaluated on the data it was fitted on).
yhat = model.predict(X)
yhat

## SVM

In [None]:
# Use a linear kernel.
from sklearn.svm import SVC

model_svm = SVC(decision_function_shape='ovr', kernel='linear')
# fit() returns the estimator itself, so both names refer to the same fitted model.
model_svm.fit(X, y)
linear_svm = model_svm
# In-sample predictions, shown as the cell output.
linear_svm.predict(X)

# 불균형 데이터 처리방법

In [None]:
#!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter    # class 갯수 확인 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the product data, then report its dimensions and the total number of
# missing cells across all columns.
df = pd.read_csv('product.csv')
print(df.shape)
print(df.isna().sum().sum())
# Encode the label as 1 for "NG" rows and 0 for every other value.
y = np.where(df['Defect'] == "NG", 1, 0)
# Features are all columns except the label.
X = df.drop(columns='Defect')
# Keep the feature names; the bare expression shows them as the cell output.
xvar = X.columns
xvar

In [None]:
# Count how many samples fall into each encoded class (0 / 1) before splitting.
counter = Counter(y)
print(counter)

## Train/Test 분할

In [None]:
from sklearn.model_selection import train_test_split
def data_split(x, y, *, test_size=0.5, random_state=1234):
    """Split features and 0/1 labels into stratified train and test sets.

    Prints the percentage of positive (1) labels in each split so the effect
    of stratification can be checked at a glance.

    Args:
        x: Feature table accepted by ``train_test_split``.
        y: Labels encoded as 0/1; also used as the stratification key.
        test_size: Fraction (or absolute count) of samples placed in the test
            split. Defaults to 0.5, the value previously hard-coded.
        random_state: Seed for the shuffle. Defaults to 1234, the value
            previously hard-coded.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # With 0/1 labels, sum / len is the share of positive samples.
    train_ratio = round(sum(Y_train) / len(Y_train) * 100, 2)
    test_ratio = round(sum(Y_test) / len(Y_test) * 100, 2)
    print("Ratio of the event: Training dataset:", train_ratio, "%, Test dataset:",
          test_ratio, "%")
    return X_train, X_test, Y_train, Y_test

In [None]:
# Stratified train/test split of the product data; also prints the positive-label share of each split.
X_train, X_test, y_train, y_test=data_split (X, y)

In [None]:
# Per-class sample counts in each split, to confirm both keep the class balance.
counter_train, counter_test = Counter(y_train), Counter(y_test)
print("train: ", counter_train, ",   test:", counter_test)

### SMOTE

In [None]:
# Resample the training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; the test split is left as it is.
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
# Summarize the class distribution after resampling.
counter = Counter(y_smote)
print(counter)

#### 의사결정나무

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Two decision trees with identical settings: one fitted on the original
# training split, one on the SMOTE-resampled training split.
# The seed is fixed (matching the SMOTE and SVC steps) because an unseeded
# tree can pick different splits from run to run, which would make the
# original-vs-SMOTE comparison non-reproducible.
model_tree = DecisionTreeClassifier(random_state=0)
original_tree = model_tree.fit(X_train, y_train)
model_tree_s = DecisionTreeClassifier(random_state=0)
smote_tree = model_tree_s.fit(X_smote, y_smote)

#### SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVMs with probability estimates enabled (needed later for
# predict_proba): one fitted on the original training split, one on the
# SMOTE-resampled training split.
model_svm = SVC(random_state=0, probability=True, kernel='linear')
# fit() returns the estimator itself, so each alias is the same fitted object.
model_svm.fit(X_train, y_train)
original_svm = model_svm

model_svm_s = SVC(random_state=0, probability=True, kernel='linear')
model_svm_s.fit(X_smote, y_smote)
smote_svm = model_svm_s

## ROC curve 와 AUC

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Predicted probability of the positive class (label 1) on the test split.
# Computed once per model and reused for both the ROC curve and the AUC.
proba_tree = model_tree.predict_proba(X_test)[:, 1]
proba_svm = model_svm.predict_proba(X_test)[:, 1]
proba_tree_s = model_tree_s.predict_proba(X_test)[:, 1]
proba_svm_s = model_svm_s.predict_proba(X_test)[:, 1]

# ROC points (false positive rate, true positive rate, thresholds) per model.
fpr1, tpr1, th1 = roc_curve(y_test, proba_tree)
fpr2, tpr2, th2 = roc_curve(y_test, proba_svm)
fpr3, tpr3, th3 = roc_curve(y_test, proba_tree_s)
fpr4, tpr4, th4 = roc_curve(y_test, proba_svm_s)

# Area under each ROC curve.
roc_auc1 = roc_auc_score(y_test, proba_tree)
roc_auc2 = roc_auc_score(y_test, proba_svm)
roc_auc3 = roc_auc_score(y_test, proba_tree_s)
roc_auc4 = roc_auc_score(y_test, proba_svm_s)

# Plot all four curves together with the chance diagonal.
plt.figure()
plt.plot(fpr1, tpr1, label='Tree (area = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, label='SVM (area = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, label='Tree Smote (area = %0.2f)' % roc_auc3)
# Fixed: this label previously reported roc_auc3 (the SMOTE tree's AUC).
plt.plot(fpr4, tpr4, label='SVM Smote (area = %0.2f)' % roc_auc4)
plt.plot([0, 1], [0, 1],'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()