In [1]:
from sklearn.datasets import load_digits, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# 모델 패키지 로드
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [20]:
def data_loader(dataset):
    """Load one of the bundled scikit-learn toy datasets by name.

    Prints the dataset description, the requested name and the target
    names, then returns the feature and target arrays of the loaded object.

    Parameters
    ----------
    dataset : str
        One of ``'digits'``, ``'wine'`` or ``'cancer'``.

    Returns
    -------
    tuple
        ``(data.data, data.target)`` taken from the loaded dataset object.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    if dataset == 'digits':
        data = load_digits()

    elif dataset == 'wine':
        data = load_wine()

    elif dataset == 'cancer':
        data = load_breast_cancer()

    else:
        # Fail fast with a clear message; otherwise an unknown name would
        # only surface later as an UnboundLocalError on ``data``.
        raise ValueError(
            "Unknown dataset {!r}; expected 'digits', 'wine' or 'cancer'.".format(dataset)
        )

    print(data.DESCR, '\n')
    print("Dataset: ", dataset)
    print("Target names: ", data.target_names)

    return data.data, data.target
   

def data_split(dataset, test_size=0.2, random_state=46):
    """Load the named dataset and divide it into train and test parts.

    Parameters
    ----------
    dataset : str
        Dataset name, forwarded unchanged to ``data_loader``.
    test_size : float, optional
        Forwarded to ``train_test_split`` as ``test_size`` (default 0.2).
    random_state : int, optional
        Forwarded to ``train_test_split`` as ``random_state`` (default 46).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    features, targets = data_loader(dataset)
    parts = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=random_state,
    )
    # Unpack explicitly so the four pieces are returned as a tuple.
    train_features, test_features, train_targets, test_targets = parts
    return train_features, test_features, train_targets, test_targets

def select_model(name):
    """Create an unfitted classifier chosen by its name.

    Parameters
    ----------
    name : str
        One of ``'DecisionTreeClassifier'``, ``'RandomForestClassifier'``,
        ``'svm'``, ``'SGDClassifier'`` or ``'LogisticRegression'``.

    Returns
    -------
    object
        A freshly constructed estimator instance; a confirmation line is
        printed before it is returned.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    if name == 'DecisionTreeClassifier':
        # Seeded like the random forest below so repeated runs are comparable.
        model = DecisionTreeClassifier(random_state=46)

    elif name == 'RandomForestClassifier':
        model = RandomForestClassifier(random_state=46)

    elif name == 'svm':
        model = svm.SVC()

    elif name == 'SGDClassifier':
        # SGD shuffles the training data, so it is seeded for reproducibility.
        model = SGDClassifier(random_state=46)

    elif name == 'LogisticRegression':
        model = LogisticRegression()

    else:
        # Fail fast with a clear message; otherwise an unknown name would
        # only surface as an UnboundLocalError on ``model``.
        raise ValueError(
            "Unknown model {!r}; expected 'DecisionTreeClassifier', "
            "'RandomForestClassifier', 'svm', 'SGDClassifier' or "
            "'LogisticRegression'.".format(name)
        )

    print(name, ' is selected.')
    return model

def fit_n_pred(model):
    """Train ``model`` and print how it scores on the test split.

    The data is not passed in: this function reads the module-level names
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so they must be
    defined before it is called.

    Prints the accuracy followed by the classification report and returns
    nothing.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print('accuracy: ', score, '\n')

    report = classification_report(y_test, predictions)
    print(report)
    

In [21]:
if __name__ == '__main__':
    # Pick the dataset and classifier for this run; both strings are matched
    # by name inside data_loader (via data_split) and select_model.
    dataset = 'cancer'
    model_name = 'svm'
    # The four splits are bound at module level because fit_n_pred reads
    # X_train, X_test, y_train and y_test as globals, not as parameters.
    X_train, X_test, y_train, y_test = data_split(dataset=dataset)
    model = select_model(name=model_name)
    # Fits the model and prints the accuracy and classification report.
    fit_n_pred(model)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi