# 유방암 환자 데이터 분석

In [8]:
import sklearn
import pandas as pd
from sklearn.datasets import make_classification
# Build a synthetic classification dataset: 1000 samples with 4 features,
# 2 of them informative and none redundant. The fixed seed keeps the
# generated data reproducible between runs.
X, Y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=0,
)

|옵션명|설명|
|-----|----|
|n_samples|표본 수(default=100)|
|n_features|독립변수 수(default=20)|
|n_informative|종속변수와 상관관계가 존재하는 독립변수 수(default=2)|
|n_redundant|독립변수끼리 종속관계에 있는 독립변수 수(default=2)|
|n_repeated|중복 독립변수 수(default=0)|
|n_classes|종속변수 클래스(라벨) 수(default=2)|
|n_clusters_per_class|클래스당 클러스터 수(default=2)|
|weights|각 클래스에 할당된 표본 비율|
|random_state|난수 생성 시드(seed) 번호|

In [9]:
# Load the breast cancer dataset bundled with scikit-learn.
raw = sklearn.datasets.load_breast_cancer()
print(raw.feature_names)    # column (feature) names

# Put the feature matrix and the label side by side in a single table.
data = pd.DataFrame(raw.data)
target = pd.DataFrame(raw.target)
rawData = pd.concat([data, target], axis=1)

# Take the column names from the dataset itself instead of retyping all 30
# of them by hand; the label column comes last and is named 'cancer'.
# NOTE(review): scikit-learn documents this target as 0 = malignant,
# 1 = benign, so 'cancer' == 0 would be the malignant class -- confirm.
rawData.columns = list(raw.feature_names) + ['cancer']
rawData.head(10)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,cancer
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0


In [13]:
# Predictors: the two mean-value measurements; response: the 'cancer' label.
# List-based column selection keeps both results as DataFrames, not Series.
x = rawData.loc[:, ['mean radius', 'mean texture']]
y = rawData.loc[:, ['cancer']]
# 오버피팅 방지

### 모델링
* **모델링** : 어떤 대상을 본떠 만듦
  * 실체가 아님
  * 실체와 비슷한 대체물
  * 모델을 만들면 실제 대상보다 더 편하게 활용 가능
  * 즉, 일종의 데이터를 압축하는 과정

### 신호 / 잡음
* 신호와 잡음을 구분하기 위해서는 모델링 대상이 무엇인지 명확히 정의하는 것이 필요함
![4가지 모델](./resources/image/model_graph.PNG)


### 오버피팅 / 언더피팅
* **언더피팅** : `모델링 대상`을 설명하기에 필요한 신호를 충분히 모델에 반영하지 못한 상태
* **오버피팅** : `모델링 대상`을 설명하는 데 불필요한 잡음을 과도하게 모델에 반영한 상태

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

### ML의 Train / Test

**Train** : 우리가 가지고 있는 정보<br>
**Test** : 만든 모델의 실력 평가
- 같은 데이터라도 자르는 방식에 따라 모델이 달라질 수 있음

# 선형 회귀 분석

In [18]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split and evaluate it on the
# held-out test split. In a notebook only the value of the last expression
# is displayed, so the cell output is the score.
clf = LinearRegression()
clf.fit(x_train, y_train)   # estimate the parameters
clf.coef_                   # estimated coefficients (intercept excluded)
clf.intercept_              # estimated intercept
clf.predict(x_test)         # predictions for the test split
clf.score(x_test, y_test)   # model performance on the test split

0.6092200214592733