**다항식**

단항값 [x_1, x_2]을 2차 다항값으로 [1, x_1, x_2, (x_1)^2, x_1*x_2, (x_2)^2]로 변환

In [5]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.linear_model import LinearRegression

# 다항식으로 변환한 단항식 생성, [[0,1], [2,3]]의 2X2 행렬 생성
X = np.arange(4).reshape(2,2)
print('일차 단항식 계수 피처 : \n', X)

# degree = 2인 2차 다항식으로 변환하기 위해 PolynomialFeatures를 이용해 반환
poly = PolynomialFeatures(degree=2)
poly.fit(X)
poly_ftr = poly.transform(X)
print('변환된 2차 다항식 계수 피처 : \n', poly_ftr)

일차 단항식 계수 피처 : 
 [[0 1]
 [2 3]]
변환된 2차 다항식 계수 피처 : 
 [[1. 0. 1. 0. 0. 1.]
 [1. 2. 3. 4. 6. 9.]]


단순히 이런 형태가 아닌 임의로 원하는 다항식을 만드는 경우에는 별도의 함수를 정의해주어야 한다. 여기에서는 y = 1 + 2 * x_1 + 3 * (x_1)^2 + 4 * (x_2)^2로 피처를 다항 피처로 변환

In [6]:
def polynomial_func(X):
    y = 1 + 2 *X[:,0] + 3 *X[:,0]**2 + 4 *X[:,1] ** 3
    return y

X = np.arange(4).reshape(2, 2)
print('일차 단항식 계수 features : \n', X)
y = polynomial_func(X)
print('삼차 다항식 결정값 : \n', y)

일차 단항식 계수 features : 
 [[0 1]
 [2 3]]
삼차 다항식 결정값 : 
 [  5 125]


단항 피처를 가지고 3차 다항 피처로 변환한 후에, 선형회귀를 이용하여 회귀계수를 찾기

In [7]:
# 3차 다항식 변환
poly_ftr = PolynomialFeatures(degree=3).fit_transform(X)
print('3차 다항식 계수 feature : \n', poly_ftr)

# Linear Regression에 3차 다항식 계수 feature와 3차 다항식 결정값으로 학습 후 회귀 계수 확인
model = LinearRegression()
model.fit(poly_ftr, y)
print('Polynomial 회귀 계수 : \n', np.round(model.coef_, 2))
print('Polynomial 회귀 Shape : \n', model.coef_.shape)

3차 다항식 계수 feature : 
 [[ 1.  0.  1.  0.  0.  1.  0.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.  8. 12. 18. 27.]]
Polynomial 회귀 계수 : 
 [0.   0.18 0.18 0.36 0.54 0.72 0.72 1.08 1.62 2.34]
Polynomial 회귀 Shape : 
 (10,)


In [8]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
import numpy as np

# 데이터셋 가정
X_train = np.random.random((10, 2))
y_train = np.random.random((10, 1))

# 라쏘, 릿지, 엘라스틱넷 적용 선형 회귀 모형 선언
model_l = Lasso(alpha = 0.1)
model_r = Ridge(alpha = 0.1)
model_e = ElasticNet(alpha = 0.1, l1_ratio = 0.5)

# 선언된 모형 학습
model_l.fit(X_train, y_train)
model_r.fit(X_train, y_train)
model_e.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error
y_pred_l = model_l.predict(X_train) # 예측 값
print(mean_squared_error(y_pred_l, y_train, squared = False))
y_pred_r = model_r.predict(X_train) # 예측 값
print(mean_squared_error(y_pred_r, y_train, squared = False))
y_pred_e = model_e.predict(X_train) # 예측 값
print(mean_squared_error(y_pred_e, y_train, squared = False)) # 낮을수록 좋음

0.2746375322338031
0.20294439541905393
0.2746375322338031


# Logistic Regression

In [14]:
# Step 1 : import packages, functions, and classes
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Step 2 : Get data
x = np.arange(10).reshape(-1, 1)
y= np.array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1])

# Step 3 : Create a model and train it
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
model.fit(x, y)

# Step 4 : Evaluate the model
p_pred = model.predict_proba(x)
y_pred = model.predict(x)
score_ = model.score(x, y)
conf_m = confusion_matrix(y, y_pred)
report = classification_report(y, y_pred)

# Step 5 : Print
print(p_pred)
print(y_pred)
print(score_)
print(conf_m)
print(report)

[[0.81999686 0.18000314]
 [0.69272057 0.30727943]
 [0.52732579 0.47267421]
 [0.35570732 0.64429268]
 [0.21458576 0.78541424]
 [0.11910229 0.88089771]
 [0.06271329 0.93728671]
 [0.03205032 0.96794968]
 [0.0161218  0.9838782 ]
 [0.00804372 0.99195628]]
[0 0 0 1 1 1 1 1 1 1]
0.8
[[2 1]
 [1 6]]
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.86      0.86      0.86         7

    accuracy                           0.80        10
   macro avg       0.76      0.76      0.76        10
weighted avg       0.80      0.80      0.80        10


# Logistic Regression 2 - Titanic

In [15]:
import pandas as pd
df = pd.read_csv("Data/titanic_train.csv")
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [16]:
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [17]:
df['Sex'] = df['Sex'].map({'female':1, 'male':0})
print(df.isnull().sum())

df['Age'].fillna(value=df['Age'].mean(), inplace=True)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [42]:
df['FirstClass'] = df['Pclass'].apply(lambda x : 1 if x == 1 else 0)
df['SecondClass'] = df['Pclass'].apply(lambda x : 1 if x == 2 else 0)
df.head()

data = df[['Sex', 'Age', 'FirstClass', 'SecondClass']]
target = df['Survived']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_f = scaler.fit_transform(X_train.values)
X_test_f = scaler.transform(X_test.values)

In [48]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train.values, y_train)

In [53]:
print(model.score(X_train.values, y_train))

0.7829341317365269


In [54]:
print(model.score(X_test.values, y_test))

0.7937219730941704


In [55]:
print(model.coef_)

[[ 2.49430529 -0.03180804  2.22308112  0.93693463]]


In [57]:
import numpy as np

Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
UngHoe = np.array([0.0, 23.0, 1.0, 0.0])

sample = np.array([Jack, Rose, UngHoe])
sample_f = scaler.transform(sample)
print(model.predict(sample_f))
print(model.predict_proba(sample_f))

#  UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature name 발생 시, X에 들어가는 값에 .values 붙여주기

[0 1 1]
[[0.9917001  0.0082999 ]
 [0.00344733 0.99655267]
 [0.39693989 0.60306011]]
