In [1]:
from sklearn.metrics import confusion_matrix
# Toy binary labels: exactly one positive sample (index 0) is predicted as negative.
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [0, 0, 1, 1, 0, 1]
# In the result, rows follow the true class and columns the predicted class.
confusion_matrix(y_true, y_pred) 

array([[2, 0],
       [1, 3]])

In [2]:
# ravel() flattens the 2x2 matrix row by row, which for these binary labels
# yields true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(np.int64(2), np.int64(0), np.int64(1), np.int64(3))

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score
# Accuracy by hand: the mean of the element-wise "prediction equals label"
# mask is the fraction of correctly classified samples.
y_pred = np.array([0, 1, 1, 0])
y_true = np.array([0, 1, 0, 0])
np.mean(y_true == y_pred)

np.float64(0.75)

In [4]:
accuracy_score(y_true, y_pred)

0.75

In [5]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Eight toy binary samples: 2 true positives, 3 false positives and
# 1 false negative when y_pred is compared against y_true.
y_pred = np.array([0, 1, 1, 0, 1, 1, 1, 0])
y_true = np.array([0, 1, 0, 0, 0, 0, 1, 1])

In [6]:
precision_score(y_true, y_pred) # precision: TP / (TP + FP)

0.4

In [7]:
recall_score(y_true, y_pred)    # recall (sensitivity): TP / (TP + FN)

0.6666666666666666

In [8]:
f1_score(y_true, y_pred)        # F1 score: harmonic mean of precision and recall

0.5

In [9]:
def sigmoid(z):
    """Return ``1 / (1 + e**z)``, evaluated element-wise for array input.

    The exponent is ``+z`` rather than ``-z``, so this is the standard
    logistic curve evaluated at ``-z``. Callers that want the usual
    logistic value of a score must therefore pass the negated score.
    """
    denominator = 1 + np.exp(z)
    return 1 / denominator

In [10]:
def hypothesis_function(x, theta):
    """Evaluate the logistic model for the inputs ``x`` and weights ``theta``.

    The dot product is taken with ``-x`` because ``sigmoid`` computes
    ``1 / (1 + exp(z))``; feeding it the negated linear score gives
    ``1 / (1 + exp(-x . theta))``.
    """
    return sigmoid(np.dot(-x, theta))

In [12]:
def compute_cost(x, y, theta):
    """Average binary cross-entropy of the logistic model on ``(x, y)``.

    Computes ``J = -(1/m) * (y^T log(h) + (1 - y)^T log(1 - h))`` where
    ``h = hypothesis_function(x, theta)`` and ``m = y.shape[0]``.

    Parameters
    ----------
    x : feature array forwarded to ``hypothesis_function``.
    y : label array; must support ``.shape``, ``.T`` and ``.dot``
        (presumably a NumPy array of 0/1 values shaped like ``h`` -- confirm
        against the caller).
    theta : weight array forwarded to ``hypothesis_function``.

    Returns
    -------
    The cost ``J``, in whatever shape the two dot products produce.
    """
    m = y.shape[0]

    # Evaluate the model once and reuse it for both log terms.
    h = hypothesis_function(x, theta)

    # Keep probabilities strictly inside (0, 1): a saturated prediction of
    # exactly 0 or 1 would otherwise give log(0) = -inf and a NaN/inf cost.
    eps = 1e-15
    h = np.clip(h, eps, 1 - eps)

    J = (-1.0 / m) * (
        y.T.dot(np.log(h)) +
        (1 - y).T.dot(np.log(1 - h)))

    return J

In [None]:
def minimize_gradient(x, y, theta, iterations=100000, alpha=0.01):
    """Fit the logistic-model weights with batch gradient descent.

    Each iteration computes the residuals ``hypothesis_function(x, theta) - y``
    once, from the weights as they stood at the start of the iteration, and
    then moves every coordinate ``theta[i]`` by ``alpha`` times the dot product
    of those residuals with feature column ``i``. All coordinates therefore
    see the same gradient (simultaneous update). The gradient is the plain
    sum over samples; it is not divided by the sample count, so ``alpha``
    absorbs that factor.

    Parameters
    ----------
    x : 2-D feature array; column ``i`` pairs with ``theta[i]``.
    y : label array broadcast-compatible with the model output
        (presumably shaped ``(m, 1)`` when ``theta`` is ``(n, 1)`` -- confirm).
    theta : initial weights. Updated IN PLACE and also returned; pass a copy
        to keep the starting values. Presumably a float array, since an
        integer array would truncate each update -- confirm.
    iterations : number of gradient steps.
    alpha : learning rate.

    Returns
    -------
    (theta, cost_history, theta_history) where the two histories are arrays
    holding the cost and a snapshot of the weights every 100th iteration
    (iterations 0, 100, 200, ...).
    """
    cost_history = []
    theta_history = []

    for step in range(iterations):
        # Residuals from the pre-update weights. Computing them before the
        # coordinate loop (instead of from an alias of the array being
        # modified) keeps earlier coordinate updates from leaking into the
        # gradient of later coordinates within the same iteration.
        delta = hypothesis_function(x, theta) - y

        for i in range(theta.size):
            partial_marginal = x[:, i].reshape(x.shape[0], 1)
            grad_i = delta.T.dot(partial_marginal)

            theta[i] = theta[i] - (alpha * grad_i)

        if (step % 100) == 0:
            # Store a snapshot: appending `theta` itself would store the same
            # live array every time, making all history rows identical.
            theta_history.append(theta.copy())
            cost_history.append(compute_cost(x, y, theta))
    return theta, np.array(cost_history), np.array(theta_history)

## 로지스틱 회귀 실습

### 데이터 불러오기

In [14]:
import pandas as pd
# Load the survey data as delimited text and inspect the inferred column types.
# NOTE(review): the path is relative to the notebook's working directory, and
# read_table is called with its default separator -- confirm both match the file.
df = pd.read_table("data_ML/uva.txt")
df.dtypes

who                      object
Newbie                    int64
Age                     float64
Gender                   object
Household Income         object
Sexual Preference        object
Country                  object
Education Attainment     object
Major Occupation         object
Marital Status           object
Years on Internet        object
dtype: object

In [15]:
df

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr
...,...,...,...,...,...,...,...,...,...,...,...
19578,id83400,0,22.0,Male,Over $100,Heterosexual,Texas,Some College,Education,Single,4-6 yr
19579,id72216,0,19.0,Male,,Heterosexual,New Jersey,Some College,Education,Single,4-6 yr
19580,id8654,0,49.0,Female,$50-74,Heterosexual,Missouri,Doctoral,Education,Married,1-3 yr
19581,id84503,1,42.0,Female,$50-74,Heterosexual,Kentucky,Some College,Other,Married,Under 6 mo


### 전처리

In [22]:
# Remove the columns that are not carried into the one-hot encoded features.
# A single drop(..., errors="ignore") keeps the cell re-runnable: repeated
# df.pop() calls raise KeyError once a column has already been removed.
df = df.drop(columns=["who", "Country", "Years on Internet"], errors="ignore")

df.dtypes

Newbie                     int64
Age                      float64
Gender                  category
Household Income        category
Sexual Preference       category
Education Attainment    category
Major Occupation        category
Marital Status          category
dtype: object

In [23]:
category_cols = [
    "Gender",
    'Household Income',
    'Sexual Preference',
    'Education Attainment',
    'Major Occupation',
    "Marital Status",
]

# Convert every listed text column to the pandas categorical dtype in one call.
df = df.astype({name: 'category' for name in category_cols})

df.dtypes

Newbie                     int64
Age                      float64
Gender                  category
Household Income        category
Sexual Preference       category
Education Attainment    category
Major Occupation        category
Marital Status          category
dtype: object

In [25]:
# One-hot encode the categorical columns; the numeric columns (Newbie, Age)
# are carried over as they are. The shape shows the resulting column count.
df_onehot = pd.get_dummies(df)
df_onehot.shape

(19583, 38)

In [26]:
df_onehot

Unnamed: 0,Newbie,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,54.0,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,0,39.0,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,1,49.0,True,False,False,False,False,True,False,False,...,False,True,False,False,False,False,True,False,False,False
3,1,22.0,True,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,0,20.0,False,True,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19578,0,22.0,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
19579,0,19.0,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
19580,0,49.0,True,False,False,False,False,False,True,False,...,True,False,False,False,False,True,False,False,False,False
19581,1,42.0,True,False,False,False,False,False,True,False,...,False,False,True,False,False,True,False,False,False,False


In [28]:
df_onehot.isnull().sum()

Newbie                                 0
Age                                  561
Gender_Female                          0
Gender_Male                            0
Household Income_$10-19                0
Household Income_$20-29                0
Household Income_$30-39                0
Household Income_$40-49                0
Household Income_$50-74                0
Household Income_$75-99                0
Household Income_Over $100             0
Household Income_Under $10             0
Sexual Preference_Bisexual             0
Sexual Preference_Gay male             0
Sexual Preference_Heterosexual         0
Sexual Preference_Lesbian              0
Sexual Preference_Transgender          0
Sexual Preference_na                   0
Education Attainment_College           0
Education Attainment_Doctoral          0
Education Attainment_Grammar           0
Education Attainment_High School       0
Education Attainment_Masters           0
Education Attainment_Other             0
Education Attain

In [29]:
# Replace missing ages with the mean of the observed ages.
age_is_missing = df_onehot['Age'].isnull()
df_onehot.loc[age_is_missing, "Age"] = df_onehot['Age'].mean()

In [30]:
df_onehot.isnull().sum()

Newbie                               0
Age                                  0
Gender_Female                        0
Gender_Male                          0
Household Income_$10-19              0
Household Income_$20-29              0
Household Income_$30-39              0
Household Income_$40-49              0
Household Income_$50-74              0
Household Income_$75-99              0
Household Income_Over $100           0
Household Income_Under $10           0
Sexual Preference_Bisexual           0
Sexual Preference_Gay male           0
Sexual Preference_Heterosexual       0
Sexual Preference_Lesbian            0
Sexual Preference_Transgender        0
Sexual Preference_na                 0
Education Attainment_College         0
Education Attainment_Doctoral        0
Education Attainment_Grammar         0
Education Attainment_High School     0
Education Attainment_Masters         0
Education Attainment_Other           0
Education Attainment_Professional    0
Education Attainment_Some

### 데이터셋 만들기

In [31]:
# Column 0 of df_onehot (Newbie) is the target; every remaining column is a feature.
x_data = df_onehot.iloc[:, 1:].values
# reshape(-1, 1) turns the labels into a single-column 2-D array.
y_data = df_onehot.iloc[:, 0].values.reshape(-1, 1)
y_data.shape, x_data.shape

((19583, 1), (19583, 37))

In [32]:
from sklearn import preprocessing # Min-Max scaling (per-feature rescaling)

# NOTE(review): the scaler is fitted on the complete data set, i.e. before the
# train/test split, so held-out rows influence the feature minima/maxima.
# Consider fitting on the training portion only -- confirm intent.
min_max_scaler = preprocessing.MinMaxScaler()
x_data = min_max_scaler.fit_transform(x_data)

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
      x_data, y_data, test_size=0.33, random_state=42)

X_train.shape, X_test.shape

((13120, 37), (6463, 37))

### 모델학습

In [34]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with an intercept term.
logreg = LogisticRegression(fit_intercept=True)
# y_train is a single-column 2-D array; flatten() passes the labels as 1-D.
logreg.fit(X_train, y_train.flatten())

In [35]:
# NOTE(review): this looks like the printed repr of an estimator from an older
# scikit-learn release pasted into a code cell (note the 'warn' placeholders
# for solver / multi_class). The object it builds is not assigned or used
# anywhere; confirm whether this cell should be kept as code.
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, 
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2', 
                   random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)

### 예측과 성능지표

In [36]:
logreg.predict(X_test[:5])

array([0, 0, 0, 0, 0])

In [37]:
# Per-class probabilities for the first five test rows; each row sums to 1.
# Presumably column 0 is class 0 and column 1 is class 1 -- see logreg.classes_.
logreg.predict_proba(X_test[:5])

array([[0.56804624, 0.43195376],
       [0.91113996, 0.08886004],
       [0.79499294, 0.20500706],
       [0.85913521, 0.14086479],
       [0.62787685, 0.37212315]])

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# y_test is a single-column 2-D array while predict() returns a 1-D array.
# flatten() yields a 1-D copy so both label vectors share one shape; comparing
# a column vector with a 1-D array element-wise (y_true == y_pred) would
# otherwise broadcast to an (n, n) matrix.
y_true = y_test.flatten()
y_pred = logreg.predict(X_test)
confusion_matrix(y_true, y_pred)


array([[4488,  274],
       [1351,  350]])

In [39]:
accuracy_score(y_true, y_pred)

0.7485687761101656