In [46]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [47]:
df = pd.read_csv('german.csv') #독일신용평가데이터
#비어있는 데이터 평균치로 채우기
df['MonthlyIncome'].fillna(value=df['MonthlyIncome'].mean(), inplace=True)
df['NumberOfDependents'].fillna(value=df['NumberOfDependents'].mean(), inplace=True)

In [48]:
#데이터 분류
features = df[['RevolvingUtilizationOfUnsecuredLines','age','59DaysPastDueNotWorse','DebtRatio',
'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines','89DaysPastDueNotWorse','NumberOfDependents']]
worse = df['Worse']
train_features, test_features, train_labels, test_labels = train_test_split(features, worse)

In [49]:
#데이터 스케일링 작업
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

In [50]:
#모델 생성
model = LogisticRegression()
model.fit(train_features, train_labels)
print("훈련 데이터 정확도 : " + str(model.score(train_features, train_labels)))
print("테스트 데이터 정확도 : " + str(model.score(test_features, test_labels)))

훈련 데이터 정확도 : 0.9338311111111111
테스트 데이터 정확도 : 0.934


In [45]:
#각 독립변수들의 계수
print(model.coef_)

[[-0.00397787 -0.42261036  2.02204317 -0.04060423 -0.41465149 -0.04060333
   1.88228244  0.07241396 -3.75075226  0.1056979 ]]


In [62]:
#예측
t_data = pd.read_csv('german_test.csv')
t_data['MonthlyIncome'].fillna(value=t_data['MonthlyIncome'].mean(), inplace=True)
t_data['NumberOfDependents'].fillna(value=t_data['NumberOfDependents'].mean(), inplace=True)
input_data = t_data[['RevolvingUtilizationOfUnsecuredLines','age','59DaysPastDueNotWorse','DebtRatio',
'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines','89DaysPastDueNotWorse','NumberOfDependents']]

input_data = scaler.transform(input_data.head(30))

print("30개 테스트 데이터 예측 결과 : " + str(model.predict(input_data)))
print("확률 : " + str(model.predict_proba(input_data)))

30개 테스트 데이터 예측 결과 : [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
확률 : [[0.94003019 0.05996981]
 [0.94954652 0.05045348]
 [0.95205804 0.04794196]
 [0.8751248  0.1248752 ]
 [0.89453468 0.10546532]
 [0.95980101 0.04019899]
 [0.92391053 0.07608947]
 [0.9562458  0.0437542 ]
 [0.9706151  0.0293849 ]
 [0.23269894 0.76730106]
 [0.92540596 0.07459404]
 [0.94579531 0.05420469]
 [0.96944688 0.03055312]
 [0.90150412 0.09849588]
 [0.95557348 0.04442652]
 [0.91661734 0.08338266]
 [0.96823457 0.03176543]
 [0.92379466 0.07620534]
 [0.95757022 0.04242978]
 [0.93953388 0.06046612]
 [0.94704438 0.05295562]
 [0.97020124 0.02979876]
 [0.96769949 0.03230051]
 [0.94037523 0.05962477]
 [0.91694056 0.08305944]
 [0.95393056 0.04606944]
 [0.9350788  0.0649212 ]
 [0.97063063 0.02936937]
 [0.9147581  0.0852419 ]
 [0.94413362 0.05586638]]
