In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns



In [18]:
df = pd.read_csv('../184702-tu-ml-ws-23-loan/loan-10k.lrn.csv')

print(df.shape)
df.head()

(10000, 92)


Unnamed: 0,ID,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,...,debt_settlement_flag,issue_d_month,issue_d_year,earliest_cr_line_month,earliest_cr_line_year,last_pymnt_d_month,last_pymnt_d_year,last_credit_pull_d_month,last_credit_pull_d_year,grade
0,24341,12500.0,12500.0,12500.0,36 months,7.21,387.17,< 1 year,MORTGAGE,81000.0,...,N,6,2018,6,2000,2,2019,2,2019,A
1,67534,33850.0,33850.0,33775.0,60 months,20.99,915.57,1 year,MORTGAGE,80000.0,...,N,10,2015,9,1984,2,2019,2,2019,E
2,35080,10000.0,10000.0,10000.0,60 months,20.0,264.94,< 1 year,RENT,36580.0,...,N,9,2017,10,2006,1,2018,11,2018,D
3,4828,20250.0,20250.0,20250.0,36 months,14.31,695.15,9 years,RENT,48700.0,...,N,0,2015,6,1996,6,2016,9,2017,C
4,59259,25000.0,25000.0,25000.0,36 months,14.99,866.52,1 year,MORTGAGE,85000.0,...,N,11,2016,0,2002,2,2019,2,2019,C


issue_d_month: The month which the loan was funded
earliest_cr_line_month: The month the borrower's earliest reported credit line was opened
last_pymnt_d_month: The month of the last payment 

In [6]:
df.describe().T.style.bar(subset=["mean"],color="#606ff2").background_gradient(subset=["std"],cmap="PuBu").background_gradient(subset=["50%"],cmap="PuBu")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,10000.0,50043.4302,28982.440166,0.0,24873.25,50033.5,75261.5,99999.0
loan_amnt,10000.0,15631.1525,9314.246117,1000.0,8350.0,14000.0,21000.0,40000.0
funded_amnt,10000.0,15631.1525,9314.246117,1000.0,8350.0,14000.0,21000.0,40000.0
funded_amnt_inv,10000.0,15625.9925,9312.426843,1000.0,8343.75,14000.0,21000.0,40000.0
int_rate,10000.0,13.216959,4.855838,5.31,9.75,12.73,16.01,30.99
installment,10000.0,461.084183,269.903751,30.12,263.77,396.78,616.3375,1717.63
annual_inc,10000.0,82128.966209,67692.836954,5000.0,50000.0,70000.0,97000.0,3200000.0
dti,10000.0,19.089188,9.523219,0.0,12.4375,18.33,24.93,168.52
delinq_2yrs,10000.0,0.3034,0.841201,0.0,0.0,0.0,0.0,16.0
fico_range_low,10000.0,698.1795,32.170977,660.0,675.0,690.0,715.0,845.0


## Data Preposesing

#### Data Cleaning

In [7]:
df.isna().sum()

ID                          0
loan_amnt                   0
funded_amnt                 0
funded_amnt_inv             0
term                        0
                           ..
last_pymnt_d_month          0
last_pymnt_d_year           0
last_credit_pull_d_month    0
last_credit_pull_d_year     0
grade                       0
Length: 92, dtype: int64

#### Feature Engineering


In [19]:
# Convert Categorical variable with Numerical values
from sklearn.preprocessing import LabelEncoder

loan_CategoricalColumns = df.select_dtypes(include=['object']).columns

for column in loan_CategoricalColumns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])


In [20]:
# Date-Time Features
df['issue_d_month'] = df['issue_d_month'] + 1
df['earliest_cr_line_month'] = df['earliest_cr_line_month'] + 1
df['last_pymnt_d_month'] = df['last_pymnt_d_month'] + 1
df['last_credit_pull_d_month'] = df['last_credit_pull_d_month'] + 1

loan_start_date = pd.to_datetime(df['issue_d_year'].astype(str) + '-' + df['issue_d_month'].astype(str))
last_pymnt_date = pd.to_datetime(df['last_pymnt_d_year'].astype(str) + '-' + df['last_pymnt_d_month'].astype(str))
df['loan_age'] = (last_pymnt_date - loan_start_date).dt.days

credit_start_date = pd.to_datetime(df['earliest_cr_line_year'].astype(str) + '-' + df['earliest_cr_line_month'].astype(str))
last_credit_pull_date = pd.to_datetime(df['last_credit_pull_d_year'].astype(str) + '-' + df['last_credit_pull_d_month'].astype(str))
df['credit_history_length'] = (last_credit_pull_date - credit_start_date).dt.days

columnsToDrop = ['issue_d_year','issue_d_month','earliest_cr_line_year','earliest_cr_line_month','last_pymnt_d_month','last_pymnt_d_year','last_credit_pull_d_month','last_credit_pull_d_year']
df=df.drop(columnsToDrop,axis=1)


In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb


In [21]:

ycol = 'grade'
feature_names = list(
    filter(lambda x: x not in [ycol], df.columns))

X = df[feature_names]
y = df['grade']

train_data,test_data,train_y,test_y = \
                train_test_split(X,y,test_size=0.2,random_state=1,shuffle=True,stratify=df.grade)



In [24]:
model = lgb.LGBMClassifier(objective='multiclass',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=64,
                           max_depth=8,
                           learning_rate=0.02,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.3,
                           reg_lambda=0.5,
                           random_state=2020,
                           is_unbalance=True)

oof = []
prediction = test_data[['ID']]
prediction[f'{ycol}_0'] = 0
prediction[f'{ycol}_1'] = 0
df_importance_list = []

def f1_score_custom(y_true, y_pred):
    y_pred = y_pred.round()
    return 'f1', f1_score(y_true, y_pred), True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction[f'{ycol}_0'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction[f'{ycol}_1'] = 0


In [27]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [34]:
feature_names = list(
    filter(lambda x: x not in [ycol,'ID'], df.columns))

In [37]:
# merge train_data and train_y
train = pd.concat([train_data,train_y],axis=1)

In [40]:
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_data[feature_names], train_y)):
    X_train = train_data.iloc[trn_idx]
    Y_train = train_y.iloc[trn_idx]

    X_val = train_data.iloc[val_idx]
    Y_val = train_y.iloc[val_idx]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))
    

    
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric=lambda y_true, y_pred: f1_score_custom(y_true, y_pred),
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    
    df_oof = train.iloc[val_idx][['ID', ycol]].copy()
    df_oof[f'{ycol}_0'] = pred_val[:,0]
    df_oof[f'{ycol}_1'] = pred_val[:,1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[f'{ycol}_0'] += pred_test[:,0] / kfold.n_splits
    prediction[f'{ycol}_1'] += pred_test[:,1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()

df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

: 

In [1]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

X = df[feature_names]
y = df['grade']



In [7]:
train_data_all,test_data,train_y_all,test_y = \
                train_test_split(X, y,test_size=0.2,random_state=1,shuffle=True,stratify=df.grade)
# 再从训练集中分割出训练集与验证集
train_data,val_data,train_y,val_y = \
                train_test_split(train_data_all, train_y_all,test_size=0.2,random_state=1,shuffle=True, stratify=train_y_all)

In [8]:
clf = lgb.LGBMClassifier(objective="multiclass", n_estimators=10)

In [1]:
clf.fit(train_data,train_y,early_stopping_rounds=5,eval_set=[(val_data,val_y)],verbose=10)

NameError: name 'clf' is not defined

In [None]:
# 模型预测
y_pred = clf.predict(test_data)

# 查看分类结果
from sklearn.metrics import classification_report
print(classification_report(test_y, y_pred))

In [None]:
# 将任务1训练得到的模型，使用pickle进行保存
import pickle
pickle.dump(booster, open('model.pickle', 'wb'))

# 从pickle加载模型
model_pickle = pickle.load(open('model.pickle','rb'))
# 模型预测
y_pred_pickle = model_pickle.predict(test_data)
# 预测结果
y_pred_pickle