In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [59]:
# Read the training CSV from the mounted Drive folder.
train = pd.read_csv(
    '/content/drive/MyDrive/184702-tu-ml-ws-23-loan/loan-10k.lrn.csv'
)

### Data Preprocessing

In [19]:
def featureEnginerring(df):
  """Label-encode categorical columns and derive two date-span features.

  Steps, in order:
    1. Every ``object``-dtype column is replaced by the integer codes of a
       ``LabelEncoder`` fitted on that column.
    2. Each ``*_month`` value is incremented by 1 before being combined with
       its ``*_year`` column into a datetime via ``pd.to_datetime``
       (presumably months are stored zero-based in the raw data -- TODO confirm).
    3. ``loan_age`` is the number of days from the issue date to the last
       payment date; ``credit_history_length`` is the number of days from
       the earliest credit line to the last credit pull.
    4. The eight year/month source columns are dropped.

  The input frame is never modified: all work happens on a copy, so the
  function can be called again on the same raw frame and give the same result.

  NOTE(review): a new encoder is fitted on every call, so the integer code
  of a category is only comparable between two frames (e.g. train and test)
  if both contain the same set of values in that column -- verify, or fit
  the encoders once and reuse them.

  Args:
    df: pandas DataFrame containing the ``issue_d``, ``earliest_cr_line``,
      ``last_pymnt_d`` and ``last_credit_pull_d`` ``*_year`` / ``*_month``
      columns.

  Returns:
    A new DataFrame with categorical columns encoded, the two derived
    columns appended, and the year/month source columns removed.

  Raises:
    KeyError: if any of the required year/month columns is missing.
  """
  # Kept function-local (as in the original) so the definition is self-contained.
  from sklearn.preprocessing import LabelEncoder

  # Work on a copy: the column assignments below would otherwise write into
  # the caller's frame and make a second call on it fail or double-shift months.
  df = df.copy()

  # Convert categorical variables to numerical codes, one encoder per column.
  for column in df.select_dtypes(include=['object']).columns:
    df[column] = LabelEncoder().fit_transform(df[column])

  def month_start(prefix):
    """Build datetimes from the `<prefix>_year` / `<prefix>_month` column pair."""
    month = df[prefix + '_month'] + 1
    return pd.to_datetime(df[prefix + '_year'].astype(str) + '-' + month.astype(str))

  # Date-time features
  loan_start_date = month_start('issue_d')
  last_pymnt_date = month_start('last_pymnt_d')
  df['loan_age'] = (last_pymnt_date - loan_start_date).dt.days

  credit_start_date = month_start('earliest_cr_line')
  last_credit_pull_date = month_start('last_credit_pull_d')
  df['credit_history_length'] = (last_credit_pull_date - credit_start_date).dt.days

  # The raw year/month parts are fully represented by the two spans above.
  columnsToDrop = [
      'issue_d_year', 'issue_d_month',
      'earliest_cr_line_year', 'earliest_cr_line_month',
      'last_pymnt_d_month', 'last_pymnt_d_year',
      'last_credit_pull_d_month', 'last_credit_pull_d_year',
  ]
  return df.drop(columnsToDrop, axis=1)

In [20]:
# Replace the raw training frame with its encoded, feature-engineered version.
train = featureEnginerring(df=train)

### Model 1: LGBMClassifier

In [22]:
# Target column; every other column except the row identifier is a feature.
ycol = 'grade'
feature_names = [name for name in train.columns if name not in (ycol, 'ID')]

X = train[feature_names]
y = train[ycol]

In [24]:
# Hold out 20% of the rows as a test set, stratified on the target.
# `y` is used for stratification: no `df` exists at notebook scope, so the
# previous `stratify=df.grade` raised NameError on a fresh kernel.
train_data_all, test_data, train_y_all, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1, shuffle=True, stratify=y)
# Then split the remaining training rows again into training and validation sets.
train_data, val_data, train_y, val_y = train_test_split(
    train_data_all, train_y_all, test_size=0.2, random_state=1, shuffle=True,
    stratify=train_y_all)

In [25]:
# Multiclass LightGBM classifier with 10 estimators; the validation split is
# supplied to fit() as the evaluation set.
clf = lgb.LGBMClassifier(n_estimators=10, objective="multiclass")
clf.fit(
    X=train_data,
    y=train_y,
    eval_set=[(val_data, val_y)],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9365
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 78
[LightGBM] [Info] Start training from score -1.702719
[LightGBM] [Info] Start training from score -1.244361
[LightGBM] [Info] Start training from score -1.207625
[LightGBM] [Info] Start training from score -1.932100
[LightGBM] [Info] Start training from score -2.780117
[LightGBM] [Info] Start training from score -4.010463
[LightGBM] [Info] Start training from score -5.153135


In [26]:
from sklearn.metrics import classification_report

# Predict on the held-out test split and print the per-class report.
y_pred = clf.predict(test_data)
print(classification_report(y_true=test_y, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       364
           1       0.95      0.96      0.95       576
           2       0.96      0.95      0.96       598
           3       0.88      0.90      0.89       290
           4       0.73      0.77      0.75       124
           5       0.73      0.59      0.66        37
           6       0.50      0.64      0.56        11

    accuracy                           0.93      2000
   macro avg       0.82      0.83      0.82      2000
weighted avg       0.93      0.93      0.93      2000



In [36]:
# Save the fitted booster to model.txt, then load it back as a raw Booster.
# Note: the path is passed through the `model_file` argument when loading.
clf.booster_.save_model(filename="model.txt")
model_txt = lgb.Booster(model_file="model.txt")

In [77]:
# Load the test CSV, run it through the same feature pipeline as the training
# data, and keep the ID column as the starting point of the submission frame.
test = featureEnginerring(
    pd.read_csv('/content/drive/MyDrive/184702-tu-ml-ws-23-loan/loan-10k.tes.csv')
)
prediction = test[['ID']]

In [78]:
# Predict with the reloaded booster on every column except ID, then reduce the
# output along axis 1 with argmax to get one class index per row.
y_pred_txt = np.argmax(model_txt.predict(test.drop(columns='ID')), axis=1)

In [82]:
# Attach the predicted class ids next to the IDs (both frames share the default
# positional index, so the column-wise concat lines them up row by row).
prediction = pd.concat([prediction, pd.DataFrame(y_pred_txt, columns=['prediction'])], axis=1)

# Translate class ids back to letter grades. The classifier was trained on
# seven classes (ids 0-6), so the table must cover 'G' as well; without it
# every row predicted as class 6 ended up with a missing grade.
mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G'}
prediction['grade'] = prediction['prediction'].map(mapping)

# Guard against silently writing empty grades into the submission.
assert prediction['grade'].notna().all(), "unmapped class id in predictions"

In [83]:
# Remove the numeric class-id column, leaving the remaining columns untouched.
prediction.drop(columns=['prediction'], inplace=True)

In [84]:
# Display the submission frame for a visual check before writing it out.
prediction

Unnamed: 0,ID,grade
0,3582,B
1,60498,C
2,53227,B
3,21333,C
4,3885,F
...,...,...
9995,89555,C
9996,88135,B
9997,51888,C
9998,51380,D


In [54]:
# Write the submission file, omitting the DataFrame index.
prediction.to_csv(path_or_buf='sol.csv', index=False)