In [13]:
import pandas as pd

In [14]:
import numpy as np
import sklearn.base as skbase
import sklearn.impute as skimp
import sklearn.compose as skcom
import sklearn.preprocessing as skpre
import sklearn.pipeline as skpip

#근로기간 전처리 -> ordinal
class make_ord(skbase.TransformerMixin,skbase.BaseEstimator):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
    self.columns_names=list(X.columns)
    return self
  def get_feature_names_out(self,input_features=None):
    return self.columns_names

  def transform(self,X,y=None):
    def trans(data):
      if data=="Unknown":
        return np.nan

      # 여기가 까다로운 부분
      if type(data)==str:
        condi_1="<" in data
        condi_2="1" in data
        condi_3="year" in data
        if all([condi_1,condi_2,condi_3]):
          return 0

        condi_4="+" in data
        condi_5="10" in data
        if all([condi_4,condi_5]):
          return 10
        else:
          result=[]
          for letter in data:
            if letter.isnumeric():
              result.append(letter)
          return int("".join(result))
    return X.applymap(trans)

#대출기간 -> numeric
class make_num(skbase.BaseEstimator,skbase.TransformerMixin):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
    try:
      self.columns=X.columns
    except:
      pass
    return self
  def get_feature_names_out(self,input_features=None):
    return self.columns
  def transform(self,X,y=None):
    def trans(data):
      temp=[]
      for letter in data:
        if letter.isdigit():
          temp.append(letter)
      return float("".join(temp))
    return list(map(trans,X))

class passthrough(skbase.BaseEstimator,skbase.TransformerMixin):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
      self.columns=X.columns
      return self
  def get_feature_names_out(self):
    return self.columns
  def transform(self,X,y=None):
    return X

class preprocess(skbase.BaseEstimator,skbase.TransformerMixin):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
    self.numeric_columns=X.select_dtypes(np.number).columns
    self.cate_columns=X.select_dtypes("object").columns.difference(["대출등급",'ID',"근로기간","대출기간"])
    self.target_column="대출등급"

    numeric_pipe=skpip.make_pipeline(skimp.SimpleImputer(strategy="mean"),skpre.StandardScaler())
    cate_pipe=skpip.make_pipeline(skimp.SimpleImputer(strategy="most_frequent"),skpre.OneHotEncoder(handle_unknown="ignore",drop="first"))
    ord_pipe=skpip.make_pipeline(make_ord(),skimp.SimpleImputer(strategy="most_frequent"))
    #cate_to_num_pip=skpip.make_pipeline(skimp.SimpleImputer(strategy="most_frequent"),make_num(),skpre.StandardScaler())

    model=skcom.make_column_transformer((numeric_pipe,self.numeric_columns),
                     (cate_pipe,self.cate_columns,),
                     (passthrough(),["대출등급"]),
                     (ord_pipe,["근로기간"]),
                     #(cate_to_num_pip,["대출기간"]),
                     remainder="drop")
    model.fit(X,y=None)
    self.model=model
    return self.model
  def transform(self,X,y=None):
    return self.model.transform(X)

  def get_feature_names_out(self):
    result=[]
    model=self.model
    for step in model.named_transformers_:
      try:
        temp=model.named_transformers_[step].get_feature_names_out()
        result.extend(temp)
      except:
        continue
    return result

## 모델링

- multi classifier

In [15]:
train=pd.read_csv('/Users/ijeonghun/Documents/projects/dacon/고객 대출등급 분류 해커톤/train.csv')

In [16]:
import sklearn.multiclass as skmulc
import sklearn.multioutput as skmult
import sklearn.svm as sksvm

### 고려사항

- 종속변수의 데이터 불균형 
- > one vs one + UnderSampling
- > one vs one + class weight

In [17]:
process_model=preprocess()
process_model.fit(train)
use_train=pd.DataFrame(data=process_model.transform(train),columns=process_model.get_feature_names_out())

  return X.applymap(trans)
  return X.applymap(trans)


In [18]:
num_=[]
for x in use_train["대출등급"].unique():
    num_.append({"Class":x,"Num":len(use_train[use_train["대출등급"]==x])})
df_=pd.DataFrame(num_)
df_.sort_values("Class")

Unnamed: 0,Class,Num
2,A,16772
1,B,28817
0,C,27623
3,D,13354
6,E,7354
4,F,1954
5,G,420


#### 1. 엔트로피 가중치

In [19]:
class ClassWeight(skbase.BaseEstimator,skbase.TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
    def fit(self,X:pd.DataFrame,y:str):
        class_counts=np.array(X[y].value_counts())
        def calculate_entropy_weight(class_counts):
            total_samples = np.sum(class_counts)
            class_probabilities = class_counts / total_samples
            class_entropy = -np.sum(class_probabilities * np.log2(class_probabilities + 1e-10))
            entropy_weight = 1 / class_entropy
            return class_probabilities * entropy_weight
        class_weights=calculate_entropy_weight(class_counts)
        self.entropy_weight=dict(zip(X[y].unique(),class_weights))
        return self
    def transform(self,X:pd.DataFrame,y:str):
        return self.entropy_weight

In [20]:
entropy_=ClassWeight()
entropy_.fit(use_train,"대출등급")
weights=entropy_.transform(use_train,"대출등급")

In [21]:
weights

{'C': 0.12989897617229895,
 'B': 0.1245167581221992,
 'A': 0.07560348503875484,
 'D': 0.06019609701929001,
 'F': 0.03314977515949219,
 'G': 0.008808085485674156,
 'E': 0.0018932425301858475}

In [22]:
import itertools as it
colab_target=list(it.combinations(use_train["대출등급"].unique(),2))
import imblearn as im
UnderSample=im.under_sampling.RandomUnderSampler(random_state=10,)

## 모델 만들기

- svm

In [23]:
import sklearn.model_selection as skmod
myclassifier=sksvm.SVC(class_weight=weights,decision_function_shape="ovo",random_state=10,)
rs_=skmod.GridSearchCV(estimator=myclassifier,param_grid={'kernel':["rbf","linear","sigmoid"]},cv=3,scoring="f1_macro",n_jobs=-1,return_train_score=True,verbose=True)

In [24]:
result_history=rs_.fit(X=use_train[use_train.columns.difference(["대출등급"])],y=use_train["대출등급"])

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [25]:
result_history.best_estimator_

In [26]:
result_history.best_score_

0.2919130455000067

In [27]:
pd.DataFrame(result_history.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,479.782684,1.931163,104.789781,0.110134,rbf,{'kernel': 'rbf'},0.178901,0.174916,0.176913,0.17691,0.001627,2,0.178231,0.176974,0.175918,0.177041,0.000945
1,348.422523,5.506389,46.17997,0.258435,linear,{'kernel': 'linear'},0.296643,0.289477,0.289619,0.291913,0.003345,1,0.295143,0.293781,0.287319,0.292081,0.003413
2,431.806182,15.384279,58.919047,2.753265,sigmoid,{'kernel': 'sigmoid'},0.132857,0.16823,0.124702,0.14193,0.018893,3,0.135796,0.166572,0.121761,0.141376,0.018715


In [28]:
import sklearn.metrics as skmet
import sklearn.dummy as skdum
test_model=skdum.DummyClassifier(strategy="most_frequent")
test_model.fit(use_train[use_train.columns.difference(["대출등급"])],y=use_train["대출등급"])

In [29]:
dum_predict=test_model.predict(use_train["대출등급"])
skmet.f1_score(y_pred=dum_predict,y_true=use_train["대출등급"],average="macro")

0.0658089901881415

In [30]:
use_train

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출목적_부채 통합,...,대출목적_재생 에너지,대출목적_주요 구매,대출목적_주택,대출목적_주택 개선,대출목적_휴가,주택소유상태_MORTGAGE,주택소유상태_OWN,주택소유상태_RENT,대출등급,근로기간
0,-0.563848,-0.220218,-0.014287,-0.852449,-0.376102,-0.800303,-0.972784,-0.038438,-0.072595,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,C,6.0
1,-0.377964,0.370332,0.08789,-0.356109,-0.376102,-0.436814,-0.441082,-0.038438,-0.072595,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,B,10.0
2,-0.610319,0.020823,-0.321114,-0.935172,-0.376102,0.103276,-0.627621,-0.038438,-0.072595,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,A,5.0
3,-0.377964,0.382384,-0.127783,-0.852449,-0.376102,-0.483274,-0.624977,-0.038438,-0.072595,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C,8.0
4,-0.029431,-0.22287,0.179044,-0.521556,-0.376102,-0.577932,-0.634409,-0.038438,-0.072595,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,B,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,-0.377964,1.165767,-0.299368,0.63657,-0.376102,0.147972,0.145249,-0.038438,-0.072595,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C,10.0
96290,1.016166,0.382384,-0.423588,-0.025216,-0.376102,-0.232331,0.969667,-0.038438,-0.072595,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,E,10.0
96291,-0.377964,-0.099698,-0.242471,-0.273386,-0.376102,0.648632,-0.424781,-0.038438,-0.072595,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,A,1.0
96292,-0.261786,-0.277164,-0.061949,-0.356109,1.799906,0.540861,0.885597,-0.038438,-0.072595,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,D,5.0


In [32]:
use_model=result_history.best_estimator_

In [33]:
use_model.predict(use_train[use_train.columns.difference(["대출등급"])])