In [107]:
import pandas as pd

In [108]:
import numpy as np
import sklearn.base as skbase
import sklearn.impute as skimp
import sklearn.compose as skcom
import sklearn.preprocessing as skpre
import sklearn.pipeline as skpip

#근로기간 전처리 -> ordinal
class make_ord(skbase.TransformerMixin,skbase.BaseEstimator):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
    self.columns_names=list(X.columns)
    return self
  def get_feature_names_out(self,input_features=None):
    return self.columns_names

  def transform(self,X,y=None):
    def trans(data):
      if data=="Unknown":
        return np.nan

      # 여기가 까다로운 부분
      if type(data)==str:
        condi_1="<" in data
        condi_2="1" in data
        condi_3="year" in data
        if all([condi_1,condi_2,condi_3]):
          return 0

        condi_4="+" in data
        condi_5="10" in data
        if all([condi_4,condi_5]):
          return 10
        else:
          result=[]
          for letter in data:
            if letter.isnumeric():
              result.append(letter)
          return int("".join(result))
    return X.applymap(trans)

#대출기간 -> numeric
class make_num(skbase.BaseEstimator,skbase.TransformerMixin):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
    try:
      self.columns=X.columns
    except:
      pass
    return self
  def get_feature_names_out(self,input_features=None):
    return self.columns
  def transform(self,X,y=None):
    def trans(data):
      temp=[]
      for letter in data:
        if letter.isdigit():
          temp.append(letter)
      return float("".join(temp))
    return list(map(trans,X))

class passthrough(skbase.BaseEstimator,skbase.TransformerMixin):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
      self.columns=X.columns
      return self
  def get_feature_names_out(self):
    return self.columns
  def transform(self,X,y=None):
    return X

class preprocess(skbase.BaseEstimator,skbase.TransformerMixin):
  def __init__(self) -> None:
    super().__init__()
  def fit(self,X,y=None):
    self.numeric_columns=X.select_dtypes(np.number).columns
    self.cate_columns=X.select_dtypes("object").columns.difference(["대출등급",'ID',"근로기간","대출기간"])
    self.target_column="대출등급"

    numeric_pipe=skpip.make_pipeline(skimp.SimpleImputer(strategy="mean"),skpre.StandardScaler())
    cate_pipe=skpip.make_pipeline(skimp.SimpleImputer(strategy="most_frequent"),skpre.OneHotEncoder(handle_unknown="ignore"))
    ord_pipe=skpip.make_pipeline(make_ord(),skimp.SimpleImputer(strategy="most_frequent"))
    #cate_to_num_pip=skpip.make_pipeline(skimp.SimpleImputer(strategy="most_frequent"),make_num(),skpre.StandardScaler())

    model=skcom.make_column_transformer((numeric_pipe,self.numeric_columns),
                     (cate_pipe,self.cate_columns,),
                     (passthrough(),["대출등급"]),
                     (ord_pipe,["근로기간"]),
                     #(cate_to_num_pip,["대출기간"]),
                     remainder="drop")
    model.fit(X,y=None)
    self.model=model
    return self.model
  def transform(self,X,y=None):
    return self.model.transform(X)

  def get_feature_names_out(self):
    result=[]
    model=self.model
    for step in model.named_transformers_:
      try:
        temp=model.named_transformers_[step].get_feature_names_out()
        result.extend(temp)
      except:
        continue
    return result

## 모델링

- multi classifier

In [109]:
train=pd.read_csv('/Users/ijeonghun/Documents/projects/dacon/고객 대출등급 분류 해커톤/train.csv')

In [110]:
import sklearn.multiclass as skmulc
import sklearn.multioutput as skmult
import sklearn.svm as sksvm

### 고려사항

- 종속변수의 데이터 불균형 
- > one vs one + UnderSampling
- > one vs one + class weight

In [111]:
process_model=preprocess()
process_model.fit(train)
use_train=pd.DataFrame(data=process_model.transform(train),columns=process_model.get_feature_names_out())

  return X.applymap(trans)
  return X.applymap(trans)


In [112]:
num_=[]
for x in use_train["대출등급"].unique():
    num_.append({"Class":x,"Num":len(use_train[use_train["대출등급"]==x])})
df_=pd.DataFrame(num_)
df_.sort_values("Class")

Unnamed: 0,Class,Num
2,A,16772
1,B,28817
0,C,27623
3,D,13354
6,E,7354
4,F,1954
5,G,420


#### 1. 엔트로피 가중치

In [113]:
class ClassWeight(skbase.BaseEstimator,skbase.TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
    def fit(self,X:pd.DataFrame,y:str):
        class_counts=np.array(X[y].value_counts())
        def calculate_entropy_weight(class_counts):
            total_samples = np.sum(class_counts)
            class_probabilities = class_counts / total_samples
            class_entropy = -np.sum(class_probabilities * np.log2(class_probabilities + 1e-10))
            entropy_weight = 1 / class_entropy
            return class_probabilities * entropy_weight
        class_weights=calculate_entropy_weight(class_counts)
        self.entropy_weight=dict(zip(X[y].unique(),class_weights))
        return self
    def transform(self,X:pd.DataFrame,y:str):
        return self.entropy_weight

In [114]:
entropy_=ClassWeight()
entropy_.fit(use_train,"대출등급")
weights=entropy_.transform(use_train,"대출등급")

In [115]:
weights

{'C': 0.12989897617229895,
 'B': 0.1245167581221992,
 'A': 0.07560348503875484,
 'D': 0.06019609701929001,
 'F': 0.03314977515949219,
 'G': 0.008808085485674156,
 'E': 0.0018932425301858475}

In [116]:
import itertools as it
colab_target=list(it.combinations(use_train["대출등급"].unique(),2))
import imblearn as im
UnderSample=im.under_sampling.RandomUnderSampler(random_state=10,)

## 모델 만들기

- svm

In [117]:
import sklearn.model_selection as skmod
myclassifier=sksvm.SVC(class_weight=weights,decision_function_shape="ovo",random_state=10,)
rs_=skmod.GridSearchCV(estimator=myclassifier,param_grid={'kernel':["rbf","linear","sigmoid"]},cv=3,scoring="f1_macro",n_jobs=-1,return_train_score=True)

In [118]:
result_history=rs_.fit(X=use_train[use_train.columns.difference(["대출등급"])],y=use_train["대출등급"])

KeyboardInterrupt: 

In [None]:
result_history.best_estimator_

In [None]:
result_history.best_score_

0.49443371999569186

In [None]:
pd.DataFrame(result_history.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,997.451877,2.411632,95.141827,0.220227,rbf,{'kernel': 'rbf'},0.443689,0.44805,0.438444,0.439016,...,0.442094,0.003507,2,0.443604,0.441617,0.442877,0.443071,0.442365,0.442707,0.000674
1,767.518436,4.676729,43.425761,0.608733,linear,{'kernel': 'linear'},0.498416,0.498209,0.489226,0.491251,...,0.494434,0.003681,1,0.497086,0.494814,0.494334,0.493957,0.494535,0.494945,0.001106
2,532.032266,154.915005,44.70552,8.645544,sigmoid,{'kernel': 'sigmoid'},0.290617,0.299341,0.293006,0.294771,...,0.290402,0.008554,3,0.291634,0.301603,0.289284,0.287882,0.27025,0.288131,0.010148


In [None]:
import sklearn.metrics as skmet
import sklearn.dummy as skdum
test_model=skdum.DummyClassifier(strategy="most_frequent")
test_model.fit(use_train[use_train.columns.difference(["대출등급"])],y=use_train["대출등급"])

In [None]:
dum_predict=test_model.predict(use_train["대출등급"])
skmet.f1_score(y_pred=dum_predict,y_true=use_train["대출등급"],average="macro")

0.0658089901881415