In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the training table and drop its 'Unnamed: 0' column
# (presumably an index saved into the CSV -- confirm).
train = pd.read_csv('train_with_profit_coeffs.csv').drop(columns='Unnamed: 0')

In [2]:
# Test data read from parquet.
# NOTE(review): `test` is rebound to a copy of `sixm` in a later cell.
test = pd.read_parquet('test_data.pqt')

# The start_cluster column for month 6 in this file was predicted with the model_for_predict_6month model
sixm = pd.read_parquet('test_df.pqt')

# Sample submission file; its columns are filled with predicted probabilities later on.
coeffs = pd.read_csv('sample_submission.csv')
# Cluster weights table, indexed by its "cluster" column.
cluster_weights = pd.read_excel('cluster_weights.xlsx').set_index("cluster")

In [40]:
import pickle
from typing import List, Dict, Any


class LEncoder:
    """Label encoder whose mapping can grow with labels unseen at fit time.

    Attributes:
        d: mapping ``label -> integer code``.
        end_point: next free integer code (always ``len(d)``).
    """

    def __init__(self) -> None:
        self.d: Dict[Any, int] = {}
        self.end_point: int = 0

    def fit_transform(self, df_col: pd.Series) -> List[int]:
        """Build the mapping from the sorted unique values of ``df_col`` and encode it.

        Any previously stored mapping is replaced.
        """
        # np.asarray makes this work for pandas extension arrays too, which
        # have no in-place ``.sort()``; plain ndarrays pass through unchanged.
        labels = np.sort(np.asarray(df_col.unique()))
        self.end_point = len(labels)
        self.d = dict(zip(labels, range(self.end_point)))
        return [self.d[el] for el in df_col]

    def add_transform(self, df_col: pd.Series) -> List[int]:
        """Encode ``df_col``, assigning fresh codes to labels not yet in the mapping.

        New labels are numbered in order of first appearance, so the result is
        reproducible between runs (iteration order of a ``set`` of strings is not).
        """
        if not self.d:
            print('No fitted data')
        # dict.fromkeys de-duplicates while keeping first-appearance order.
        unseen = [label for label in dict.fromkeys(df_col) if label not in self.d]
        if unseen:
            self.d.update(zip(unseen, range(self.end_point, self.end_point + len(unseen))))
            self.end_point += len(unseen)
        return [self.d[el] for el in df_col]

    def untransform(self, df_col: pd.Series) -> List[Any]:
        """Map integer codes back to labels; raises ``KeyError`` for unknown codes."""
        # The inverse mapping is rebuilt on every call because ``d`` may have grown.
        self.un_d = {v: k for k, v in self.d.items()}
        return [self.un_d[el] for el in df_col]

    def save_encoder(self, path: str) -> None:
        """Pickle the mapping dictionary to ``path``."""
        with open(path, 'wb') as fp:
            pickle.dump(self.d, fp, protocol=pickle.HIGHEST_PROTOCOL)

    def load_pickle(self, path: str) -> None:
        """Load a mapping previously written by ``save_encoder``.

        NOTE: ``pickle.load`` can execute arbitrary code; only load trusted files.
        """
        with open(path, 'rb') as fp:
            self.d = pickle.load(fp)
        self.end_point = len(self.d)

    def read_dict(self, d: Dict[Any, int]) -> None:
        """Use an existing ``label -> code`` dictionary as the mapping."""
        self.d = d
        self.end_point = len(d)

    def pre_dif_encoder(self) -> 'LEncoder':
        """Preset the mapping ``{-1: 0, 0: 1, 1: 2}`` and return ``self``."""
        self.d = {-1: 0, 0: 1, 1: 2}
        # Next free code must follow the last used one (2); a smaller value
        # would hand an already-used code to the next unseen label.
        self.end_point = len(self.d)
        return self

#### Кодирую категориальные признаки с помощью собственного LEncoder

In [42]:
from sklearn.preprocessing import LabelEncoder


def _fit_column_encoder(column: str) -> LEncoder:
    """Fit a fresh LEncoder on ``train[column]`` cast to str and store the codes in ``train[column + '_enc']``."""
    encoder = LEncoder()
    train[column + '_enc'] = encoder.fit_transform(train[column].astype(str))
    return encoder


# One encoder per categorical column, fitted in the order listed; the encoder
# objects are kept under their own names because later cells reuse them.
(
    channel_code_encoder,
    city_encoder,
    city_type_encoder,
    index_city_code_encoder,
    ogrn_month_encoder,
    ogrn_year_encoder,
    okved_encoder,
    segment_encoder,
    start_cluster_encoder,
    date_encoder,
) = (
    _fit_column_encoder(column)
    for column in (
        'channel_code',
        'city',
        'city_type',
        'index_city_code',
        'ogrn_month',
        'ogrn_year',
        'okved',
        'segment',
        'start_cluster',
        'date',
    )
)

### Отбираю столбцы

In [43]:
# Select the numeric features, profit_coeffs and the *_enc columns from train.
# The raw categorical columns, id and end_cluster are not part of this selection.
train_for_model = train[['balance_amt_avg', 'balance_amt_max', 'balance_amt_min', 'balance_amt_day_avg', 'ogrn_days_end_month', 'ogrn_days_end_quarter',
 'ft_registration_date', 'max_founderpres', 'min_founderpres', 'ogrn_exist_months', 'sum_of_paym_2m', 'sum_of_paym_6m', 'sum_of_paym_1y', 'sum_a_oper_1m',
 'cnt_a_oper_1m','sum_b_oper_1m','cnt_b_oper_1m','sum_c_oper_1m', 'cnt_c_oper_1m', 'sum_deb_d_oper_1m', 'cnt_deb_d_oper_1m','sum_cred_d_oper_1m', 'cnt_cred_d_oper_1m',
 'sum_deb_e_oper_1m', 'cnt_deb_e_oper_1m', 'cnt_days_deb_e_oper_1m', 'sum_cred_e_oper_1m', 'cnt_cred_e_oper_1m', 'cnt_days_cred_e_oper_1m', 'sum_deb_f_oper_1m',
 'cnt_deb_f_oper_1m', 'cnt_days_deb_f_oper_1m', 'sum_cred_f_oper_1m', 'cnt_cred_f_oper_1m','cnt_days_cred_f_oper_1m', 'sum_deb_g_oper_1m', 'cnt_deb_g_oper_1m', 'cnt_days_deb_g_oper_1m',
 'sum_cred_g_oper_1m', 'cnt_cred_g_oper_1m', 'cnt_days_cred_g_oper_1m', 'sum_deb_h_oper_1m', 'cnt_deb_h_oper_1m', 'cnt_days_deb_h_oper_1m', 'sum_cred_h_oper_1m', 'cnt_cred_h_oper_1m',
 'cnt_days_cred_h_oper_1m', 'sum_a_oper_3m', 'cnt_a_oper_3m', 'sum_b_oper_3m', 'cnt_b_oper_3m', 'sum_c_oper_3m', 'cnt_c_oper_3m', 'sum_deb_d_oper_3m', 'cnt_deb_d_oper_3m',
 'sum_cred_d_oper_3m', 'cnt_cred_d_oper_3m', 'sum_deb_e_oper_3m', 'cnt_deb_e_oper_3m', 'cnt_days_deb_e_oper_3m', 'sum_cred_e_oper_3m', 'cnt_cred_e_oper_3m', 'cnt_days_cred_e_oper_3m', 'sum_deb_f_oper_3m',
 'cnt_deb_f_oper_3m', 'cnt_days_deb_f_oper_3m', 'sum_cred_f_oper_3m', 'cnt_cred_f_oper_3m', 'cnt_days_cred_f_oper_3m', 'sum_deb_g_oper_3m', 'cnt_deb_g_oper_3m', 'cnt_days_deb_g_oper_3m', 'sum_cred_g_oper_3m', 'cnt_cred_g_oper_3m', 'cnt_days_cred_g_oper_3m',
 'sum_deb_h_oper_3m', 'cnt_deb_h_oper_3m', 'cnt_days_deb_h_oper_3m', 'sum_cred_h_oper_3m', 'cnt_cred_h_oper_3m', 'cnt_days_cred_h_oper_3m', 'profit_coeffs', 'channel_code_enc', 'city_enc',
 'city_type_enc', 'index_city_code_enc', 'ogrn_month_enc', 'ogrn_year_enc', 'okved_enc', 'segment_enc', 'start_cluster_enc', 'date_enc']]

### Объединяю суммы и количества (counts) операций по deb и cred, чтобы снизить размерность и сгенерировать новые фичи

In [446]:
def oper_union(df: pd.DataFrame) -> pd.DataFrame:
    """Add eight ``total_*`` columns that sum the d/e/f/g/h operation columns.

    For every combination of measure (``cnt``, ``sum``), direction (``deb``,
    ``cred``) and window (``3m``, ``1m``) the column
    ``total_<measure>_<direction>_<window>`` is the sum of
    ``<measure>_<direction>_<op>_oper_<window>`` over ``op`` in d, e, f, g, h.

    Args:
        df: frame containing all forty source columns.

    Returns:
        The same ``df`` object; the new columns are added in place.

    Raises:
        KeyError: if a source column is missing.
    """
    operation_types = ('d', 'e', 'f', 'g', 'h')
    for measure in ('cnt', 'sum'):
        for direction in ('deb', 'cred'):
            for window in ('3m', '1m'):
                parts = [
                    df[f'{measure}_{direction}_{op}_oper_{window}']
                    for op in operation_types
                ]
                # Plain left-to-right ``+`` (not DataFrame.sum) so that a NaN in
                # any component still yields NaN in the total.
                df[f'total_{measure}_{direction}_{window}'] = sum(parts[1:], parts[0])

    return df

In [None]:
# Add the aggregated total_* debit/credit columns to the selected training features.
train_for_model = oper_union(df=train_for_model)

In [359]:
# Keep only columns whose name contains 'total', 'balance' or 'enc'.
_feature_columns = train_for_model.columns
_keep_mask = (
    _feature_columns.str.contains('total')
    | _feature_columns.str.contains('balance')
    | _feature_columns.str.contains('enc')
)
train_new = train_for_model[_feature_columns[_keep_mask]]

In [49]:
# Encode train['start_cluster'] with the start_cluster encoder.
# NOTE(review): y_train is rebound by the train_test_split cell further down.
y_train = start_cluster_encoder.add_transform(train['start_cluster'])

In [360]:
# Encode start_cluster and date of `sixm` with the encoders fitted on train;
# add_transform gives new codes to labels the encoders have not seen yet.
# NOTE(review): the fit cell cast these columns with .astype(str), this cell
# does not -- fine if the columns already hold strings; confirm.
sixm['start_cluster_enc'] = start_cluster_encoder.add_transform(sixm['start_cluster'])
sixm['date_enc'] = date_encoder.add_transform(sixm['date'])

In [361]:
# From here on `test` is a copy of `sixm`, replacing the frame read from test_data.pqt.
test=  sixm.copy()

In [362]:
# Add the same aggregated total_* columns that were added to the training features.
test = oper_union(df=test)

In [416]:
# Rows of month_6 only, without the id column.
last_m_test_df = test[test["date"] == "month_6"].drop(columns=["id"])

In [417]:
# Encode the categorical columns of the month-6 test rows with the encoders
# fitted on train. Every column must go through its own encoder so that the
# integer codes mean the same thing as in the training data.
last_m_test_df['channel_code_enc'] = channel_code_encoder.add_transform(last_m_test_df['channel_code'].astype(str))

last_m_test_df['city_enc'] = city_encoder.add_transform(last_m_test_df['city'].astype(str))

last_m_test_df['city_type_enc'] = city_type_encoder.add_transform(last_m_test_df['city_type'].astype(str))

last_m_test_df['index_city_code_enc'] = index_city_code_encoder.add_transform(last_m_test_df['index_city_code'].astype(str))

last_m_test_df['ogrn_month_enc'] = ogrn_month_encoder.add_transform(last_m_test_df['ogrn_month'].astype(str))

last_m_test_df['ogrn_year_enc'] = ogrn_year_encoder.add_transform(last_m_test_df['ogrn_year'].astype(str))

last_m_test_df['okved_enc'] = okved_encoder.add_transform(last_m_test_df['okved'].astype(str))

last_m_test_df['segment_enc'] = segment_encoder.add_transform(last_m_test_df['segment'].astype(str))

# The date column is encoded with date_encoder. Using segment_encoder here
# produced codes unrelated to the training date_enc values and also added the
# date labels to the segment mapping.
last_m_test_df['date_enc'] = date_encoder.add_transform(last_m_test_df['date'].astype(str))

In [451]:
# NOTE(review): this assigns the *_enc columns to themselves, so it changes
# nothing; presumably a dtype cast was intended -- confirm or remove.
train_new[train_new.columns[train_new.columns.str.contains('enc')]] = train_new[train_new.columns[train_new.columns.str.contains('enc')]]

### Заполняю пропуски в других данных

In [449]:
# Fill gaps in the encoded columns with each column's most frequent value.
_encoded_columns = train_new.columns[train_new.columns.str.contains('enc')]
for col in _encoded_columns:
    _most_frequent = train_new[col].mode()[0]
    train_new[col] = train_new[col].fillna(_most_frequent)

In [450]:
# Fill gaps in all remaining (non-encoded) columns with the column mean.
_numeric_columns = train_new.columns[~train_new.columns.str.contains('enc')]
for col in _numeric_columns:
    _column_mean = np.mean(train_new[col])
    train_new[col] = train_new[col].fillna(_column_mean)

### Пробовала полиномиальные фичи, на валидации давали хороший результат, но на публичном датасете сильно снижали качество модели

In [448]:
# from sklearn.preprocessing import PolynomialFeatures
# # pf1 = PolynomialFeatures(degree = 2, include_bias = True)
# # train_new[['pf1_1', 'pf1_2', 'pf1_3', 'pf1_4', 'pf1_5', 'pf1_6']] = pf1.fit_transform(train_new[['segment_enc', 'balance_amt_avg']].values.reshape(-1,2))

# pf2 = PolynomialFeatures(degree = 2, include_bias = True)
# train_new[['pf2_1', 'pf2_2', 'pf2_3', 'pf2_4', 'pf2_5', 'pf2_6']] = pf2.fit_transform(train_new[['segment_enc', 'total_sum_cred_3m']].values.reshape(-1,2))

# pf3 = PolynomialFeatures(degree = 2, include_bias = True)
# train_new['pf3'] = pf3.fit_transform(train_new[['segment_enc', 'total_sum_deb_3m']].values.reshape(-1,2))

# pf4 = PolynomialFeatures(degree = 2, include_bias = True)
# train_new['pf4'] = pf4.fit_transform(train_new[['segment_enc', 'start_cluster_enc']].values.reshape(-1,2))

# pf5 = PolynomialFeatures(degree = 2, include_bias = True)
# train_new['pf5'] = pf5.fit_transform(train_new[['start_cluster_enc', 'total_sum_cred_3m']].values.reshape(-1,2))

# pf6 = PolynomialFeatures(degree = 2, include_bias = True)
# train_new['pf6'] = pf6.fit_transform(train_new[['start_cluster_enc', 'total_sum_deb_3m']].values.reshape(-1,2))

In [369]:
# Target: end_cluster encoded with the start_cluster encoder, so start and end
# clusters share one code space (labels not seen before get new codes).
_end_cluster_codes = start_cluster_encoder.add_transform(train['end_cluster'])

# 75/25 shuffled split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train_new,
    _end_cluster_codes,
    test_size=0.25,
    shuffle=True,
    random_state=104,
)

In [418]:
# Keep exactly the training feature columns, in the same order as X_train.
last_m_test_df = last_m_test_df[X_train.columns]

In [419]:
# Fill gaps in the encoded test columns with each column's most frequent value.
# NOTE(review): the mode is taken from last_m_test_df itself, not from the
# training data -- confirm this is intended.
for col in last_m_test_df.columns[last_m_test_df.columns.str.contains('enc')]:
    last_m_test_df[col] = last_m_test_df[col].fillna(last_m_test_df[col].mode()[0])

In [420]:
# Fill gaps in the remaining test columns with the column mean.
# NOTE(review): the mean is taken from last_m_test_df itself, not from the
# training data -- confirm this is intended.
for col in last_m_test_df.columns[~last_m_test_df.columns.str.contains('enc')]:
    last_m_test_df[col] = last_m_test_df[col].fillna(np.mean(last_m_test_df[col]))

In [106]:
X_train.columns   # final features that gave a good result

Index(['balance_amt_avg', 'balance_amt_max', 'balance_amt_min',
       'balance_amt_day_avg', 'channel_code_enc', 'city_enc', 'city_type_enc',
       'index_city_code_enc', 'ogrn_month_enc', 'ogrn_year_enc', 'okved_enc',
       'segment_enc', 'start_cluster_enc', 'date_enc', 'total_cnt_deb_3m',
       'total_cnt_deb_1m', 'total_cnt_cred_3m', 'total_cnt_cred_1m',
       'total_sum_deb_3m', 'total_sum_deb_1m', 'total_sum_cred_3m',
       'total_sum_cred_1m'],
      dtype='object')

## RF

In [259]:
# These imports were commented out, which made this cell (and the model cell
# below) fail with NameError on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import optuna


def objective_rfc(trial):
    """Optuna objective: mean 5-fold one-vs-rest ROC AUC of a random forest.

    The forest is evaluated on the first 200000 rows of ``X_train`` /
    ``y_train`` with the ``city_type_enc`` and ``date_enc`` columns removed.
    NOTE(review): the final model cell drops ``segment_enc`` instead --
    confirm which feature set is intended.

    Args:
        trial: optuna trial used to sample the hyper-parameters.

    Returns:
        Mean cross-validated ``roc_auc_ovr`` score (higher is better).
    """
    max_depth = trial.suggest_int("max_depth", 2, 20)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy", 'log_loss'])
    class_weight = trial.suggest_categorical("class_weight", ['balanced', 'balanced_subsample'])
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    bootstrap = trial.suggest_categorical("bootstrap", [True,False])

    score = cross_val_score(RandomForestClassifier(max_depth=max_depth, criterion=criterion, n_estimators=n_estimators,
                                                class_weight = class_weight, bootstrap = bootstrap ),
                            X_train.iloc[:200000,:].drop(['city_type_enc', 'date_enc'], axis = 1), y_train[:200000], cv=5, scoring='roc_auc_ovr', n_jobs=-1).mean()
    return score


# Maximise the cross-validated score over 10 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective_rfc, n_trials=10)

### Лучшие параметры, подобранные optuna, скопированы в словарь вручную, т. к. выполнение кода было остановлено, а лучший результат получен на 5-м trial

In [439]:
# Hyper-parameters copied by hand from the optuna run.
best_params =  {'max_depth': 13, 'criterion': 'log_loss', 'class_weight': 'balanced_subsample', 'n_estimators': 685, 'bootstrap': True}
model_ = RandomForestClassifier(**best_params)
# Fit on the first 200000 training rows, without the segment_enc feature.
# NOTE(review): no random_state is passed, so refits are presumably not
# reproducible bit-for-bit -- confirm whether that matters.
model_.fit(X_train.drop('segment_enc', axis = 1).iloc[:200000, :], y_train[:200000])

In [241]:
# Persist the fitted forest. pickle.dump returns None, so its result is not
# bound to a name (the original bound None to `model`).
with open('alpha_rfc_model3.pkl', 'wb') as file:
    pickle.dump(model_, file)

In [440]:
# Class probabilities on the validation split (same feature set as used for fitting).
preds = model_.predict_proba(X_val.drop('segment_enc', axis = 1))

In [128]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted average of per-class one-vs-rest ROC AUC scores.

    Args:
        y_true: true class labels.
        y_pred: predicted scores, passed to ``roc_auc_score`` together with ``labels``.
        labels: class labels; each one must be a key of ``weights_dict``.
        weights_dict: mapping ``label -> unnormalised weight``.

    Returns:
        Sum of the per-class AUCs multiplied by the weights normalised to sum to 1.

    Raises:
        KeyError: if a label has no entry in ``weights_dict``.
    """
    raw_weights = np.array([weights_dict[name] for name in labels])
    normalised = raw_weights / raw_weights.sum()
    # average=None makes roc_auc_score return one score per class.
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    return sum(normalised * per_class_auc)

In [399]:
# Mapping from the index of cluster_weights to its "unnorm_weight" value.
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [441]:
# Weighted ROC AUC of the forest on the validation split. Codes are decoded back
# to the original labels before scoring (assumes those labels are the keys of
# weights_dict -- confirm).
weighted_roc_auc(start_cluster_encoder.untransform(y_val), preds,  start_cluster_encoder.untransform(model_.classes_), weights_dict)

0.9012590467263722

In [442]:
# Class probabilities for the month-6 test rows (same feature set as used for fitting).
test_pred_proba = model_.predict_proba(last_m_test_df.drop('segment_enc', axis = 1))

In [423]:
# Importance of every feature the forest was fitted on, highest first.
# The names come from the fitted model itself: it was trained without
# segment_enc, so labelling with X_train.columns (one name too many) fails
# with a length mismatch.
feature_importance = pd.DataFrame({
    'feature_importance': model_.feature_importances_,
    'feature': model_.feature_names_in_,
})
feature_importance.sort_values('feature_importance', ascending = False)

In [443]:
# Label the probability columns with the decoded class names, then order the
# columns alphabetically.
_class_labels = start_cluster_encoder.untransform(model_.classes_)
sorted_classes = sorted(_class_labels)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=_class_labels)[sorted_classes]

In [444]:
# Write the per-class probabilities into the submission frame and save it.
# NOTE(review): assumes the rows of test_pred_proba_df are in the same order
# (and share the same index) as the rows of coeffs -- TODO confirm.
coeffs[sorted_classes] = test_pred_proba_df
coeffs.to_csv('kek.csv', index = False)

## NB

Для наивного Байеса также использовались указанные выше фичи. Планировалось использовать предикты модели<br>
в качестве фичи для основной модели, однако roc_auc_weighted оказался слишком низким.

In [210]:
# Same 75/25 split (same seed) as used for the random forest: features from
# train_new, target = end_cluster encoded with the start_cluster encoder.
X_train, X_val, y_train, y_val = train_test_split(train_new,start_cluster_encoder.add_transform(train['end_cluster']) , 
                                   random_state=104,
                                   test_size=0.25,  
                                   shuffle=True)

In [211]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on all training features.
clf = GaussianNB()
clf.fit(X_train, y_train)

In [212]:
# Class probabilities of the naive Bayes model on the validation split.
preds = clf.predict_proba(X_val)

In [213]:
# Weighted ROC AUC of the naive Bayes model on the validation split.
weighted_roc_auc(start_cluster_encoder.untransform(y_val), preds,  start_cluster_encoder.untransform(clf.classes_), weights_dict)

0.7419366997701057

In [216]:
# List every column of the training frame.
train.columns.tolist()

['id',
 'date',
 'balance_amt_avg',
 'balance_amt_max',
 'balance_amt_min',
 'balance_amt_day_avg',
 'channel_code',
 'city',
 'city_type',
 'index_city_code',
 'ogrn_days_end_month',
 'ogrn_days_end_quarter',
 'ogrn_month',
 'ogrn_year',
 'ft_registration_date',
 'max_founderpres',
 'min_founderpres',
 'ogrn_exist_months',
 'okved',
 'segment',
 'sum_of_paym_2m',
 'sum_of_paym_6m',
 'sum_of_paym_1y',
 'sum_a_oper_1m',
 'cnt_a_oper_1m',
 'sum_b_oper_1m',
 'cnt_b_oper_1m',
 'sum_c_oper_1m',
 'cnt_c_oper_1m',
 'sum_deb_d_oper_1m',
 'cnt_deb_d_oper_1m',
 'sum_cred_d_oper_1m',
 'cnt_cred_d_oper_1m',
 'sum_deb_e_oper_1m',
 'cnt_deb_e_oper_1m',
 'cnt_days_deb_e_oper_1m',
 'sum_cred_e_oper_1m',
 'cnt_cred_e_oper_1m',
 'cnt_days_cred_e_oper_1m',
 'sum_deb_f_oper_1m',
 'cnt_deb_f_oper_1m',
 'cnt_days_deb_f_oper_1m',
 'sum_cred_f_oper_1m',
 'cnt_cred_f_oper_1m',
 'cnt_days_cred_f_oper_1m',
 'sum_deb_g_oper_1m',
 'cnt_deb_g_oper_1m',
 'cnt_days_deb_g_oper_1m',
 'sum_cred_g_oper_1m',
 'cnt_cred_g_

In [191]:
# Peek at the first ten target values.
train[['end_cluster']].head(10)

Unnamed: 0,end_cluster
0,{other}
1,{other}
2,{other}
3,{other}
4,{other}
5,{other}
6,{α}
7,{α}
8,{α}
9,{α}


### Были попытки отдельно предсказать классы с помощью KNN и использовать как фичу, но score недостаточно высокий

In [370]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with 3 neighbours; probabilities on the validation split.
knn = KNeighborsClassifier(3)
knn.fit(X_train, y_train)
preds = knn.predict_proba(X_val)

In [371]:
# Weighted ROC AUC of the 3-NN model on the validation split.
weighted_roc_auc(start_cluster_encoder.untransform(y_val), preds,  start_cluster_encoder.untransform(knn.classes_), weights_dict)

0.8708386663007717

In [379]:
from sklearn.metrics import f1_score

In [385]:
# Try k = 2..9 neighbours and print the per-class F1 score on the validation split.
for k in range(2, 10):
    # fit() returns the estimator itself, so clf is the fitted classifier.
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    prediction = clf.predict(X_val)

    print('k:', k)
    print('f1:', f1_score(y_val, prediction, average = None))

k: 2
f1: [0.77164535 0.75007173 0.72270783 0.78692764 0.55460385 0.7535545
 0.74115456 0.73       0.71439689 0.82033863 0.76708779 0.75107296
 0.66796875 0.75       0.76368491 0.87108875 0.63492063]



KeyboardInterrupt



### Также применялась Мультиномиальная модель скрытых состояний Маркова, все float признаки кодировались с помощью sklearn.preprocessing.KBinsDiscretizer

In [None]:
# Multinomial hidden Markov model with 17 components, fitted on the first 400000 training rows.
# NOTE(review): `hmm` and `temp` are not defined anywhere in the visible
# notebook -- presumably `from hmmlearn import hmm` and a frame holding start
# probabilities in its 'probs' column; confirm before running.
model = hmm.MultinomialHMM(n_components = 17, n_iter =10, startprob_prior = temp['probs'].values)
model.fit(X_train.iloc[:400000,:])