In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm

In [2]:
# Load the training table from 'train.csv' (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [None]:
# Load the test table, then discard the 'Unnamed: 0' and 'app_id' columns
# so that only the remaining columns are passed on as features.
df1 = pd.read_csv('test.csv')
df1 = df1.drop(columns=['Unnamed: 0', 'app_id'])

In [None]:
# Columns that are set to a constant 0 on the test frame.
out_of_test = [
    'PAYMENT_SYSTEM amnt max 6',
    'PAYMENT_SYSTEM amnt min 6',
    'PAYMENT_SYSTEM amnt sum 6',
    'PAYMENT_SYSTEM amnt var 6',
    'PAYMENT_SYSTEM amnt mean 6',
]
# One assign call adds (or overwrites) every listed column with 0.
df1 = df1.assign(**dict.fromkeys(out_of_test, 0))

In [None]:
def cross_val_search(x, y, val_count):
    """Sum CatBoost feature importances over several differently seeded fits.

    A CatBoostClassifier is fitted on the full (x, y) once per seed in
    range(val_count) and its ``feature_importances_`` are added up per column.

    Parameters:
        x: DataFrame of features; its columns become the keys of the result.
        y: target values passed to ``fit``.
        val_count: number of fits (seeds 0 .. val_count - 1).

    Returns:
        dict mapping each column of ``x`` to its summed importance
        (all zeros when ``val_count`` is 0).
    """
    column_names = list(x.columns)
    totals = [0] * len(column_names)

    for seed in tqdm(range(val_count)):
        classifier = CatBoostClassifier(random_seed = seed, silent = True)
        classifier.fit(x, y)
        scores = classifier.feature_importances_
        # importances are positional: the k-th score belongs to the k-th column
        totals = [running + score for running, score in zip(totals, scores)]

    return dict(zip(column_names, totals))
            
        

In [None]:
def get_names(columns):
    """Return the distinct feature-group names found in ``columns``.

    The group name of a column is the text before its first space
    (the whole name when it contains no space).

    Names are returned in order of first appearance. Building the result from
    a ``set`` made the order depend on string hashing, so the group order (and
    therefore the column order of the engineered features) could change from
    one interpreter run to the next.

    Parameters:
        columns: iterable of column-name strings.

    Returns:
        list of unique group names, first occurrence first.
    """
    # dict keys keep insertion order, giving a deterministic de-duplication
    return list(dict.fromkeys(column.split(' ')[0] for column in columns))

def get_df(df, name):
    """Select the columns of ``df`` that belong to the feature group ``name``.

    A column belongs to a group when the text before its first space equals
    ``name`` -- the same rule ``get_names`` uses to derive group names. A plain
    substring test would also pick up columns of other groups whose names
    merely contain ``name`` (e.g. group 'MCC' matching 'MCC_CATEGORY ...').

    Parameters:
        df: DataFrame with string column names.
        name: group name to select.

    Returns:
        DataFrame with the matching columns in their original order
        (zero columns when nothing matches).
    """
    columns = [column for column in df.columns if column.split(' ')[0] == name]
    return df[columns]

In [None]:
def pca_exchange(df, name, desperce):
    """Fit a PCA on ``df`` and return the projected data with the fitted PCA.

    Parameters:
        df: feature columns of one group (rows are samples).
        name: group name, used as prefix of the output columns
            ('<name> 0', '<name> 1', ...).
        desperce: value forwarded to ``PCA(n_components=...)`` with
            ``svd_solver='full'``.

    Returns:
        (components, pca): a DataFrame holding the transformed data and the
        fitted PCA object.
    """
    pca = PCA(n_components = desperce, svd_solver ='full')
    transformed = pca.fit_transform(df)
    columns = ['{} {}'.format(name, i) for i in range(transformed.shape[1])]
    # Keep the row labels of the input: callers join the result back onto the
    # source frame by index, which only works if the labels are preserved.
    # Inputs without an index (e.g. plain arrays) fall back to a RangeIndex.
    row_labels = getattr(df, 'index', None)
    return pd.DataFrame(columns = columns, data = transformed, index = row_labels), pca

In [None]:
def test_pca(df, name, pca):
    transformed = pca.transform(df)
    columns = ['{} {}'.format(name, i) for i in range(transformed.shape[1])]
    return pd.DataFrame(columns = columns, data = transformed)

In [None]:
def prepare_train(x, y, val_count = 20, desperce = .99, importance_threshold = 0.1):
    """Build the training feature matrix: important raw features + per-group PCA.

    Steps:
      1. Missing values are replaced with 0.
      2. ``cross_val_search`` sums feature importances over ``val_count`` fits;
         columns whose summed importance exceeds
         ``importance_threshold * val_count`` are kept unchanged ("safe").
      3. All columns are standardized with a StandardScaler.
      4. For every feature group (see ``get_names`` / ``get_df``) a PCA is
         fitted on the standardized columns and its components are appended.

    Parameters:
        x: DataFrame of raw features.
        y: target values.
        val_count: number of importance-estimation fits.
        desperce: forwarded to ``pca_exchange`` (PCA ``n_components``).
        importance_threshold: per-fit importance a column must exceed on
            average to be kept as a raw feature (default 0.1, the value that
            used to be hard-coded).

    Returns:
        (features, scaler, pca_save, safe):
            features - DataFrame indexed like ``x`` (index named 'key') with
                the safe raw columns followed by the PCA components;
            scaler   - the fitted StandardScaler;
            pca_save - dict mapping group name to its fitted PCA;
            safe     - Index of the column names kept unchanged.
    """
    x = x.fillna(0)

    importances = cross_val_search(x, y, val_count)

    # pick the important features that are kept as they are
    totals = pd.Series(importances)
    safe = totals[totals > importance_threshold * val_count].index

    features = x[safe].copy() # keep the important features (unscaled)
    features.index.name = 'key'

    scaler = StandardScaler()
    # standardize the features
    scaled = pd.DataFrame(data = scaler.fit_transform(x), index = x.index, columns = x.columns)

    pca_save = {} # dictionary holding the fitted PCA of every feature group,
    # so that no separate variable is needed per group

    blocks = [features]
    for name in tqdm(get_names(scaled.columns)):
        cols = get_df(scaled, name)
        # get the transformed feature set and keep the PCA for later use
        new_features, pca_save[name] = pca_exchange(cols, name, desperce)
        # Rows of the PCA output are in the same order as the rows of x, so
        # label them explicitly instead of relying on an index join: joining a
        # fresh 0..n-1 index against a non-default index of x would silently
        # drop or mismatch rows.
        new_features.index = features.index
        blocks.append(new_features)

    return pd.concat(blocks, axis = 1), scaler, pca_save, safe

In [None]:
def prepare_test(x, scaler, pca_list, safe):
    """Apply the transformations fitted by ``prepare_train`` to new data.

    Parameters:
        x: DataFrame of raw features.
        scaler: fitted scaler returned by ``prepare_train``.
        pca_list: dict mapping feature-group name to its fitted PCA
            (``pca_save`` from ``prepare_train``).
        safe: column names to carry over unchanged.

    Returns:
        DataFrame indexed like ``x`` (index named 'key') with the safe raw
        columns followed by the PCA components of every group, groups in the
        order they appear in ``pca_list``.
    """
    x = x.fillna(0)

    # The scaler and the PCAs are positional: the columns must be exactly the
    # training columns, in the training order. When the scaler recorded the
    # training column names, align to them -- columns absent from x are filled
    # with 0 and columns unknown to the scaler are dropped.
    train_columns = getattr(scaler, 'feature_names_in_', None)
    if train_columns is not None:
        x = x.reindex(columns = list(train_columns), fill_value = 0)

    features = x[safe].copy() # keep the important features (unscaled)
    features.index.name = 'key'

    # standardize the features
    scaled = pd.DataFrame(data = scaler.transform(x), index = x.index, columns = x.columns)

    blocks = [features]
    # iterate the groups in the order stored at training time so the output
    # columns line up with the training feature matrix
    for name in tqdm(pca_list):
        cols = get_df(scaled, name)
        new_features = test_pca(cols, name, pca_list[name]) # project with the PCA fitted on train
        # Rows are in the same order as the rows of x; label them explicitly
        # instead of relying on an index join against a fresh 0..n-1 index.
        new_features.index = features.index
        blocks.append(new_features)

    return pd.concat(blocks, axis = 1)

In [None]:
# Target is the 'flag' column; predictors are everything except the target
# and the 'app_id' / 'Unnamed: 0' columns.
y = df['flag']
x = df.drop(columns=['flag', 'app_id', 'Unnamed: 0'])

# Number of importance-estimation fits performed inside prepare_train.
val_count = 1
train_features, scaler, pca_save, safe = prepare_train(
    x,
    y,
    val_count=val_count,
    desperce=.95,
)

In [None]:
# Show the training feature matrix returned by prepare_train as the cell output.
train_features

In [None]:
# Transform the test frame with the scaler, PCAs and kept-column list obtained from prepare_train.
test_features = prepare_test(df1, scaler, pca_save, safe)

In [None]:
# Show the test feature matrix returned by prepare_test as the cell output.
test_features