https://drive.google.com/drive/folders/1tsK9jGWic1Hei8N_M2IX63sxMCLj8PZR?usp=sharing

1. Visit the above Drive folder.
2. Right click on folder name -> "Add a shortcut to Drive" -> select "My Drive" -> "Add shortcut".

In [1]:
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

from pathlib import Path
# Determine the locations of auxiliary libraries and datasets.
if IN_COLAB:
    google.colab.drive.mount("/content/drive")
    
    # Change this if you created the shortcut in a different location
    AUX_DATA_ROOT = Path("/content/drive/My Drive/TINKOFF_DATA")
    
    assert AUX_DATA_ROOT.is_dir(), "Have you forgot to 'Add a shortcut to Drive'?"
    
    import sys
    sys.path.insert(0, str(AUX_DATA_ROOT))
else:
    AUX_DATA_ROOT = Path(".")

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from calendar import monthrange

# DATA

In [3]:
transactions_path = f"{AUX_DATA_ROOT}/avk_hackathon_data_transactions.csv"
transactions = pd.read_csv(transactions_path)

balance_path = f"{AUX_DATA_ROOT}/avk_hackathon_data_account_x_balance.csv" 
balance = pd.read_csv(balance_path)

# products_path = f"{AUX_DATA_ROOT}/avk_hackathon_data_party_products.csv" 
# products = pd.read_csv(products_path)

socdem_path = f"{AUX_DATA_ROOT}/avk_hackathon_data_party_x_socdem.csv"  
socdem = pd.read_csv(socdem_path)

#story_logs_path = f"{AUX_DATA_ROOT}/avk_hackathon_data_story_logs.csv" 
#stoty_logs = pd.read_csv(story_logs_path)

#story_texts_path = f"{AUX_DATA_ROOT}/avk_hackathon_data_story_texts.csv"
#story_texts = pd.read_csv(story_texts_path)

## Transactions

In [4]:
transactions = transactions.fillna(0)
transactions

Unnamed: 0,party_rk,account_rk,financial_account_type_cd,transaction_dttm,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category
0,20337,19666,1,2019-01-01,Покупка,84.00,88676.0,348.0,0.0,Сувениры
1,63404,72991,1,2019-01-01,Покупка,410.00,887248.0,330.0,725.0,Фаст Фуд
2,24789,23517,2,2019-01-01,Покупка,701.44,830014.0,291.0,0.0,Супермаркеты
3,57970,64838,2,2019-01-01,Покупка,6203.70,363834.0,278.0,454.0,Дом/Ремонт
4,12232,11591,2,2019-01-01,Покупка,734.53,85919.0,286.0,878.0,Супермаркеты
...,...,...,...,...,...,...,...,...,...,...
11987612,44990,111641,1,2019-12-31,Покупка,121.00,1940970.0,330.0,675.0,Фаст Фуд
11987613,53023,57593,2,2019-12-31,Покупка,102.29,1198564.0,286.0,0.0,Супермаркеты
11987614,78716,99133,1,2019-12-31,Покупка,175.00,2688766.0,229.0,901.0,Транспорт
11987615,73104,91039,2,2019-12-31,Покупка,185.99,1965313.0,286.0,878.0,Супермаркеты


## FOR FIRST PROBLEM we need to create new target column - conscience_buy. 
### That is percent of conscienceness of spending by client.
### It will be from 0.0 to 1.0
### 1.0 - means the spending is totally fine for customer
### 0.0 - this spending was done unconsciously
### 0.5 - new spending, wasn't in history 
### between 0.0 and 0.5 - unconscience buy
### between 0.5 and 1.0 - quite conscience buy
 
### How to calculate it based on data (transactions) we have?
### Features from which we will construct our target column:

*   mean amount of  money  paid on this category ( **mean_amount_category** (m))= total_sum_in_category/Total number of transactions made by this customer in this category [rubbles]

*   **frequence_of_buying** (f) in this category = total_times_of_buying_in_this_category/days_in_this_month [1/days]

*   **Importance_of_a_product** (i) per client = total amount paid on this category/ total amount of transactions per month [1]

*   **weight_of_category** (w) = Number of transactions of this category/ Total number of transactions. [1]

*   **significant_balance_change_down** (s) (SBCD) = true or false, has the balance changed significantly for the last month [1]


THE RULES ON WHICH WE CONSTRUCT TARGET COLUMN:
1. (m + f + i + w) * s
2. If transaction_type_desc == 'Снятие наличных' then conscience_buy = 1.0

## Balance

In [5]:
balance = balance[:100000]
balance = balance.fillna(0)
balance

Unnamed: 0,party_rk,account_rk,prev_month,cur_month,balance_chng
0,51607,55309,2018-12-31,2019-01-31,15000.0
1,59074,70471,2018-12-31,2019-01-31,0.0
2,33941,33065,2018-12-31,2019-01-31,-15000.0
3,9336,8611,2018-12-31,2019-01-31,-50000.0
4,13958,8421,2018-12-31,2019-01-31,-20000.0
...,...,...,...,...,...
99995,53916,58795,2018-12-31,2019-01-31,-15000.0
99996,56570,62726,2018-12-31,2019-01-31,0.0
99997,30175,28727,2018-12-31,2019-01-31,-5000.0
99998,46416,48188,2018-12-31,2019-01-31,-5000.0


## Products

In [None]:
products

Unnamed: 0,party_rk,product1,product2,product3,product4,product5,product6,product7
0,74874,1,1,0,0,0,0,0
1,83618,0,0,1,1,1,0,0
2,73766,0,0,0,1,1,0,0
3,60218,0,1,0,0,0,0,0
4,47253,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...
49995,52276,0,1,0,0,1,0,0
49996,53681,0,1,0,0,0,0,0
49997,13947,0,1,0,1,1,0,1
49998,94411,0,1,0,0,0,0,0


## Socdem

In [None]:
socdem

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg
0,61243,F,70.0,,0,0
1,66535,F,25.0,,0,0
2,83721,M,55.0,Женат/замужем,0,0
3,88238,F,35.0,,0,0
4,57179,F,30.0,Женат/замужем,0,0
...,...,...,...,...,...,...
49995,54994,M,40.0,Женат/замужем,0,0
49996,63391,M,35.0,Холост/не замужем,0,0
49997,5418,F,55.0,Холост/не замужем,0,0
49998,50273,F,35.0,Женат/замужем,0,0


## Stoty_logs

In [None]:
#stoty_logs

## Story_texts

In [None]:
#story_texts

# Stats

## Transactions - count

### Transactions category frequences

In [None]:
transactions.groupby('category').size().sort_values(ascending = False)

category
Супермаркеты          3938925
Фаст Фуд              1007549
Транспорт              839906
Финансовые услуги      787258
Разные товары          531127
Рестораны              401255
Топливо                398397
Аптеки                 311316
Наличные               271948
Дом/Ремонт             268370
Одежда/Обувь           255282
Красота                110076
Сервисные услуги       100588
Развлечения            100322
Связь/Телеком           96478
Автоуслуги              88739
Медицинские услуги      56868
НКО                     56458
Музыка                  54557
Госсборы                50377
Кино                    43165
Аренда авто             39967
Книги                   37376
Спорттовары             35619
Животные                34606
Ж/д билеты              30980
Авиабилеты              30293
Цветы                   29304
Отели                   29273
Частные услуги          24436
Сувениры                16465
Образование              9008
Турагентства             6582
D

### Transactions merchant_type frequences

In [None]:
transactions.groupby('merchant_type').size().sort_values(ascending = False)

merchant_type
286.0    2762127
1.0      1223811
330.0    1007549
291.0     788077
381.0     556687
          ...   
78.0           1
68.0           1
55.0           1
51.0           1
127.0          1
Length: 457, dtype: int64

### Transactions per client per day

In [None]:
transactions.groupby(['party_rk','transaction_dttm']).size().sort_values(ascending = False).head(20)

party_rk  transaction_dttm
17843     2019-09-05          145
4581      2019-02-26          131
5514      2019-10-15          120
31246     2019-11-18           99
30853     2019-03-10           85
5514      2019-10-11           85
22126     2019-09-03           80
73594     2019-04-25           80
16547     2019-02-18           78
45722     2019-11-04           78
57244     2019-01-26           76
65172     2019-01-15           76
31246     2019-11-16           68
57244     2019-02-10           67
          2019-02-08           67
63726     2019-12-10           66
14483     2019-04-27           64
14441     2019-01-30           64
35710     2019-11-06           62
57495     2019-03-11           62
dtype: int64

### transaction_type_desc

In [None]:
list(transactions['transaction_type_desc'].unique())

['Покупка', 'Оплата услуг', 'Платеж', 'Снятие наличных']

### Transactions amount

In [None]:
transactions.sort_values('transaction_amt_rur', ascending = False).head(30)

Unnamed: 0,party_rk,account_rk,financial_account_type_cd,transaction_dttm,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category
11305825,8864,8064,2,2019-12-19,Платеж,20000100.0,362481.0,1.0,,
10337120,8864,29751,2,2019-11-28,Платеж,12794540.0,825320.0,1.0,,
10280765,8864,29751,2,2019-11-20,Платеж,12754700.0,825320.0,1.0,,
11573153,8864,8064,2,2019-12-20,Оплата услуг,11519300.0,1449700.0,2.0,,
8657969,8864,8064,2,2019-09-30,Платеж,11000100.0,362481.0,1.0,,
2134332,44285,45289,2,2019-02-15,Платеж,11000100.0,825899.0,1.0,,
4310703,8864,29751,2,2019-06-05,Платеж,10751731.0,825320.0,1.0,,
36261,2646,2085,2,2019-01-20,Платеж,9949735.0,1175590.0,1.0,,
6315631,14956,104902,2,2019-08-12,Платеж,9240100.0,826957.0,1.0,,
6296885,14956,109628,2,2019-08-13,Платеж,9200100.0,2301710.0,1.0,,


### Transactions per client per category per day

In [None]:
transactions.groupby(['party_rk','category','transaction_dttm']).size().sort_values(ascending = False).head(20)

party_rk  category           transaction_dttm
4581      Связь/Телеком      2019-02-26          130
5514      Развлечения        2019-10-15          114
31246     Связь/Телеком      2019-11-18           96
30853     Наличные           2019-03-10           85
5514      Развлечения        2019-10-11           84
73594     Наличные           2019-04-25           80
22126     Сервисные услуги   2019-09-03           75
31246     Финансовые услуги  2019-11-16           67
14441     Наличные           2019-01-30           64
35710     Сервисные услуги   2019-11-06           60
75705     Наличные           2019-05-30           59
1308      Развлечения        2019-04-05           59
57495     Финансовые услуги  2019-03-11           58
14483     Финансовые услуги  2019-04-27           58
7605      Музыка             2019-02-27           57
69770     Развлечения        2019-03-13           57
75158     Наличные           2019-04-30           55
65172     Финансовые услуги  2019-01-15           54


### The most used merchant_type by client

In [None]:
transactions.groupby(['party_rk', 'merchant_type']).size()

party_rk  merchant_type
1         1.0              12
          10.0              1
          243.0             1
          244.0             2
          286.0            11
                           ..
94610     335.0             3
          346.0             2
          367.0             1
          491.0             1
          495.0             1
Length: 1200742, dtype: int64

## Client Balance

In [None]:
client_balance_change = balance.groupby('party_rk')['balance_chng'].sum()
client_balance_change

party_rk
1         85000.0
5       -145000.0
7         25000.0
8         95000.0
9             0.0
           ...   
94603     -5000.0
94606     40000.0
94607   -110000.0
94608    -50000.0
94610    -30000.0
Name: balance_chng, Length: 50000, dtype: float64

In [None]:
# check the above code is working well
balance[balance['party_rk'] == 5]

Unnamed: 0,party_rk,account_rk,prev_month,cur_month,balance_chng
48722,5,5,2019-05-31,2019-06-30,115000.0
169246,5,5,2019-09-30,2019-10-31,-95000.0
197634,5,5,2019-06-30,2019-07-31,0.0
222358,5,5,2019-04-30,2019-05-31,-5000.0
233934,5,5,2019-01-31,2019-02-28,-70000.0
245009,5,5,2019-11-30,2019-12-31,40000.0
302528,5,5,2019-03-31,2019-04-30,-60000.0
316864,5,5,2019-07-31,2019-08-31,0.0
323538,5,5,2018-12-31,2019-01-31,10000.0
493120,5,5,2019-08-31,2019-09-30,15000.0


# ADD

In [6]:
new_data = pd.merge(socdem, transactions, left_on='party_rk', right_on='party_rk')
new_data = new_data[:100000]
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_dttm,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category
0,61243,F,70.0,,0,0,75632,2,2019-01-08,Покупка,2125.42329,870570.0,242.0,883.0,Связь/Телеком
1,61243,F,70.0,,0,0,75632,2,2019-01-07,Платеж,2072.81000,1134901.0,1.0,0.0,0
2,61243,F,70.0,,0,0,69865,2,2019-01-06,Покупка,738.35000,18188.0,286.0,878.0,Супермаркеты
3,61243,F,70.0,,0,0,69865,2,2019-01-17,Покупка,751.00000,1162034.0,335.0,1438.0,Аптеки
4,61243,F,70.0,,0,0,69865,2,2019-01-07,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,82963,M,30.0,,0,0,106693,2,2019-10-04,Платеж,1010.00000,2231439.0,381.0,0.0,Финансовые услуги
99996,82963,M,30.0,,0,0,106693,2,2019-09-09,Покупка,234.00000,1288668.0,229.0,901.0,Транспорт
99997,82963,M,30.0,,0,0,106693,2,2019-10-15,Покупка,75.70000,1373184.0,286.0,0.0,Супермаркеты
99998,82963,M,30.0,,0,0,106693,2,2019-10-16,Покупка,219.60000,313965.0,286.0,341.0,Супермаркеты


In [7]:
new_data['transaction_dttm'] = pd.to_datetime(new_data['transaction_dttm'], errors='coerce')
new_data['year']= new_data['transaction_dttm'].dt.year
new_data['month']=new_data['transaction_dttm'].dt.month
new_data['day']=new_data['transaction_dttm'].dt.day
new_data.drop('transaction_dttm',axis='columns', inplace=True)
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category,year,month,day
0,61243,F,70.0,,0,0,75632,2,Покупка,2125.42329,870570.0,242.0,883.0,Связь/Телеком,2019,1,8
1,61243,F,70.0,,0,0,75632,2,Платеж,2072.81000,1134901.0,1.0,0.0,0,2019,1,7
2,61243,F,70.0,,0,0,69865,2,Покупка,738.35000,18188.0,286.0,878.0,Супермаркеты,2019,1,6
3,61243,F,70.0,,0,0,69865,2,Покупка,751.00000,1162034.0,335.0,1438.0,Аптеки,2019,1,17
4,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,82963,M,30.0,,0,0,106693,2,Платеж,1010.00000,2231439.0,381.0,0.0,Финансовые услуги,2019,10,4
99996,82963,M,30.0,,0,0,106693,2,Покупка,234.00000,1288668.0,229.0,901.0,Транспорт,2019,9,9
99997,82963,M,30.0,,0,0,106693,2,Покупка,75.70000,1373184.0,286.0,0.0,Супермаркеты,2019,10,15
99998,82963,M,30.0,,0,0,106693,2,Покупка,219.60000,313965.0,286.0,341.0,Супермаркеты,2019,10,16


### Adding first feature

In [8]:
gb = new_data.groupby(['party_rk','category'])
count_df = gb.size().reset_index().rename(columns={0:'count'})
amt_df = new_data.groupby(['party_rk','category'])['transaction_amt_rur'].sum().reset_index().rename(columns={'transaction_amt_rur':'amt_sum'})
amt_df['transactions_cnt'] = count_df['count']
amt_df['mean'] = amt_df['amt_sum'] / count_df['count']
amt_df

Unnamed: 0,party_rk,category,amt_sum,transactions_cnt,mean
0,380,0,5084.39,5,1016.878000
1,380,Медицинские услуги,1742.00,1,1742.000000
2,380,Наличные,3112.00,1,3112.000000
3,380,Разные товары,284.80,1,284.800000
4,380,Супермаркеты,8145.20,16,509.075000
...,...,...,...,...,...
6147,94343,Супермаркеты,5747.97,35,164.227714
6148,94343,Транспорт,7668.00,44,174.272727
6149,94343,Фаст Фуд,2327.00,4,581.750000
6150,94343,Финансовые услуги,34199.00,12,2849.916667


In [9]:
new_data = pd.merge(new_data, amt_df, how = 'left')
# new_data = new_data.drop('amt_sum', axis = 1, inplace = True)
# new_data = new_data.drop('transactions_cnt', axis = 1, inplace = True)
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category,year,month,day,amt_sum,transactions_cnt,mean
0,61243,F,70.0,,0,0,75632,2,Покупка,2125.42329,870570.0,242.0,883.0,Связь/Телеком,2019,1,8,4.085423e+03,2,2042.711645
1,61243,F,70.0,,0,0,75632,2,Платеж,2072.81000,1134901.0,1.0,0.0,0,2019,1,7,1.560770e+06,39,40019.755128
2,61243,F,70.0,,0,0,69865,2,Покупка,738.35000,18188.0,286.0,878.0,Супермаркеты,2019,1,6,5.198494e+04,167,311.287066
3,61243,F,70.0,,0,0,69865,2,Покупка,751.00000,1162034.0,335.0,1438.0,Аптеки,2019,1,17,2.119716e+04,30,706.572000
4,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,1.010000e+04,1,10100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,82963,M,30.0,,0,0,106693,2,Платеж,1010.00000,2231439.0,381.0,0.0,Финансовые услуги,2019,10,4,2.042400e+04,9,2269.333333
99996,82963,M,30.0,,0,0,106693,2,Покупка,234.00000,1288668.0,229.0,901.0,Транспорт,2019,9,9,5.720000e+03,36,158.888889
99997,82963,M,30.0,,0,0,106693,2,Покупка,75.70000,1373184.0,286.0,0.0,Супермаркеты,2019,10,15,6.217760e+03,40,155.444000
99998,82963,M,30.0,,0,0,106693,2,Покупка,219.60000,313965.0,286.0,341.0,Супермаркеты,2019,10,16,6.217760e+03,40,155.444000


#### Adding second feature

In [10]:
frequency_df = new_data.groupby(['party_rk','category', 'month', 'year']).size().reset_index().rename(columns={0:'count'})

def get_frequencies(x):
  time = monthrange(x[1], x[2])
  return x[0] / time[1]
  
frequency_df['frequency_of_buying'] = frequency_df[['count', 'year', 'month']].apply(get_frequencies, axis=1)
frequency_df

Unnamed: 0,party_rk,category,month,year,count,frequency_of_buying
0,380,0,9,2019,5,0.166667
1,380,Медицинские услуги,11,2019,1,0.033333
2,380,Наличные,12,2019,1,0.032258
3,380,Разные товары,10,2019,1,0.032258
4,380,Супермаркеты,10,2019,11,0.354839
...,...,...,...,...,...,...
23616,94343,Фаст Фуд,7,2019,2,0.064516
23617,94343,Финансовые услуги,7,2019,8,0.258065
23618,94343,Финансовые услуги,8,2019,3,0.096774
23619,94343,Финансовые услуги,10,2019,1,0.032258


In [11]:
new_data = pd.merge(new_data, frequency_df, how = 'left')
# new_data = new_data.drop('count', inplace = True)
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category,year,month,day,amt_sum,transactions_cnt,mean,count,frequency_of_buying
0,61243,F,70.0,,0,0,75632,2,Покупка,2125.42329,870570.0,242.0,883.0,Связь/Телеком,2019,1,8,4.085423e+03,2,2042.711645,1,0.032258
1,61243,F,70.0,,0,0,75632,2,Платеж,2072.81000,1134901.0,1.0,0.0,0,2019,1,7,1.560770e+06,39,40019.755128,3,0.096774
2,61243,F,70.0,,0,0,69865,2,Покупка,738.35000,18188.0,286.0,878.0,Супермаркеты,2019,1,6,5.198494e+04,167,311.287066,11,0.354839
3,61243,F,70.0,,0,0,69865,2,Покупка,751.00000,1162034.0,335.0,1438.0,Аптеки,2019,1,17,2.119716e+04,30,706.572000,3,0.096774
4,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,1.010000e+04,1,10100.000000,1,0.032258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,82963,M,30.0,,0,0,106693,2,Платеж,1010.00000,2231439.0,381.0,0.0,Финансовые услуги,2019,10,4,2.042400e+04,9,2269.333333,1,0.032258
99996,82963,M,30.0,,0,0,106693,2,Покупка,234.00000,1288668.0,229.0,901.0,Транспорт,2019,9,9,5.720000e+03,36,158.888889,8,0.266667
99997,82963,M,30.0,,0,0,106693,2,Покупка,75.70000,1373184.0,286.0,0.0,Супермаркеты,2019,10,15,6.217760e+03,40,155.444000,4,0.129032
99998,82963,M,30.0,,0,0,106693,2,Покупка,219.60000,313965.0,286.0,341.0,Супермаркеты,2019,10,16,6.217760e+03,40,155.444000,4,0.129032


### Adding third feature
#### INNER because some transactions don't have category

In [12]:
#Importance_of_a_product (i) per client = total amount paid on this category/ total amount of transactions per month
amt_category_month_df = new_data.groupby(['party_rk','category', 'month'])['transaction_amt_rur'].sum().reset_index().rename(columns={'transaction_amt_rur':'amt_sum'})
amt_month_df = new_data.groupby(['party_rk', 'month'])['transaction_amt_rur'].sum().reset_index().rename(columns={'transaction_amt_rur':'amt_sum'})
def calculate_category_importance(x):
  rk = x[0]
  month = x[1]
  sum = x[2]
  idx = np.where((amt_month_df['party_rk'] == rk) & (amt_month_df['month'] == month))
  if len(idx) == 0:
    return None
  return sum / amt_month_df.loc[idx[0][0]]['amt_sum']

amt_category_month_df['category_importance'] = amt_category_month_df[['party_rk', 'month', 'amt_sum']].apply(calculate_category_importance, axis=1)
amt_category_month_df 

Unnamed: 0,party_rk,category,month,amt_sum,category_importance
0,380,0,9,5084.39,0.032656
1,380,Медицинские услуги,11,1742.00,0.005531
2,380,Наличные,12,3112.00,0.311549
3,380,Разные товары,10,284.80,0.000949
4,380,Супермаркеты,10,4025.56,0.013412
...,...,...,...,...,...
23616,94343,Фаст Фуд,7,1202.00,0.023847
23617,94343,Финансовые услуги,7,22093.00,0.438304
23618,94343,Финансовые услуги,8,10051.00,0.414528
23619,94343,Финансовые услуги,10,2055.00,0.073134


In [13]:
new_data = pd.merge(new_data, amt_category_month_df, how = 'inner')
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category,year,month,day,amt_sum,transactions_cnt,mean,count,frequency_of_buying,category_importance
0,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469
1,61243,F,70.0,,0,0,75632,2,Покупка,8126.39153,818173.0,355.0,656.0,Сервисные услуги,2019,5,13,8126.39153,1,8126.39153,1,0.032258,0.099936
2,61243,F,70.0,,0,0,69865,2,Покупка,817.00000,896118.0,311.0,0.0,Одежда/Обувь,2019,6,21,817.00000,1,817.00000,1,0.033333,0.019376
3,61243,F,70.0,,0,0,69865,2,Покупка,615.00000,159749.0,226.0,978.0,Транспорт,2019,12,22,615.00000,1,615.00000,1,0.032258,0.000994
4,66535,F,25.0,,0,0,77868,2,Покупка,1523.00000,997845.0,450.0,0.0,Развлечения,2019,3,19,4537.00000,2,2268.50000,2,0.064516,0.092468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3333,82963,M,30.0,,0,0,106693,2,Покупка,438.00000,1927158.0,447.0,0.0,Кино,2019,7,31,438.00000,1,438.00000,1,0.032258,0.059373
3334,82963,M,30.0,,0,0,106693,2,Покупка,261.00000,76860.0,328.0,0.0,Рестораны,2019,7,27,261.00000,1,261.00000,1,0.032258,0.035380
3335,82963,M,30.0,,0,0,106693,2,Покупка,139.00000,2158688.0,411.0,0.0,Частные услуги,2019,8,16,139.00000,1,139.00000,1,0.032258,0.004518
3336,82963,M,30.0,,0,0,106693,2,Покупка,1482.62000,562646.0,285.0,656.0,Разные товары,2019,9,20,1482.62000,1,1482.62000,1,0.033333,0.044016


### Adding fourth feature

In [14]:
#weight_of_category (w) = Number of transactions of this category/ Total number of transactions
def category_weight(x):
  rk = x[0]
  count = x[1]
  total = len(new_data.query("party_rk == " + str(rk)))
  return count / total
  
count_df['category_weight'] = count_df[['party_rk', 'count']].apply(category_weight, axis=1)
count_df.head()

  


Unnamed: 0,party_rk,category,count,category_weight
0,380,0,5,0.555556
1,380,Медицинские услуги,1,0.111111
2,380,Наличные,1,0.111111
3,380,Разные товары,1,0.111111
4,380,Супермаркеты,16,1.777778


In [15]:
new_data = pd.merge(new_data, count_df, how = 'inner')
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category,year,month,day,amt_sum,transactions_cnt,mean,count,frequency_of_buying,category_importance,category_weight
0,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000
1,61243,F,70.0,,0,0,75632,2,Покупка,8126.39153,818173.0,355.0,656.0,Сервисные услуги,2019,5,13,8126.39153,1,8126.39153,1,0.032258,0.099936,0.250000
2,61243,F,70.0,,0,0,69865,2,Покупка,817.00000,896118.0,311.0,0.0,Одежда/Обувь,2019,6,21,817.00000,1,817.00000,1,0.033333,0.019376,0.250000
3,61243,F,70.0,,0,0,69865,2,Покупка,615.00000,159749.0,226.0,978.0,Транспорт,2019,12,22,615.00000,1,615.00000,1,0.032258,0.000994,0.250000
4,66535,F,25.0,,0,0,77868,2,Покупка,1523.00000,997845.0,450.0,0.0,Развлечения,2019,3,19,4537.00000,2,2268.50000,2,0.064516,0.092468,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3333,82963,M,30.0,,0,0,106693,2,Покупка,438.00000,1927158.0,447.0,0.0,Кино,2019,7,31,438.00000,1,438.00000,1,0.032258,0.059373,0.083333
3334,82963,M,30.0,,0,0,106693,2,Покупка,261.00000,76860.0,328.0,0.0,Рестораны,2019,7,27,261.00000,1,261.00000,1,0.032258,0.035380,0.083333
3335,82963,M,30.0,,0,0,106693,2,Покупка,139.00000,2158688.0,411.0,0.0,Частные услуги,2019,8,16,139.00000,1,139.00000,1,0.032258,0.004518,0.083333
3336,82963,M,30.0,,0,0,106693,2,Покупка,1482.62000,562646.0,285.0,656.0,Разные товары,2019,9,20,1482.62000,1,1482.62000,1,0.033333,0.044016,0.083333


## Adding fifth feature

In [16]:
#significant_balance_change_down (s) (SBCD) = true or false, has the balance changed significantly for the last month
def calculate_SBCD(x):
  rk = x[0]
  chng = x[1]
  if chng >= 0:
    return False
  mean = balance[(balance['party_rk'] == rk) & (balance['balance_chng']<0)].balance_chng.mean()
  std = balance[(balance['party_rk'] == rk) & (balance['balance_chng']<0)].balance_chng.std()
  if chng < mean + std :
    return True
  return False

balance['SBCD'] = balance[['party_rk', 'balance_chng']].apply(calculate_SBCD, axis=1)
balance

Unnamed: 0,party_rk,account_rk,prev_month,cur_month,balance_chng,SBCD
0,51607,55309,2018-12-31,2019-01-31,15000.0,False
1,59074,70471,2018-12-31,2019-01-31,0.0,False
2,33941,33065,2018-12-31,2019-01-31,-15000.0,False
3,9336,8611,2018-12-31,2019-01-31,-50000.0,True
4,13958,8421,2018-12-31,2019-01-31,-20000.0,False
...,...,...,...,...,...,...
99995,53916,58795,2018-12-31,2019-01-31,-15000.0,False
99996,56570,62726,2018-12-31,2019-01-31,0.0,False
99997,30175,28727,2018-12-31,2019-01-31,-5000.0,False
99998,46416,48188,2018-12-31,2019-01-31,-5000.0,True


In [17]:
new_data = pd.merge(new_data, balance, how = 'left')
new_data

Unnamed: 0,party_rk,gender_cd,age,marital_status_desc,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category,year,month,day,amt_sum,transactions_cnt,mean,count,frequency_of_buying,category_importance,category_weight,prev_month,cur_month,balance_chng,SBCD
0,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000,2019-05-31,2019-06-30,-215000.0,False
1,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000,2019-01-31,2019-02-28,5000.0,False
2,61243,F,70.0,,0,0,69865,2,Платеж,10100.00000,252998.0,381.0,0.0,Финансовые услуги,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000,2018-12-31,2019-01-31,10000.0,False
3,61243,F,70.0,,0,0,75632,2,Покупка,8126.39153,818173.0,355.0,656.0,Сервисные услуги,2019,5,13,8126.39153,1,8126.39153,1,0.032258,0.099936,0.250000,2019-01-31,2019-02-28,0.0,False
4,61243,F,70.0,,0,0,75632,2,Покупка,8126.39153,818173.0,355.0,656.0,Сервисные услуги,2019,5,13,8126.39153,1,8126.39153,1,0.032258,0.099936,0.250000,2019-11-30,2019-12-31,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5638,82963,M,30.0,,0,0,106693,2,Покупка,438.00000,1927158.0,447.0,0.0,Кино,2019,7,31,438.00000,1,438.00000,1,0.032258,0.059373,0.083333,2019-09-30,2019-10-31,0.0,False
5639,82963,M,30.0,,0,0,106693,2,Покупка,261.00000,76860.0,328.0,0.0,Рестораны,2019,7,27,261.00000,1,261.00000,1,0.032258,0.035380,0.083333,2019-09-30,2019-10-31,0.0,False
5640,82963,M,30.0,,0,0,106693,2,Покупка,139.00000,2158688.0,411.0,0.0,Частные услуги,2019,8,16,139.00000,1,139.00000,1,0.032258,0.004518,0.083333,2019-09-30,2019-10-31,0.0,False
5641,82963,M,30.0,,0,0,106693,2,Покупка,1482.62000,562646.0,285.0,656.0,Разные товары,2019,9,20,1482.62000,1,1482.62000,1,0.033333,0.044016,0.083333,2019-09-30,2019-10-31,0.0,False


In [18]:
#one hot encode the string columns. 
new_data=pd.concat([new_data,pd.get_dummies(new_data['category'], prefix='category')],axis=1)
new_data.drop(['category'],axis=1, inplace=True)
new_data=pd.concat([new_data,pd.get_dummies(new_data['marital_status_desc'], prefix='MS:')],axis=1)
new_data.drop(['marital_status_desc'],axis=1, inplace=True)
new_data=pd.concat([new_data,pd.get_dummies(new_data['transaction_type_desc'], prefix='MS:')],axis=1)
new_data.drop(['transaction_type_desc'],axis=1, inplace=True)
new_data

Unnamed: 0,party_rk,gender_cd,age,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,year,month,day,amt_sum,transactions_cnt,mean,count,frequency_of_buying,category_importance,category_weight,prev_month,cur_month,balance_chng,SBCD,category_0,category_Duty Free,category_Авиабилеты,category_Автоуслуги,category_Аптеки,category_Аренда авто,category_Госсборы,category_Дом/Ремонт,category_Ж/д билеты,category_Животные,category_Искусство,category_Кино,category_Книги,category_Красота,category_Медицинские услуги,category_Музыка,category_НКО,category_Наличные,category_Образование,category_Одежда/Обувь,category_Отели,category_Развлечения,category_Разные товары,category_Рестораны,category_Связь/Телеком,category_Сервисные услуги,category_Спорттовары,category_Сувениры,category_Супермаркеты,category_Топливо,category_Транспорт,category_Турагентства,category_Фаст Фуд,category_Финансовые услуги,category_Фото/Видео,category_Цветы,category_Частные услуги,"MS:_Вдовец, вдова",MS:_Гражданский брак,MS:_Женат/замужем,MS:_Разведен (а),MS:_Холост/не замужем,MS:_Оплата услуг,MS:_Платеж,MS:_Покупка,MS:_Снятие наличных
0,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000,2019-05-31,2019-06-30,-215000.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000,2019-01-31,2019-02-28,5000.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,10100.00000,1,10100.00000,1,0.032258,0.128469,0.250000,2018-12-31,2019-01-31,10000.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,61243,F,70.0,0,0,75632,2,8126.39153,818173.0,355.0,656.0,2019,5,13,8126.39153,1,8126.39153,1,0.032258,0.099936,0.250000,2019-01-31,2019-02-28,0.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,61243,F,70.0,0,0,75632,2,8126.39153,818173.0,355.0,656.0,2019,5,13,8126.39153,1,8126.39153,1,0.032258,0.099936,0.250000,2019-11-30,2019-12-31,0.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5638,82963,M,30.0,0,0,106693,2,438.00000,1927158.0,447.0,0.0,2019,7,31,438.00000,1,438.00000,1,0.032258,0.059373,0.083333,2019-09-30,2019-10-31,0.0,False,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5639,82963,M,30.0,0,0,106693,2,261.00000,76860.0,328.0,0.0,2019,7,27,261.00000,1,261.00000,1,0.032258,0.035380,0.083333,2019-09-30,2019-10-31,0.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5640,82963,M,30.0,0,0,106693,2,139.00000,2158688.0,411.0,0.0,2019,8,16,139.00000,1,139.00000,1,0.032258,0.004518,0.083333,2019-09-30,2019-10-31,0.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
5641,82963,M,30.0,0,0,106693,2,1482.62000,562646.0,285.0,656.0,2019,9,20,1482.62000,1,1482.62000,1,0.033333,0.044016,0.083333,2019-09-30,2019-10-31,0.0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [79]:
new_data=pd.concat([new_data,pd.get_dummies(new_data['gender_cd'], prefix='gender_cd')],axis=1)
new_data.drop(['gender_cd'],axis=1, inplace=True)

In [19]:
new_data.drop('amt_sum', axis =1, inplace = True)
new_data.drop('transactions_cnt', axis =1, inplace = True)
new_data.drop('count', axis =1, inplace = True)
new_data.drop('prev_month', axis =1, inplace = True)
new_data.drop('cur_month', axis =1, inplace = True)
new_data.drop('balance_chng', axis =1, inplace = True)

In [20]:
new_data

Unnamed: 0,party_rk,gender_cd,age,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,year,month,day,mean,frequency_of_buying,category_importance,category_weight,SBCD,category_0,category_Duty Free,category_Авиабилеты,category_Автоуслуги,category_Аптеки,category_Аренда авто,category_Госсборы,category_Дом/Ремонт,category_Ж/д билеты,category_Животные,category_Искусство,category_Кино,category_Книги,category_Красота,category_Медицинские услуги,category_Музыка,category_НКО,category_Наличные,category_Образование,category_Одежда/Обувь,category_Отели,category_Развлечения,category_Разные товары,category_Рестораны,category_Связь/Телеком,category_Сервисные услуги,category_Спорттовары,category_Сувениры,category_Супермаркеты,category_Топливо,category_Транспорт,category_Турагентства,category_Фаст Фуд,category_Финансовые услуги,category_Фото/Видео,category_Цветы,category_Частные услуги,"MS:_Вдовец, вдова",MS:_Гражданский брак,MS:_Женат/замужем,MS:_Разведен (а),MS:_Холост/не замужем,MS:_Оплата услуг,MS:_Платеж,MS:_Покупка,MS:_Снятие наличных
0,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,10100.00000,0.032258,0.128469,0.250000,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,10100.00000,0.032258,0.128469,0.250000,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,10100.00000,0.032258,0.128469,0.250000,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,61243,F,70.0,0,0,75632,2,8126.39153,818173.0,355.0,656.0,2019,5,13,8126.39153,0.032258,0.099936,0.250000,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,61243,F,70.0,0,0,75632,2,8126.39153,818173.0,355.0,656.0,2019,5,13,8126.39153,0.032258,0.099936,0.250000,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5638,82963,M,30.0,0,0,106693,2,438.00000,1927158.0,447.0,0.0,2019,7,31,438.00000,0.032258,0.059373,0.083333,False,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5639,82963,M,30.0,0,0,106693,2,261.00000,76860.0,328.0,0.0,2019,7,27,261.00000,0.032258,0.035380,0.083333,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5640,82963,M,30.0,0,0,106693,2,139.00000,2158688.0,411.0,0.0,2019,8,16,139.00000,0.032258,0.004518,0.083333,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
5641,82963,M,30.0,0,0,106693,2,1482.62000,562646.0,285.0,656.0,2019,9,20,1482.62000,0.033333,0.044016,0.083333,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


### Adding target

In [21]:
new_data['category_importance'] = (new_data['category_importance'] - new_data['category_importance'].mean() )/ new_data['category_importance'].std()
new_data['category_weight'] = (new_data['category_weight'] - new_data['category_weight'].mean() )/ new_data['category_weight'].std()
new_data['mean'] = (new_data['mean'] - new_data['mean'].mean() )/ new_data['mean'].std()
new_data['frequency_of_buying'] = (new_data['frequency_of_buying'] - new_data['frequency_of_buying'].mean() )/ new_data['frequency_of_buying'].std()

In [59]:
new_data['category_importance'] = abs(new_data['category_importance'])

In [60]:
new_data['category_weight'] = abs(new_data['category_weight'])

In [61]:
s_t = new_data['mean'] + new_data['frequency_of_buying'] + new_data['category_weight'] + new_data['category_weight']
s_t = abs(s_t.mean() - s_t.std())
s_t

0.6272154987457879

In [62]:
d = new_data['mean'] + new_data['frequency_of_buying'] + new_data['category_weight'] + new_data['category_importance']
d

0       0.425816
1       0.425816
2       0.425816
3       0.233829
4       0.233829
          ...   
5638    0.611675
5639    0.723418
5640    0.876979
5641    0.794898
5642    0.661339
Length: 5643, dtype: float64

In [64]:
mask_1 = ((new_data['SBCD'] == 0) & (d < s_t)) | ((new_data['SBCD'] == 0) & (d > s_t))
new_data['conscience_buy'] = np.where(mask_1, d*new_data['SBCD'], d + new_data['SBCD'])

In [67]:
new_data['conscience_buy'] = (new_data['conscience_buy'] - new_data['conscience_buy'].mean()) / new_data['conscience_buy'].std()

In [69]:
new_data['conscience_buy'] = abs(new_data['conscience_buy'])

In [74]:
new_data

Unnamed: 0,party_rk,gender_cd,age,children_cnt,region_flg,account_rk,financial_account_type_cd,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,year,month,day,mean,frequency_of_buying,category_importance,category_weight,SBCD,category_0,category_Duty Free,category_Авиабилеты,category_Автоуслуги,category_Аптеки,category_Аренда авто,category_Госсборы,category_Дом/Ремонт,category_Ж/д билеты,category_Животные,category_Искусство,category_Кино,category_Книги,category_Красота,category_Медицинские услуги,category_Музыка,category_НКО,category_Наличные,category_Образование,category_Одежда/Обувь,category_Отели,category_Развлечения,category_Разные товары,category_Рестораны,category_Связь/Телеком,category_Сервисные услуги,category_Спорттовары,category_Сувениры,category_Супермаркеты,category_Топливо,category_Транспорт,category_Турагентства,category_Фаст Фуд,category_Финансовые услуги,category_Фото/Видео,category_Цветы,category_Частные услуги,"MS:_Вдовец, вдова",MS:_Гражданский брак,MS:_Женат/замужем,MS:_Разведен (а),MS:_Холост/не замужем,MS:_Оплата услуг,MS:_Платеж,MS:_Покупка,MS:_Снятие наличных,conscience_buy
0,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,0.642591,-0.394291,0.080438,0.097078,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0.217888
1,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,0.642591,-0.394291,0.080438,0.097078,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0.217888
2,61243,F,70.0,0,0,69865,2,10100.00000,252998.0,381.0,0.0,2019,1,7,0.642591,-0.394291,0.080438,0.097078,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0.217888
3,61243,F,70.0,0,0,75632,2,8126.39153,818173.0,355.0,656.0,2019,5,13,0.459012,-0.394291,0.072029,0.097078,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.217888
4,61243,F,70.0,0,0,75632,2,8126.39153,818173.0,355.0,656.0,2019,5,13,0.459012,-0.394291,0.072029,0.097078,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.217888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5638,82963,M,30.0,0,0,106693,2,438.00000,1927158.0,447.0,0.0,2019,7,31,-0.256135,-0.394291,0.288772,0.973328,False,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.217888
5639,82963,M,30.0,0,0,106693,2,261.00000,76860.0,328.0,0.0,2019,7,27,-0.272599,-0.394291,0.416979,0.973328,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.217888
5640,82963,M,30.0,0,0,106693,2,139.00000,2158688.0,411.0,0.0,2019,8,16,-0.283947,-0.394291,0.581888,0.973328,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0.217888
5641,82963,M,30.0,0,0,106693,2,1482.62000,562646.0,285.0,656.0,2019,9,20,-0.158968,-0.390296,0.370834,0.973328,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.217888


# Training

In [80]:
X, y = new_data.drop('conscience_buy', axis = 1), new_data['conscience_buy']

In [81]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

# Model

In [85]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=20, random_state=0)
clf.fit(X_train, y_train)

ValueError: ignored