# Hypothesis 5

## The code up to cell `5. All your experiments going below` must be run

0. Rename this notebook to correct name `5-Name_Surname.ipynb`
1. Complete the list of best features for classification and regression models separately (all original features are not appropriate)
2. Fill your best models (classification and regression) and improve `alg1`
    1. Models must be with hyper parameters (models with default hyper parameters are not appropriate)
    2. `alg1` function should be improved (the original version isn't valid)
3. Calculate quality metrics 
    1. Get metrics above Minimum requirements
4. Prepare submission
    1. Send your submission file and notebook
5. All your experiments going below
    1. Add all your experiments and code
    2. Add comments and metrics

## Notes:

- You can get up to 2 points
- Deadline 1 June 23.59

## Minimum requirements for submission
- churn_auc >= 0.92
- price_nmsle >= -0.142
- issue_amount >= 22000
- total_profit >= 16000
- %bad_loans <= 1.4

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): with the warning off, a write that lands on a temporary copy goes unnoticed.
pd.set_option('mode.chained_assignment', None)
# Display up to 100 rows when a DataFrame or Series is rendered.
pd.options.display.max_rows = 100

In [8]:
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_percentage_error


def calc_all_metrics(data, max_account=25e3):
    """Score a loan portfolio built from the model predictions in ``data``.

    Rows are walked in descending ``__priority`` order. A row is offered a
    loan only when its priority is positive and the predicted price lies
    within (0.9, 1.1) of the documented price. The loan is actually issued
    while the running total of offered amounts stays within ``max_account``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``__priority``, ``__churn``,
        ``__churn_prob``, ``__price_doc`` and ``__price_predict``.
    max_account : float, default 25e3
        Upper bound on the cumulative offered amount.

    Returns
    -------
    dict
        ``total_profit``, ``%profit_issued``, ``%issued_loans``,
        ``issued_loans``, ``count_good`` (every issued loan, churned ones
        included), ``count_bad`` (issued loans with ``__churn == 1``),
        ``%bad`` (``count_bad / (count_good + count_bad)``), ``churn_auc``,
        ``price_nmsle`` and ``price_mape`` (both error metrics negated).
    """

    def loan_amount(row):
        # The ratio is evaluated first, exactly as before, for every row.
        ratio = row['__price_predict'] / row['__price_doc']
        if row['__priority'] <= 0:
            return 0.
        if 0.9 < ratio < 1.:
            # Under-estimate: lend the (smaller) predicted price.
            return row['__price_predict']
        if 1. <= ratio < 1.1:
            # Slight over-estimate: lend the documented price.
            return row['__price_doc']
        return 0.

    def loan_profit(row):
        if row['is_credit'] == 0.:
            return 0.
        amount = row['debt']
        if row['__churn'] == 1:
            # A churned loan costs twice its amount.
            return - amount * 2.
        if amount < 5:
            return amount * 0.3
        if amount < 9:
            return amount * 0.4
        if amount >= 9:
            return amount * 0.5

    needed = ['__priority', '__churn', '__churn_prob', '__price_doc', '__price_predict']
    ranked = data[needed].sort_values('__priority', ascending=False).copy(True)

    ranked['debt'] = ranked.apply(loan_amount, axis=1)
    ranked['debt_cum'] = ranked['debt'].cumsum()
    ranked['is_credit'] = 0
    within_budget = (ranked['debt'] > 0) & (ranked['debt_cum'] <= max_account)
    ranked.loc[within_budget, 'is_credit'] = 1
    ranked['profit'] = ranked.apply(loan_profit, axis=1)

    issued = ranked[ranked['is_credit'] == 1]
    total_profit = round(ranked['profit'].sum(), 2)
    issued_count = ranked['is_credit'].sum()
    issued_amount = round(issued['debt'].sum(), 2)
    churned_count = issued['__churn'].sum()

    report = {}
    report['total_profit'] = total_profit
    report['%profit_issued'] = round(total_profit / issued_amount * 100, 1)
    report['%issued_loans'] = round(issued_amount / max_account * 100, 2)
    report['issued_loans'] = issued_amount
    report['count_good'] = issued_count
    report['count_bad'] = churned_count
    report['%bad'] = round(churned_count / (issued_count + churned_count) * 100., 1)
    report['churn_auc'] = round(
        roc_auc_score(y_true=ranked['__churn'], y_score=ranked['__churn_prob']), 3)
    report['price_nmsle'] = round(
        -mean_squared_log_error(y_true=ranked['__price_doc'], y_pred=ranked['__price_predict']), 3)
    report['price_mape'] = round(
        -mean_absolute_percentage_error(y_true=ranked['__price_doc'], y_pred=ranked['__price_predict']), 3)
    return report


In [9]:
# Read the training table and the submission table, then report their shapes.
train_raw, submission = (
    pd.read_csv(csv_path) for csv_path in ('train_corr.csv', 'test_corr.csv')
)

print(train_raw.shape, submission.shape)

(13953, 147) (9988, 145)


# 1. Fill your top features

In [432]:
# NOTE(review): `reg_imp` and `reg_imp_class` are built in the experiments
# section further down, so those cells have to be executed before this one.
reg_features = reg_imp.head(30).index.tolist()  # top-30 features for the regression model
clf_features = reg_imp_class.head(25).index.tolist()  # top-25 features for the classification model

print(len(reg_features), reg_features)
print(len(clf_features), clf_features)

30 ['f__full_sq', 'f__sport_count_2000', 'f__office_sqm_5000', 'f__trc_count_2000', 'f__metro_km_walk', 'f__cafe_count_1000_price_1000', 'f__railroad_station_walk_km', 'f__floor', 'f__mkad_km', 'f__life_sq', 'f__detention_facility_km', 'f__cafe_sum_1000_min_price_avg', 'f__public_transport_station_min_walk', 'f__total_amt_chng_q4_q1', 'f__cafe_sum_1500_min_price_avg', 'f__total_ct_chng_q4_q1', 'f__total_trans_amt', 'f__total_revolving_bal', 'f__total_trans_ct', 'f__0_17_all', 'f__credit_limit', 'f__avg_utilization_ratio', 'f__avg_open_to_buy', 'f__build_year', 'f__max_floor', 'f__customer_age', 'f__leisure_count_5000', 'f__months_on_book', 'f__leisure_count_3000', 'f__build_count_wood']
25 ['f__contacts_count_12_mon', 'f__total_ct_chng_q4_q1', 'f__total_trans_ct', 'f__total_relationship_count', 'f__total_trans_amt', 'f__total_revolving_bal', 'f__months_inactive_12_mon', 'f__total_amt_chng_q4_q1', 'f__avg_utilization_ratio', 'f__credit_limit', 'f__income_category_Unknown', 'f__avg_open_

# 2. Fill your best models and improve alg1

In [427]:
#pr=train_raw['__price_predict']
#prob=train_raw['__churn_prob']

def alg1(z, tg=1, shift=0):
    """Rank loan applications by squared distance from the worst-case corner.

    ``__churn_prob`` and ``__price_predict`` are min-max scaled to [0, 1]
    using the minimum and maximum found in ``z`` itself, so scores computed on
    different frames are not on a common scale. The worst application sits at
    (churn probability = 1, price = 0); the returned priority is

        price**2 + tg * (churn_prob - 1)**2 - shift

    Parameters
    ----------
    z : pandas.DataFrame
        Must contain ``__churn_prob`` and ``__price_predict``. It is not modified.
    tg : float, default 1
        Weight of the churn-probability term (turns circular level sets into ellipses).
    shift : float, default 0
        Constant subtracted from the score.

    Returns
    -------
    pandas.Series
        Priority per row, named ``'lbl'`` and indexed like ``z``.
    """
    # astype() returns a new frame, so the caller's data is never written to.
    scaled = z[['__churn_prob', '__price_predict']].astype(float)
    low = scaled.min()
    # A constant column has zero range; divide by 1 instead so it maps to 0
    # rather than NaN (the same convention MinMaxScaler uses).
    span = (scaled.max() - low).replace(0, 1.0)
    scaled = (scaled - low) / span

    # Vectorised replacement for the per-row loop that wrote through chained
    # indexing, which does not update the frame under pandas Copy-on-Write.
    priority = scaled['__price_predict'] ** 2 + tg * (scaled['__churn_prob'] - 1) ** 2
    return priority.rename('lbl') - shift
    
    
# Imported in this cell so it runs on a fresh kernel: the only other imports of
# these estimators sit in the experiments section further down.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor

# Best regression model with its hyper parameters.
reg = RandomForestRegressor(max_depth=50, random_state=73, criterion="squared_error")
# Best classification model with its hyper parameters; random_state is fixed so
# repeated runs produce the same churn probabilities and metrics.
clf = GradientBoostingClassifier(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=73)

In [433]:
# Out-of-fold predictions: every training row is scored by a model that did not
# see it, which gives unbiased estimates of the quality metrics.
from sklearn.model_selection import cross_val_predict


# Model inputs with missing values replaced by 0.
X = train_raw.fillna(0).copy(True)
X_sub = submission.fillna(0).copy(True)

# 5-fold out-of-fold price predictions, floored at 0.
train_raw['__price_predict'] = np.maximum(
    0,
    cross_val_predict(
        estimator=reg, X=X[reg_features], y=train_raw['__price_doc'],
        cv=5, method='predict'),
)

# 5-fold out-of-fold churn probabilities (column 1 of predict_proba).
train_raw['__churn_prob'] = cross_val_predict(
    estimator=clf, X=X[clf_features], y=train_raw['__churn'],
    cv=5, method='predict_proba',
)[:, 1]


# 3. Calculate quality metrics

In [434]:
from sklearn.model_selection import train_test_split

In [435]:
# Priority score with the chosen ellipse weight and cut-off.
train_raw['__priority'] = alg1(train_raw, tg=0.2, shift=0.2)

# Stratification key: price decile label concatenated with the churn flag.
complex_stratify = (
    pd.qcut(train_raw['__price_doc'], q=10)
    .astype(str)
    .str.cat(train_raw['__churn'].astype(str))
)

# Two equally sized, stratified halves that are scored independently.
test1, test2 = train_test_split(
    train_raw, test_size=0.5, stratify=complex_stratify, random_state=47)

metrics = pd.DataFrame(
    data=[calc_all_metrics(half) for half in (test1, test2)],
    index=['test1', 'test2'],
).transpose()
metrics

Unnamed: 0,test1,test2
total_profit,4601.46,4226.26
%profit_issued,38.3,35.7
%issued_loans,48.11,47.39
issued_loans,12026.84,11847.02
count_good,1904.0,1888.0
count_bad,10.0,21.0
%bad,0.5,1.1
churn_auc,0.968,0.963
price_nmsle,-0.139,-0.14
price_mape,-0.472,-0.464



## Minimum requirements for submission

- churn_auc >= 0.92
- price_nmsle >= -0.142
- total_profit >= 4000
- %bad_loans <= 1.4

# 4. Prepare submission

In [438]:
# Refit both models on the full training table and score the submission rows.
# Predicted prices are floored at 0.
submission['__price_predict'] = np.maximum(
    0,
    reg.fit(X[reg_features], train_raw['__price_doc']).predict(X_sub[reg_features]),
)

# Column 1 of predict_proba is stored as the churn probability.
submission['__churn_prob'] = (
    clf.fit(X[clf_features], train_raw['__churn'])
    .predict_proba(X_sub[clf_features])[:, 1]
)

# Same weight and cut-off as used for the training-set evaluation.
submission['__priority'] = alg1(submission, tg=0.2, shift=0.2)

In [439]:
# Keep only the three columns written to the submission file.
final_score = submission[['__price_predict', '__churn_prob', '__priority']]
print(list(final_score.columns))
print(final_score.shape)

# Sanity checks on shape, column order, price sign and score variety.
assert final_score.shape == (9988, 3)
assert list(final_score.columns) == ['__price_predict', '__churn_prob', '__priority']
assert final_score['__price_predict'].min() >= 0.
assert final_score['__churn_prob'].nunique(dropna=False) > 2

final_score.to_csv('5-Mariia_Dmitrieva.csv', index=False)
final_score.head()

['__price_predict', '__churn_prob', '__priority']
(9988, 3)


Unnamed: 0,__price_predict,__churn_prob,__priority
0,7.49526,0.910507,-0.163964
1,4.776587,0.003734,0.008708
2,5.52807,0.010365,0.011491
3,4.632045,0.003286,0.007984
4,17.406994,0.004387,0.272168













# 5. All your experiments going below

In [23]:
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

RANDOM_STATE = 47

# 50/50 hold-out split of the training table for the experiments below.
train, test = train_test_split(
    train_raw,
    test_size=0.5,
    random_state=RANDOM_STATE,
)
tuple(frame.shape for frame in (train, test, submission))

((6976, 147), (6977, 147), (9988, 145))

In [25]:
# Candidate features: columns whose name starts with 'f_' and whose dtype is not object.
final_features = [
    col
    for col in train_raw.columns
    if col.startswith('f_') and train_raw[col].dtype != 'object'
]
final_features

['f__max_floor',
 'f__state',
 'f__total_revolving_bal',
 'f__market_count_1500',
 'f__leisure_count_3000',
 'f__total_ct_chng_q4_q1',
 'f__railroad_station_walk_km',
 'f__contacts_count_12_mon',
 'f__0_17_all',
 'f__trc_count_2000',
 'f__build_count_wood',
 'f__credit_limit',
 'f__total_trans_ct',
 'f__leisure_count_5000',
 'f__life_sq',
 'f__cafe_count_1000_price_1000',
 'f__mkad_km',
 'f__school_education_centers_top_20_raion',
 'f__avg_utilization_ratio',
 'f__public_transport_station_min_walk',
 'f__customer_age',
 'f__detention_facility_km',
 'f__sport_count_2000',
 'f__cafe_sum_1000_min_price_avg',
 'f__total_amt_chng_q4_q1',
 'f__metro_km_walk',
 'f__office_sqm_5000',
 'f__total_trans_amt',
 'f__months_inactive_12_mon',
 'f__cafe_sum_1500_min_price_avg',
 'f__floor',
 'f__num_room',
 'f__months_on_book',
 'f__dependent_count',
 'f__avg_open_to_buy',
 'f__build_year',
 'f__full_sq',
 'f__total_relationship_count',
 'f__build_count_mix',
 'f__marital_status_Married',
 'f__marital

In [26]:
# Feature matrices restricted to the candidate features, missing values set to 0.
# NOTE(review): this rebinds `X_sub`, which the cross-validation cell above also defines.
X_train, X_test, X_sub = (
    frame[final_features].fillna(0).copy(True)
    for frame in (train, test, submission)
)

# Datasets used for training and evaluating a model, as
# (dataset name, feature matrix, source frame) tuples; the training set comes first.
X_y_datasets = list(zip(
    ('train', 'test', 'submission'),
    (X_train, X_test, X_sub),
    (train, test, submission),
))

tuple(matrix.shape for matrix in (X_train, X_test, X_sub))

((6976, 86), (6977, 86), (9988, 86))

In [27]:
# Fit a random forest on all candidate features of the hold-out training half;
# its feature_importances_ are read in the next cell to rank the features.
reg_model = RandomForestRegressor(max_depth=50, random_state=73, criterion="squared_error")
reg_model.fit(X_train, train['__price_doc'])

In [28]:
# Feature importances of the regression forest, rounded to 4 decimals and
# ordered from most to least important.
reg_imp = (
    pd.DataFrame({'imp': reg_model.feature_importances_}, index=X_train.columns)
    .round(4)
    .sort_values('imp', ascending=False)
)

reg_imp

Unnamed: 0,imp
f__full_sq,0.3918
f__sport_count_2000,0.1001
f__office_sqm_5000,0.0631
f__trc_count_2000,0.0372
f__metro_km_walk,0.021
f__cafe_count_1000_price_1000,0.0206
f__railroad_station_walk_km,0.0196
f__floor,0.0188
f__mkad_km,0.0178
f__life_sq,0.0175


In [29]:
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
# Fit a gradient-boosting classifier on all candidate features; its
# feature_importances_ are read in the next cell to rank the features.
# random_state is fixed so the ranking is the same on every run.
class_model = GradientBoostingClassifier(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=73)
class_model.fit(X_train, train['__churn'])

In [31]:
# Feature importances of the churn classifier, rounded to 4 decimals and
# ordered from most to least important.
reg_imp_class = (
    pd.DataFrame({'imp': class_model.feature_importances_}, index=X_train.columns)
    .round(4)
    .sort_values('imp', ascending=False)
)

reg_imp_class

Unnamed: 0,imp
f__contacts_count_12_mon,0.4216
f__total_ct_chng_q4_q1,0.1741
f__total_trans_ct,0.0691
f__total_relationship_count,0.067
f__total_trans_amt,0.0514
f__total_revolving_bal,0.0369
f__months_inactive_12_mon,0.0277
f__total_amt_chng_q4_q1,0.0154
f__avg_utilization_ratio,0.0138
f__credit_limit,0.0128


In [229]:
# tests

In [231]:
from sklearn.preprocessing import MinMaxScaler

In [232]:
# NOTE(review): this instance is not referenced by any later cell shown here;
# each later use constructs its own MinMaxScaler().
scaler = MinMaxScaler()

In [247]:
# Scratch frame holding both predictions scaled to [0, 1]. The explicit copy
# keeps the writes below from targeting a slice of train_raw.
XX = train_raw[['__churn_prob', '__price_predict']].copy()
XX[['__churn_prob', '__price_predict']] = MinMaxScaler().fit_transform(XX)
# Default band label for rows that no band below matches.
XX["lbl"] = 1

In [309]:
# Colour bands for the plot: rows are grouped by d = y - tg*x (scaled price
# minus weighted scaled churn probability) in steps of 0.01.
tg = 1

diff = (XX['__price_predict'] - tg * XX['__churn_prob']).to_numpy()
labels = XX['lbl'].to_numpy().copy()
# Bands are swept from high to low, matching the original per-row loop: a
# value lying exactly on a band edge matches both neighbours and keeps the
# lower band, which is visited last.
for j in range(100, -100, -1):
    in_band = (diff >= j / 100) & (diff <= (j + 1) / 100)
    labels[in_band] = j + 100

# One whole-column assignment instead of per-row writes through chained
# indexing, which does not reach the frame under pandas Copy-on-Write.
XX['lbl'] = labels

In [310]:
# Scatter of scaled churn probability (x) against scaled predicted price (y),
# coloured by the band label.
# NOTE(review): `px` is presumably plotly.express, but no import of it appears
# in the cells shown here — confirm it is imported before this cell runs.
fig = px.scatter(XX, x='__churn_prob', y='__price_predict', color="lbl")
fig.show()

Идея заключается в следующем:
С точки зрения экономики «хорошим» вложением можно считать либо рискованное вложение, обещающее высокий доход, либо относительно безрисковое, но с небольшим доходом. В этом смысле самое худшее вложение — это высокий риск и низкий доход.

Если отнормировать данные от 0 до 1 и представить их на координатной плоскости (ось x — вероятность оттока `__churn_prob`, ось y — предсказанная цена `__price_predict`), то получится, что худшему вложению соответствует окрестность точки (1,0): почти с вероятностью 1 произойдет задержка дохода, а даже если задержки не произойдет, премия за риск будет минимальна.

Исходя из этих нехитрых соображений, будем выставлять приоритет как квадрат евклидова расстояния от точки (1,0). Также (это уже реализовано в функции alg1) дополним его двумя параметрами, tg и shift: первый — вес слагаемого по оси вероятности, превращающий линии уровня из окружностей в эллипсы, а второй — параметр сдвига, которым мы задаем минимальное (квадратичное) расстояние для выдачи кредита.

In [425]:
XX=train_raw[['__churn_prob','__price_predict']]
XX[['__churn_prob','__price_predict']]=MinMaxScaler().fit_transform(XX)
XX["lbl"]=0
tg=1
for i in range(len(XX)):
    x=XX['__churn_prob'].iloc[i]
    y=XX['__price_predict'].iloc[i]
    XX["lbl"].iloc[i] = (y)**2+tg*(x-1)**2

In [426]:
# Scatter of scaled churn probability (x) against scaled predicted price (y),
# coloured by the distance-based label.
# NOTE(review): `px` is presumably plotly.express, but no import of it appears
# in the cells shown here — confirm it is imported before this cell runs.
fig = px.scatter(XX, x='__churn_prob', y='__price_predict', color="lbl")
fig.show()