In [1]:
# general
import pyarrow.parquet as pq
import pandas as pd
from scipy import stats
import random

# scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# drawing
import seaborn as sns
import matplotlib.pyplot as plt

# transform
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# import xgboost as xgb
# import lightgbm as lgb


# # spark
# import findspark
# findspark.init()
# import pyspark
# from pyspark.sql import SparkSession

In [2]:
def open_df(part):
    
    import pandas as pd
    
    # open
    df = pd.read_csv('train_data/tst_'+str(part)+'.csv', header=None)
    
    # очень неудобные названия
    col_names = dict()
    for i in range(len(df.columns)):
                       col_names[df.columns[i]] = cols[i]
    df = df.rename(columns=col_names)
    
    answers = pd.read_csv('train_target.csv')
    df = pd.merge(df, answers, on='id', how='left')
    
    return df

In [3]:
def sample_df(df3, total_rows =  100000, neg_percent = 50, pos_percent = 50):
#     print( 'sample_df3 start')
    df3_pos = df3[df3['flag'] == 1].sample(int(total_rows / 100 * pos_percent))
    df3_neg = df3[df3['flag'] == 0].sample(int(total_rows / 100 * neg_percent))
    df3_pos = df3_pos.reset_index()
    df3_neg = df3_neg.reset_index()
    df3_pos = df3_pos.drop('index', axis=1)
    df3_neg = df3_neg.drop('index', axis=1)
    df3 = pd.concat([df3_pos, df3_neg])
    
#     print( 'sample_df3 end')
#     print('-')      
#     print('-')      
    #print('-') 
    
    return df3

In [4]:
def mean(lst): 
    return sum(lst) / len(lst) 

In [5]:
def engeneering_new_cols(df):
# listing cols to drop
    cols_to_drop = []
    for entry in df.columns:
        if 'enc_paym_' in entry :
            cols_to_drop.append(entry)
            
    cols_to_drop = cols_to_drop + ['pre_till_fclose', 'pre_till_pclose', 'pre_since_opened', 'pre_since_confirmed', 'pclose_flag']

# new column, product-wise
    df['closed_faster_than_expected'] = df['pre_fterm'] < df['pre_pterm'] 
    df['closed_faster_than_expected'] = df['closed_faster_than_expected'].apply(lambda x: int(x))
    
# new column, product-wise
    df['overdue_severity'] = 5 - (df['is_zero_loans5'] + df['is_zero_loans530'] + df['is_zero_loans3060'] + df['is_zero_loans6090'] + df['is_zero_loans90']) 
    
# new column, client-wise
    df['prone_to_overdue'] = 0

    for entry in df.id.unique():
        df.loc[df.id == entry, 'prone_to_overdue'] = sum(df[df.id == entry]['overdue_severity']) / df.loc[df.id == entry, 'rn'].shape[0]

# new column, client-wise
    df['prone_to_close_faster'] = 0

    for entry in df.id.unique():
        df.loc[df.id == entry, 'prone_to_close_faster'] = sum(df[df.id == entry]['closed_faster_than_expected']) / df.loc[df.id == entry, 'rn'].shape[0]
    
    return df

In [6]:
cols = ['id', 'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm', 'pre_fterm', 'pre_till_pclose', 'pre_till_fclose', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90', 'is_zero_loans5',
 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'enc_paym_0',
 'enc_paym_1', 'enc_paym_2', 'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11','enc_paym_12', 'enc_paym_13', 'enc_paym_14', 'enc_paym_15',
 'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20', 'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24', 'enc_loans_account_holder_type', 'enc_loans_credit_status','enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
 'fclose_flag']

In [9]:
df = open_df(0)
df2 = open_df(1)
df3 = open_df(2)

In [10]:
answers = pd.read_csv('train_target.csv')

df = pd.merge(df, answers, on='id', how='left')
df2 = pd.merge(df2, answers, on='id', how='left')
df3 = pd.merge(df3, answers, on='id', how='left')

In [None]:
df_cut_1 = pd.read_csv('train_data/df_only_last_product_1.csv')
df_cut_1 = df_cut_1.drop(0, axis=0)
df_cut = pd.read_csv('train_data/df_only_last_product_0.csv')
df_cut_merged = pd.concat([df_cut, df_cut_1], ignore_index=True)
# df_cut_merged = df_cut_merged.drop(['id'], axis=1)

# 7 гиппотеза. голосование моделей. попробуем на сете сразу из трех чанков.

порядок работы в этой гиппотезе такой:
1) раз делать хотим из 3х чанков - нужен 4й для валидации. распакуем спарком еще кусок данных в цсв
2) делаем сбалансированный сет из трех чанков, валидируем CV-шкой, записываем 3 лучшие модели (или 2?) и результаты
2.1) а вот вопрос - как делать голосование - большинством (и тогда 3 модели) или по большей вероятности? (и тогда 2 модели)
2.2) валидируем вручную на трейне, 3 прогона, среднее, записываем
3) валидируем на тесте (сет из 4-го куска данных (боже как я ненавиижу слово "чанк" в переложении на русский) человека, который придумал ввести его в обиход в траскрипции без изменений нужно насильно отлучить от русского!)
3) так вот. записываем результаты по каждой модели
4) валидируем на трейне большинством
5) валидируем на трейне по большей вероятности
6) валидируем на тесте большинством
7) валидируем на тесте по большей вероятности
8) расстраиваемся, что опять нифига не получилось

from pyspark.sql import SparkSession



spark = SparkSession.builder\
        .master("local[*]")\
        .appName('PySpark_Tutorial')\
        .getOrCreate()

tst = spark.read.option("header",True).option("delimiter",",").parquet("train_data/train_data_3.pq") #option("header",True).option("delimiter",",").
tst.show(2)

tst2 = tst.toPandas()

In [21]:
share = 45

df_cut_1 = sample_df(
    df, 
    round(df.flag.value_counts()[1] / share * 100 + 1, 0),
    100 - share,
    share
)

df_cut_2 = sample_df(
    df2, 
    round(df2.flag.value_counts()[1] / share * 100 + 1, 0),
    100 - share,
    share
)

df_cut_3 = sample_df(
    df3, 
    round(df3.flag.value_counts()[1] / share * 100 + 1, 0),
    100 - share,
    share
)

In [24]:
df_cut_merged = pd.concat([df_cut_1, df_cut_2, df_cut_3], ignore_index=True)


In [27]:
df_cut_merged.flag.value_counts()

flag
0    229259
1    187576
Name: count, dtype: int64

In [28]:
models = [
#     KNeighborsClassifier(),
    SVC(),
    RandomForestClassifier(),
    MLPClassifier(),
    LogisticRegression(),
#     DecisionTreeClassifier(),
    GaussianNB(),
]
#     xgb(),
#     lgb(),
# ]

In [31]:
df_cut_merged.flag.value_counts()

flag
0    229259
1    187576
Name: count, dtype: int64

In [32]:
df_cut_merged = sample_df(df_cut_merged, 200000, 55, 45)

In [33]:
x = df_cut_merged.drop('flag', axis=1)
y = df_cut_merged.flag

In [34]:
results_dict = dict()

In [35]:
for model in models:
    
    cv_scores = cross_val_score(model, x, y, cv=3, scoring='roc_auc')
    results_dict[str(model)[0:str(model).find('(')]] = mean(cv_scores)




In [36]:
results_dict

{'SVC': 0.5002394363932878,
 'RandomForestClassifier': 0.6496782482736708,
 'MLPClassifier': 0.5404545641659435,
 'LogisticRegression': 0.4979999459787403,
 'GaussianNB': 0.5823624441224031}

In [15]:
results_dict = {'SVC': 0.5002394363932878,
 'RandomForestClassifier': 0.6496782482736708,
 'MLPClassifier': 0.5404545641659435,
 'LogisticRegression': 0.4979999459787403,
 'GaussianNB': 0.5823624441224031}

In [22]:
results_dict

{'SVC': 0.5002394363932878,
 'RandomForestClassifier': 0.6496782482736708,
 'MLPClassifier': 0.5404545641659435,
 'LogisticRegression': 0.4979999459787403,
 'GaussianNB': 0.5823624441224031}

In [23]:
model_1 = RandomForestClassifier()
model_2 = GaussianNB()
model_3 = MLPClassifier()

In [29]:
share = 45

df_cut_1 = sample_df(
    df, 
    round(df.flag.value_counts()[1] / share * 100 + 1, 0),
    100 - share,
    share
)

df_cut_2 = sample_df(
    df2, 
    round(df2.flag.value_counts()[1] / share * 100 + 1, 0),
    100 - share,
    share
)

df_cut_3 = sample_df(
    df3, 
    round(df3.flag.value_counts()[1] / share * 100 + 1, 0),
    100 - share,
    share
)

In [32]:
df4 = open_df(3)

In [31]:
df_cut_merged_too_big = pd.concat([df_cut_1, df_cut_2, df_cut_3])

df_cut_merged_discard, df_cut_merged = train_test_split(
    df_cut_merged_too_big, 
    stratify=df_cut_merged_too_big['flag'], 
    test_size=0.3
)


x_train_merged = df_cut_merged.drop('flag', axis=1)
y_train_merged = df_cut_merged.flag

model_1_uni = RandomForestClassifier()
model_2_uni = GaussianNB()
model_3_uni = MLPClassifier()

model_1_uni.fit(x_train_merged, y_train_merged)
model_2_uni.fit(x_train_merged, y_train_merged)
model_3_uni.fit(x_train_merged, y_train_merged)



In [33]:
x_test = df4.drop('flag', axis=1)
y_test = df4.flag

In [36]:
pred_probs_1_uni = model_1_uni.predict_proba(x_test)[:, 1]
# pred_1_uni = model_1_uni.predict(x_test)
roc_auc = roc_auc_score(y_test, pred_probs_1_uni) 
print( roc_auc  )


pred_probs_2_uni = model_2_uni.predict_proba(x_test)[:, 1]
# pred_2_uni = model_2_uni.predict(x_test)
roc_auc = roc_auc_score(y_test, pred_probs_2_uni) 
print( roc_auc  )


pred_probs_3_uni = model_3_uni.predict_proba(x_test)[:, 1]
# pred_3_uni = model_3_uni.predict(x_test)
roc_auc = roc_auc_score(y_test, pred_probs_3_uni) 
print( roc_auc  )

0.6093650400154798
0.5738155311239542
0.5724079960951818


In [47]:
pred_probs = pd.DataFrame.from_records([pred_probs_1_uni, pred_probs_2_uni, pred_probs_3_uni])
pred_probs = pred_probs.T
# pred_probs['final'] = round(mean([preds[0], preds[1], preds[2] ]), 0)
# pred_probs



# roc_auc = roc_auc_score(y_test, y_pred_prob)



In [44]:
pred_probs['final'] =  mean([pred_probs[0], pred_probs[1], pred_probs[2] ])

In [46]:
roc_auc = roc_auc_score(y_test, pred_probs['final']) 
print( roc_auc  )


0.6149223615743065


In [30]:
x_train_1 = df_cut_1.drop('flag', axis=1)
y_train_1 = df_cut_1.flag

x_train_2 = df_cut_2.drop('flag', axis=1)
y_train_2 = df_cut_2.flag

x_train_3 = df_cut_3.drop('flag', axis=1)
y_train_3 = df_cut_3.flag

model_1.fit(x_train_1, y_train_1)
model_2.fit(x_train_2, y_train_2)
model_3.fit(x_train_3, y_train_3)



In [40]:
pred_probs_1 = model_1.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, pred_probs_1) 
print( roc_auc  )

pred_probs_2 = model_2.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, pred_probs_2) 
print( roc_auc  )

pred_probs_3 = model_3.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, pred_probs_3) 
print( roc_auc  )

0.6136492193267971
0.5753253913855202
0.5850299787144031


In [48]:
pred_probs = pd.DataFrame.from_records([pred_probs_1, pred_probs_2, pred_probs_3])
pred_probs = pred_probs.T
pred_probs['final'] =  mean([pred_probs[0], pred_probs[1], pred_probs[2] ])
roc_auc = roc_auc_score(y_test, pred_probs['final']) 
print( roc_auc  )


0.6016955807880824


In [49]:
pred_probs

Unnamed: 0,0,1,2,final
0,0.74,0.267445,1.645613e-58,0.335815
1,0.59,0.462343,3.705105e-36,0.350781
2,0.38,0.148550,9.593967e-85,0.176183
3,0.37,0.722362,3.015822e-51,0.364121
4,0.43,0.249320,2.153935e-67,0.226440
...,...,...,...,...
2112587,0.38,0.631642,9.157182e-63,0.337214
2112588,0.37,0.512132,3.805894e-68,0.294044
2112589,0.31,0.620992,2.403098e-52,0.310331
2112590,0.32,0.413201,4.594044e-69,0.244400


# Сбалансировать таргет 50/50 #


In [40]:
df_balanced = sample_df(df, 100000, 50, 50)

sample_df3 start
sample_df3 end
-


In [12]:
results_dict = dict()

In [42]:
models = [
    KNeighborsClassifier(),
#     SVC(),
#     RandomForestClassifier(),
#     MLPClassifier(),
    LogisticRegression(),
#     DecisionTreeClassifier(),
#     GaussianNB(),
]
#     xgb(),
#     lgb(),
# ]

In [41]:
x = df_balanced.drop('flag', axis=1)
y = df_balanced['flag']


In [43]:
model = KNeighborsClassifier()

In [44]:
df_train, df_test = train_test_split(df_balanced, stratify=df_balanced['flag'], test_size=0.3)

x_train = df_train.drop('flag', axis=1)
y_train = df_train.flag
x_test = df_test.drop('flag', axis=1)
y_test = df_test.flag

model = KNeighborsClassifier()
model.fit(x_train, y_train)

pred_train = model.predict(x_train)
pred_test = model.predict(x_test)
print('train score acc - ', accuracy_score(y_train, pred_train))
print('test score acc - ', accuracy_score(y_test, pred_test))


prob_train = model.predict_proba(x_train)[:, 1]
prob_test = model.predict_proba(x_test)[:, 1]
print('train score roc - ', roc_auc_score(y_train, prob_train))
print('test score roc - ', roc_auc_score(y_test, prob_test))



train score acc -  0.8264857142857143
test score acc -  0.7347666666666667
train score roc -  0.9163496330612244
test score roc -  0.8027531


In [36]:
model.fit(x,y)

In [37]:
df2_balanced  = sample_df(df2, 100000, 50, 50)

sample_df3 start
sample_df3 end
-


In [38]:
x_test = df2_balanced.drop('flag', axis=1)
y_test = df2_balanced['flag']

In [39]:
y_pred_prob = model.predict_proba(x_test)[:, 1]

# Calculate the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.49996999999999997


In [23]:
for model in models:
    
    cv_scores = cross_val_score(model, x, y, cv=3, scoring='roc_auc')
    results_dict[str(model)[0:str(model).find('(')]] = mean(cv_scores)


In [24]:
results_dict

{'LogisticRegression': 0.5402874331389534,
 'KNeighborsClassifier': 0.7992882246846347}

In [None]:
model = results_dict[????????]

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc')
grid_search.fit(x, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [15]:
new_row = dict()
for col in cols:
    new_row[col] = list(df.loc[df['id'] == 0,col])


In [16]:
new_row_df = pd.DataFrame.from_records(new_row)

new_row_df.iloc[1,1] = [1,2]

In [17]:
answers

Unnamed: 0,id,flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
2999995,2999995,0
2999996,2999996,0
2999997,2999997,0
2999998,2999998,0


sns.heatmap(df_balanced.corr(), cmap='coolwarm', linewidths=0.5)


df_corr = df_balanced.corr()

columns = df_corr.columns
rows = df_corr.columns

results_acc = [[]]*len(threshholds)

results_acc

In [100]:
threshholds = [0.01, 0.05, 0.08]#,[0.02, 0.03, 0.04, 0.05, 0.06, 0.07]
results_acc = [[]]*len(threshholds)
results_roc = [[]]*len(threshholds)
counter = 0
for entry_1 in threshholds:
    
    print('threshhold is ', entry_1)

    corrs = []
    for col in columns:
        corrs.append((df_balanced.flag.corr(df_balanced[col]), col))
    corrs = corrs[:-1]
    corrs = sorted(corrs)


    significant_cols = []

    threshhold = entry_1

    for entry in corrs:
        if entry[0] < -threshhold or entry[0] > threshhold:
            significant_cols.append(entry[1])
    significant_cols.append('flag')
    
    
    df_balanced_cut = df_balanced[significant_cols]
    
    for i in range(3):
        
        df_train, df_test = train_test_split(df_balanced_cut, stratify=df_balanced_cut['flag'], test_size=0.3)

        x_train = df_train.drop('flag', axis=1)
        y_train = df_train.flag
        x_test = df_test.drop('flag', axis=1)
        y_test = df_test.flag

        model = RandomForestClassifier()
        model.fit(x_train, y_train)

        
        pred_train = model.predict(x_train)
        pred_test = model.predict(x_test)
    #     print('train score acc - ', accuracy_score(y_train, pred_train))
    #     print('test score acc - ', accuracy_score(y_test, pred_test))
        results_acc[counter].append(accuracy_score(y_test, pred_test))


        prob_train = model.predict_proba(x_train)[:, 1]
        prob_test = model.predict_proba(x_test)[:, 1]
    #     print('train score roc - ', roc_auc_score(y_train, prob_train))
    #     print('test score roc - ', roc_auc_score(y_test, prob_test))
        results_roc[counter].append(roc_auc_score(y_test, prob_test))
        
        print(i, ' fits done in loop ', counter)



    counter += 1

threshhold is  0.01
0  fits done in loop  0
1  fits done in loop  0
2  fits done in loop  0
threshhold is  0.05
0  fits done in loop  1
1  fits done in loop  1
2  fits done in loop  1
threshhold is  0.08
0  fits done in loop  2
1  fits done in loop  2
2  fits done in loop  2


In [106]:
results_roc

[[0.6377428977777777,
  0.6372906488888889,
  0.6340239244444446,
  0.609586251111111,
  0.6111367733333335,
  0.6120779888888889,
  0.60261152,
  0.5999410177777779,
  0.6023576733333333],
 [0.6377428977777777,
  0.6372906488888889,
  0.6340239244444446,
  0.609586251111111,
  0.6111367733333335,
  0.6120779888888889,
  0.60261152,
  0.5999410177777779,
  0.6023576733333333],
 [0.6377428977777777,
  0.6372906488888889,
  0.6340239244444446,
  0.609586251111111,
  0.6111367733333335,
  0.6120779888888889,
  0.60261152,
  0.5999410177777779,
  0.6023576733333333]]

for i in range(len(results_roc)):
    print('for threshhold ',threshholds[i], ' avg is ', results_roc[i].mean())
for i in range(len(results_roc)):
    print('for threshhold ',threshholds[i], ' avg is ', results_acc[i].mean())

In [84]:
df_train, df_test = train_test_split(df_balanced, stratify=df_balanced['flag'], test_size=0.3)

In [85]:
x_train = df_train.drop('flag', axis=1)
y_train = df_train.flag
x_test = df_test.drop('flag', axis=1)
y_test = df_test.flag

rf =  RandomForestClassifier()

rf.fit(x_train, y_train)

In [49]:
model = LogisticRegression()

In [86]:
model = RandomForestClassifier()

In [59]:
model  = MLPClassifier()

# готовый скрипт 

In [75]:
# путь до данных на компьютере
path = 'train_data/'

In [48]:
import os
import pandas as pd
import tqdm


def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    """
    читает num_parts_to_read партиций, преобразовывает их к pd.DataFrame и возвращает
    :param path_to_dataset: путь до директории с партициями
    :param start_from: номер партиции, с которой нужно начать чтение
    :param num_parts_to_read: количество партиций, которые требуется прочитать
    :param columns: список колонок, которые нужно прочитать из партиции
    :return: pd.DataFrame
    """

    res = []
    dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                              if filename.startswith('train')])
    print(dataset_paths)

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    if verbose:
        print('Reading chunks:\n')
        for chunk in chunks:
            print(chunk)
    for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):
        print('chunk_path', chunk_path)
        chunk = pd.read_parquet(chunk_path,columns=columns)
        res.append(chunk)

    return pd.concat(res).reset_index(drop=True)

In [50]:
# answers
check = []
for i in range(len(answers)-1):
#     if i == 
    check.append(answers.id[i]+1 == answers.id[i+1])