In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

sns.set()

# Loading the data

In [2]:
# Client-level tables (one row per client).
client_train_df = pd.read_csv('client_train.csv')
client_test_df = pd.read_csv('client_test.csv')

# Invoice-level tables (many rows per client, linked through client_id).
invoice_train_df = pd.read_csv('invoice_train.csv')
invoice_test_df = pd.read_csv('invoice_test.csv')

# Submission template; its client_id column is reused when writing predictions.
sampleSubDf = pd.read_csv('SampleSubmission.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Feature engineering

In [3]:
def client_feature_change(client_df):
    """Add tenure information to a client table, modifying it in place.

    ``creation_date`` is converted to datetime and a new ``coop_time``
    column is written: ``(2021 - creation year) * 12 - creation month``,
    i.e. a month count measured against the fixed reference year 2021.

    Nothing is returned; the caller's frame is mutated.
    """
    created = pd.to_datetime(client_df['creation_date'])
    client_df['creation_date'] = created

    # Whole years up to 2021 expressed in months, minus the creation month.
    client_df['coop_time'] = 12 * (2021 - created.dt.year) - created.dt.month

# Both client tables receive the same in-place tenure features.
for clients in (client_train_df, client_test_df):
    client_feature_change(clients)

In [4]:
def invoice_feature_change(invoice_df):
    """Return an enriched, chronologically ordered copy of an invoice table.

    The input frame is left untouched; all work happens on a copy, so the
    function can safely be called again on the same raw frame.

    Columns rewritten:
        counter_type    "ELEC" -> 1, "GAZ" -> 0.
        counter_statue  mapped onto 0..5 via the lookup below.
        invoice_date    parsed to datetime (``dayfirst=True``).

    For both mapped columns, any value that is not a key of the lookup
    becomes NaN (behaviour of ``Series.map`` with a dict).

    Columns added:
        invoice_month, invoice_year  calendar parts of ``invoice_date``.
        is_weekday   ``dayofweek // 5``. NOTE: despite its name this is 1
                     for Saturday/Sunday and 0 for Monday-Friday; the name
                     is kept because later cells reference it.
        delta_index  ``new_index - old_index``.
        delta_time   days since the same client's previous invoice
                     (NaN for each client's first invoice).

    Returns:
        A new DataFrame sorted by ``client_id`` then ``invoice_date``.
    """
    # Copy first: every assignment below would otherwise leak into the
    # caller's frame, and re-mapping already-mapped values yields NaN.
    invoice_df = invoice_df.copy()

    invoice_df['counter_type'] = invoice_df['counter_type'].map({"ELEC":1,"GAZ":0})
    # The lookup carries both int and str keys, so the raw column is
    # expected to hold mixed types; unusual codes are folded into 5.
    invoice_df['counter_statue'] = invoice_df['counter_statue'].map({0:0,1:1,2:2,3:3,4:4,5:5,769:5,'0':0,'5':5,'1':1,'4':4,'A':0,618:5,269375:5,46:5,420:5})

    invoice_df['invoice_date'] = pd.to_datetime(invoice_df['invoice_date'], dayfirst=True)
    invoice_date = invoice_df['invoice_date']
    invoice_df['invoice_month'] = invoice_date.dt.month
    invoice_df['invoice_year'] = invoice_date.dt.year

    # dayofweek runs 0 (Monday) .. 6 (Sunday); integer division by 5
    # therefore flags the weekend.
    invoice_df['is_weekday'] = invoice_date.dt.dayofweek // 5
    invoice_df['delta_index'] = invoice_df['new_index'] - invoice_df['old_index']

    # diff() below compares each row with the previous one of the same
    # client, so rows must be in chronological order per client first.
    invoice_df = invoice_df.sort_values(by=['client_id','invoice_date'])
    invoice_df['delta_time'] = invoice_df.groupby('client_id')['invoice_date'].diff().dt.days

    return invoice_df

# Replace both raw invoice tables with their enriched, sorted versions.
invoice_train_df, invoice_test_df = map(
    invoice_feature_change, (invoice_train_df, invoice_test_df)
)

In [5]:
def _flatten_agg_columns(agg_frame):
    """Rename ('column', 'stat') column pairs to 'column_stat' and move the group key out of the index."""
    agg_frame.columns = ['_'.join(col).strip() for col in agg_frame.columns.values]
    return agg_frame.reset_index()


def agg_feature(invoice, client_df, agg_stat):
    """Collapse invoice rows into one feature row per client.

    Parameters:
        invoice:   invoice-level frame. Must contain ``client_id``,
                   ``delta_time``, ``is_weekday``, ``counter_code``,
                   ``counter_number``, ``old_index``, ``new_index`` and
                   every column named in ``agg_stat``. Rows are expected to
                   be ordered per client already, because the index
                   continuity check compares each row with the previous one.
        client_df: client-level frame with ``client_id`` and ``coop_time``.
        agg_stat:  names of invoice columns to summarise with
                   mean / std / min / max (``delta_time`` is always added).

    Returns:
        A new frame with one row per row of ``client_df`` (left joins
        throughout), its original columns followed by:
        ``is_weekday_mean``, ``counter_code_nunique``,
        ``counter_number_nunique``, ``new_old_index_not_equal``,
        ``transactions_count``, the ``<col>_<stat>`` aggregates and
        ``invoice_per_cooperation``. Clients without invoices get NaN in
        the invoice-derived columns. Neither input frame is modified.
    """
    by_client = invoice.groupby('client_id')

    # Per-client summary statistics plus the raw invoice count.
    stats = _flatten_agg_columns(
        by_client[list(agg_stat) + ['delta_time']].agg(['mean', 'std', 'min', 'max'])
    )
    counts = by_client.size().reset_index(name='transactions_count')
    agg_trans = pd.merge(counts, stats, on='client_id', how='left')

    weekday_avg = _flatten_agg_columns(by_client[['is_weekday']].agg(['mean']))
    client_df = pd.merge(client_df, weekday_avg, on='client_id', how='left')

    unique_count = _flatten_agg_columns(
        by_client[['counter_code', 'counter_number']].agg(['nunique'])
    )
    client_df = pd.merge(client_df, unique_count, on='client_id', how='left')

    # Count invoices whose old_index differs from the previous invoice's
    # new_index. Only the three columns involved are copied.
    readings = invoice[['client_id', 'old_index', 'new_index']].copy()
    readings['new_index_shift'] = readings.groupby('client_id')['new_index'].shift()
    # Drop only rows where the comparison is undefined (each client's first
    # invoice, or a missing old_index). A bare dropna() would also discard
    # invoices that merely have a NaN in some unrelated column.
    readings = readings.dropna(subset=['old_index', 'new_index_shift'])
    readings = readings.assign(
        new_old_index_not_equal=(readings['old_index'] != readings['new_index_shift']).astype(int)
    )
    mismatches = readings.groupby('client_id')[['new_old_index_not_equal']].sum().reset_index()
    client_df = pd.merge(client_df, mismatches, on='client_id', how='left')

    full_df = pd.merge(client_df, agg_trans, on='client_id', how='left')

    # A zero coop_time would produce +/-inf, which a later fillna() does not
    # clean up; masking the zero yields NaN instead.
    coop_time = full_df['coop_time']
    full_df['invoice_per_cooperation'] = full_df['transactions_count'] / coop_time.where(coop_time != 0)

    return full_df

# Invoice columns summarised per client (mean / std / min / max).
agg_stat_columns = [
    # meter and tariff descriptors
    "tarif_type",
    "counter_number",
    "counter_statue",
    "counter_code",
    "reading_remarque",
    # consumption levels
    "consommation_level_1",
    "consommation_level_2",
    "consommation_level_3",
    "consommation_level_4",
    # index readings and billing information
    "old_index",
    "new_index",
    "months_number",
    "counter_type",
    # columns derived in invoice_feature_change
    "invoice_month",
    "invoice_year",
    "delta_index",
]

# One row per client: client attributes joined with invoice aggregates.
train_df = agg_feature(
    invoice=invoice_train_df,
    client_df=client_train_df,
    agg_stat=agg_stat_columns,
)
test_df = agg_feature(
    invoice=invoice_test_df,
    client_df=client_test_df,
    agg_stat=agg_stat_columns,
)

In [6]:
def new_features(df, columns=None):
    """Add spread features derived from the per-client aggregates.

    For every name ``c`` in ``columns`` the frame must already hold
    ``c_max``, ``c_min`` and ``c_mean``; two columns are added:

        c_range     ``c_max - c_min``
        c_max_mean  ``c_max / c_mean``; NaN when ``c_mean`` is 0, so the
                    ratio never becomes +/-inf.

    Parameters:
        df:      aggregated client frame; modified in place.
        columns: base column names to process. Defaults to the notebook's
                 ``agg_stat_columns`` list when omitted.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    if columns is None:
        columns = agg_stat_columns

    for col in columns:
        col_max = df[col+'_max']
        col_mean = df[col+'_mean']
        df[col+'_range'] = col_max - df[col+'_min']
        # Mask a zero mean: x/0 would give inf, which fillna() cannot remove.
        df[col+'_max_mean'] = col_max / col_mean.where(col_mean != 0)

    return df

train_df = new_features(train_df)
test_df = new_features(test_df)

# Identifier and raw date columns, plus a hand-picked set of aggregates,
# are excluded from the model inputs. The same list applies to both tables.
columns_to_drop = ['client_id', 'creation_date'] + [
    'reading_remarque_max', 'counter_statue_min', 'counter_type_min',
    'counter_type_max', 'counter_type_range', 'tarif_type_max',
    'delta_index_min', 'consommation_level_4_mean',
]

# Remaining missing values (e.g. std of a single invoice) become 0.
train_df = train_df.drop(columns=columns_to_drop).fillna(0)
test_df = test_df.drop(columns=columns_to_drop).fillna(0)
train_df

Unnamed: 0,disrict,client_catg,region,target,coop_time,is_weekday_mean,counter_code_nunique,counter_number_nunique,new_old_index_not_equal,transactions_count,...,new_index_max_mean,months_number_range,months_number_max_mean,counter_type_max_mean,invoice_month_range,invoice_month_max_mean,invoice_year_range,invoice_year_max_mean,delta_index_range,delta_index_max_mean
0,60,11,101,0.0,312,0.028571,2,1,5.0,35,...,1.741318,10,2.592593,1.000000,11,1.794872,14,1.003907,1348,3.818482
1,69,11,107,0.0,223,0.054054,1,1,19.0,37,...,1.680649,6,1.850000,1.000000,9,1.947368,14,1.003560,1017,2.164865
2,62,11,301,0.0,417,0.055556,2,1,5.0,18,...,1.323484,8,1.862069,1.000000,9,1.663866,14,1.004006,2894,3.684399
3,69,11,105,0.0,289,0.050000,1,1,0.0,20,...,1.195595,2,1.428571,1.000000,9,1.801802,7,1.001693,15,12.500000
4,62,11,303,0.0,74,0.285714,1,1,10.0,14,...,1.754937,2,1.076923,1.000000,10,1.584906,4,1.001062,2258,2.581714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135488,62,11,304,0.0,197,0.014085,2,3,53.0,71,...,1.918919,6,1.893333,1.918919,8,1.655012,14,1.003198,0,0.000000
135489,63,11,311,0.0,98,0.000000,2,3,29.0,41,...,4.435687,2,1.025000,2.050000,8,1.653226,6,1.001573,831,4.453143
135490,63,11,311,0.0,109,0.194444,2,2,28.0,36,...,3.236447,4,1.945946,1.636364,10,1.741935,7,1.002137,1042,3.936527
135491,60,11,101,0.0,324,0.000000,1,1,0.0,2,...,1.026858,4,1.333333,1.000000,4,1.200000,0,1.000000,329,1.443995


# Model and evaluation

In [7]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

# Features are every column except the label; the label is kept as a
# single-column DataFrame.
train = train_df.drop(columns=['target'])
target = train_df[['target']]

# Hold out 20% of the clients for evaluation (fixed seed for repeatability).
X_train, X_eval, y_train, y_eval = train_test_split(
    train, target, test_size=0.2, random_state=1
)

In [8]:
model = XGBClassifier(learning_rate=0.19)
# y_train is a single-column DataFrame (it comes from train_df[['target']]).
# Estimators expect a 1-D label vector, so flatten it here instead of
# relying on an implicit conversion that emits a warning.
model.fit(X_train, y_train.to_numpy().ravel())

# Class probabilities for both splits; thresholding happens in a later cell.
train_predictions = model.predict_proba(X_train)
eval_predictions = model.predict_proba(X_eval)

  return f(**kwargs)


In [16]:
def optimize(x, threshold=0.15):
    """Turn a probability into a hard 0/1 label.

    Parameters:
        x:         predicted probability (any value comparable with ``<``).
        threshold: decision cut-off; defaults to 0.15, the value used
                   throughout this notebook.

    Returns:
        0 when ``x`` is strictly below ``threshold``, otherwise 1.
    """
    return 0 if x < threshold else 1
# Hard labels from the second predict_proba column
# (presumably the positive class - confirm against model.classes_).
train_predict = list(map(optimize, train_predictions[:, 1]))
eval_predict = list(map(optimize, eval_predictions[:, 1]))

from sklearn.metrics import f1_score, precision_recall_fscore_support, roc_auc_score
# Each result is a 4-tuple (precision, recall, fscore, support) holding one
# entry per class; the print below picks the entry at index 1 from each.
train_score = precision_recall_fscore_support(y_train, train_predict)
eval_score = precision_recall_fscore_support(y_eval, eval_predict)
print(f'train_score: {[x[1] for x in train_score]} \n eval_score: {[x[1] for x in eval_score]}')

train_score: [0.4913973982375157, 0.7696352283930332, 0.5998207196824178, 6086] 
 eval_score: [0.35498366013071897, 0.5871621621621622, 0.4424643584521385, 1480]


# Submission

In [None]:
# Refit on every labelled client. `target` is a single-column DataFrame,
# so it is flattened to the 1-D label vector that estimators expect.
model.fit(train, target.to_numpy().ravel())

# Select the test columns explicitly in training order, so a missing or
# reordered feature fails loudly instead of being scored silently.
predictions = model.predict_proba(test_df[train.columns])
predictions = [optimize(x) for x in predictions[:, 1]]

# NOTE(review): this pairs predictions with the template's client_id purely
# by position - confirm SampleSubmission.csv lists clients in the same order
# as client_test.csv.
submission = pd.DataFrame({
        "client_id": sampleSubDf["client_id"],
        "target": predictions
    })

# Create the output folder on a fresh checkout; to_csv does not do it.
submission_path = Path('submissions/submission_4.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

In [None]:
# Share of each label value among all training clients.
target['target'].value_counts().div(len(target))