In [357]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
import pickle

In [185]:
df = pd.read_csv('data/dune_good_vs_bad_trader.csv')

In [187]:
# Shuffle the entire DataFrame
df = shuffle(df, random_state=42).reset_index(drop=True)

In [195]:
# Remove emojis and the space after them
df['target_variable'] = df['target_variable'].str.replace(r'^[üî¥üü¢]\s*', '', regex=True)
df['trader_activity_status'] = df['trader_activity_status'].str.replace(r'^[üê£üê§üê¶]\s*', '', regex=True)
df['trader_volume_status'] = df['trader_volume_status'].str.replace(r'^[ü¶êüê≥üêü]\s*', '', regex=True)
df['trader_weekly_frequency_status'] = df['trader_weekly_frequency_status'].str.replace(r'^[üê£üê§üê¶]\s*', '', regex=True)

In [199]:
df['target_variable'] = df['target_variable'].map({'Good Trader': 1, 'Bad Trader': 0})


In [383]:
numerical = ['active_weeks', 'total_volume', 'tx_count_365d']

categorical = ['trader_activity_status', 'trader_weekly_frequency_status']

In [201]:
df

Unnamed: 0,active_weeks,target_variable,total_volume,trader_activity_status,trader_volume_status,trader_weekly_frequency_status,tx_count_365d,wallet
0,8,1,34876.863010,Frequent User,Middle Value Trader,OG,19,0xbba86bbc1945847513abd35c30d420da6d59ce9e
1,4,0,57.337284,Regular User,Low Value Trader,OG,8,0x003f1ff692a291651dcc6bc15f941bb4756fc019
2,4,0,3.352370,Frequent User,Low Value Trader,OG,14,0x33340753d4476a76ac34e725e5117666e0f59d4d
3,4,0,30.817263,Regular User,Low Value Trader,OG,6,0x13f2c402c15c860284fae850d7123e1007312b63
4,1,0,561.349378,Occasional User,Middle Value Trader,Unserious,1,0xebf660c819e0c25b35eda64a89d7923c7fcdf47b
...,...,...,...,...,...,...,...,...
9995,24,1,17317.132373,Frequent User,Middle Value Trader,OG,77,0x779582efdbbef0cf8fb50c01839690385651c58e
9996,15,1,3003.430424,Frequent User,Middle Value Trader,OG,32,0x82799b4f02d9429030820b7fa611a828d0d01381
9997,5,1,3608.988073,Frequent User,Middle Value Trader,OG,11,0xbd1a67411c23c6209bbf78fac9e03e18db18dcd8
9998,1,0,1.238881,Occasional User,Low Value Trader,Unserious,1,0x189bbe4c4aadd86a9625a77e13ec3a52d430d665


In [239]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [451]:
def train(df_train, y_train):
    dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    return dv, model

In [453]:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [455]:
C = 1.0
n_splits = 5

In [457]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.target_variable.values
    y_val = df_val.target_variable.values

    dv, model = train(df_train, y_train)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

print('%.3f +- %.3f' % (np.mean(scores), np.std(scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.935 +- 0.011


In [458]:
scores

[0.9261178861788617,
 0.939044725643442,
 0.9549702421410032,
 0.9303899643526353,
 0.9265381255752394]

In [461]:
dv, model = train(df_full_train, df_full_train.target_variable.values)
y_pred = predict(df_test, dv, model)

y_test = df_test.target_variable.values
auc = roc_auc_score(y_test, y_pred)
auc

0.9293228691659067

In [359]:
output_file = 'good_bad_trader_log_reg.bin'

In [363]:
with open(output_file, 'wb') as f_out: 
    pickle.dump((dv, model), f_out)

In [365]:
# Load the model
input_file = 'good_bad_trader_log_reg.bin'

In [367]:
with open(input_file, 'rb') as f_in: 
    dv, model = pickle.load(f_in)

In [371]:
model

In [381]:
df_test.loc[3886]

active_weeks                                                               9
target_variable                                                            1
total_volume                                                      761.279961
trader_activity_status                                         Frequent User
trader_volume_status                                     Middle Value Trader
trader_weekly_frequency_status                                            OG
tx_count_365d                                                             20
wallet                            0x94a5458bad9b21190f42a392512845707d91182f
Name: 3886, dtype: object

In [433]:
trader = {
    'active_weeks': 9,
    'total_volume': 761,
    'trader_activity_status' : 'Middle Value Trader',
    'trader_weekly_frequency_status': 'OG',
    'tx_count_365d': 20,
    'community_member': 'not_sure'
}

In [435]:
X = dv.transform([trader])

In [437]:
y_pred = model.predict_proba(X)[0, 1]
y_pred

0.7174203016473216