In [385]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

In [386]:
train_path = r'..\data\splitted_features\train.parquet'
test_path = r'..\data\splitted_features\test.parquet'
val_path =  r'..\data\splitted_features\validation.parquet'

In [387]:
train_path_ck = r'..\data\splitted\train.parquet'
test_path_ck = r'..\data\splitted\test.parquet'
val_path_ck =  r'..\data\splitted\validation.parquet'

In [388]:
train_df_ck = pd.read_parquet(train_path_ck)
test_df_ck = pd.read_parquet(test_path_ck)
val_df_ck = pd.read_parquet(val_path_ck)
 

In [389]:
train_vl = train_df_ck['fund_cnpj'].value_counts()

In [390]:
train_vl[train_vl < 300]

fund_cnpj
47192024000153    289
47212318000108    285
47947725000155    284
34027803000151    284
42847926000167    283
47936719000100    272
47716448000170    269
46351969000108    260
41778182000103    255
46449462000191    253
48698417000104    248
48106995000104    240
42905004000169    231
42905023000195    229
48329694000131    222
48373538000178    222
00829364000189    217
44361184000190    215
49450009000193    211
48948713000108    207
49843141000165    194
40463869000197    193
49996302000150    182
10396876000152    161
42904972000150    149
47716590000117    144
50679546000199    140
50443649000155    140
49984742000198    130
50955280000160    128
49962886000143    127
50786035000176    102
07908988000130     90
51540928000108     82
51598758000104     72
51013550000186     72
02233665000170     71
51890324000183     53
52008635000139     48
52286774000124     43
52874571000159     30
08823534000120     11
14017351000110      9
Name: count, dtype: int64

In [391]:
test_df_ck['fund_cnpj'].value_counts()

fund_cnpj
97519794000136    355
00068305000135    355
00071477000168    355
00073041000108    355
00180995000110    355
                 ... 
60606855000192     15
60302951000147     11
60762973000190     11
18507745000107      4
61031674000148      2
Name: count, Length: 2070, dtype: int64

In [392]:
val_df_ck['fund_cnpj'].value_counts()

fund_cnpj
97519794000136    128
57594567000150    128
57785049000114    128
58481981000116    128
59046429000162    128
                 ... 
36552549000136      9
01675497000100      9
45274700000102      8
31006723000121      5
51013550000186      3
Name: count, Length: 2049, dtype: int64

In [393]:
train_df = pd.read_parquet(train_path).dropna()
test_df = pd.read_parquet(test_path).dropna()
val_df = pd.read_parquet(val_path).dropna()
 


In [394]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(1938, 29)
(1702, 29)
(1529, 29)


In [395]:
train_df.columns

Index(['mean_return', 'median_return', 'std_return', 'skew_return',
       'kurt_return', 'max_drawdown', 'avg_drawdown', 'avg_time_drawdown',
       'max_time_drawdown', 'avg_gross_by_net', 'avg_inflow', 'avg_redemption',
       'avg_shareholders', 'mean_std_5', 'mean_std_10', 'mean_std_15',
       'std_std_5', 'std_std_10', 'std_std_15', 'sharpe', 'sharpe_mean_std_5',
       'sharpe_mean_std_10', 'sharpe_mean_std_15', 'ret_by_DD',
       'ret_by_max_DD', 'ret_by_timedd', 'ret_by_timedd_max',
       'volatility_ratio_5_10', 'volatility_ratio_10_15'],
      dtype='object')

In [396]:
drop_cols = ['avg_inflow',  'avg_redemption',  'avg_gross_by_net',  'avg_shareholders',
             'max_drawdown', 'avg_drawdown', 'mean_std_5'
             
             ]

keep_cols = ['mean_return', 'median_return', 'std_return', 'sharpe', 'mean_std_5']

# train_df = train_df.drop(drop_cols, axis=1)
# test_df = test_df.drop(drop_cols, axis=1)
# val_df = val_df.drop(drop_cols, axis=1)
 

 
train_df = train_df[keep_cols]
test_df = test_df[keep_cols]
val_df = val_df[keep_cols]

In [397]:
train_df.columns

Index(['mean_return', 'median_return', 'std_return', 'sharpe', 'mean_std_5'], dtype='object')

In [398]:
# train_df['max_drawdown'].describe()

In [399]:
train_df.corr()



Unnamed: 0,mean_return,median_return,std_return,sharpe,mean_std_5
mean_return,1.0,0.600343,-0.124695,0.189465,-0.048579
median_return,0.600343,1.0,-0.286545,0.2517,-0.293271
std_return,-0.124695,-0.286545,1.0,-0.526357,0.971951
sharpe,0.189465,0.2517,-0.526357,1.0,-0.527772
mean_std_5,-0.048579,-0.293271,0.971951,-0.527772,1.0


In [400]:
scaler = StandardScaler()
behavior_cols = train_df.columns
X_train = scaler.fit_transform(train_df[behavior_cols])
X_test  = scaler.transform(test_df[behavior_cols])
X_val   = scaler.transform(val_df[behavior_cols])


In [401]:
pca = PCA(n_components=int(X_train.shape[1] / 2) )
pca.fit(train_df)

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [402]:
train_df_pca = pca.transform(X_train)
test_df_pca = pca.transform(X_test)
val_df_pca = pca.transform(X_val)

 



In [403]:
behavior_cols = [
    "mean_return",
    "std_return",
    "avg_drawdown",
    "max_drawdown"
]


X_train = train_df_pca
X_test  = test_df_pca
X_val   = val_df_pca 


In [404]:
results = {}
for k in [2, 3]:
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    km.fit(X_train)
    labels = km.predict(X_test)
    score = silhouette_score(X_test, labels)
    results[k] = score

print("Silhouette scores:", results)

# Pick the best K (highest silhouette)
best_k = max(results, key=results.get)
print("Best k:", best_k)

Silhouette scores: {2: 0.5159861467852287, 3: 0.561699859588351}
Best k: 3


In [405]:
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
gs = GaussianMixture(n_components=best_k, covariance_type='tied', random_state=0)

model = kmeans

train_clusters = model.fit_predict(X_train)
test_clusters  = model.predict(X_test)
val_clusters   = model.predict(X_val)


In [406]:
train_df.columns

Index(['mean_return', 'median_return', 'std_return', 'sharpe', 'mean_std_5'], dtype='object')

In [407]:
# 'mean_return', 'std_return', 
look_features = ['sharpe','avg_drawdown', 'avg_time_drawdown', 'max_time_drawdown']

In [408]:
(train_df / train_df.quantile(0.98)).max()

mean_return      1.850661
median_return    1.873420
std_return       2.364953
sharpe           3.036667
mean_std_5       1.662571
dtype: float64

In [409]:
train_df.idxmax().value_counts()

08336054000134    1
22232927000190    1
46449462000191    1
42847926000167    1
36986672000165    1
Name: count, dtype: int64

In [410]:
all_unique = pd.concat([
    train_df.idxmax(),
    test_df.idxmax(), 
    val_df.idxmax()
]).unique()


In [411]:
len(all_unique)

13

In [412]:
look_features = keep_cols

In [413]:
# Attach back to fund ids
train_clustered = train_df.copy()
train_clustered["cluster"] = train_clusters

test_clustered = test_df.copy()
test_clustered["cluster"] = test_clusters

val_clustered = val_df.copy()
val_clustered["cluster"] = val_clusters 

look_features

print("\nTrain cluster summary:")
print(train_clustered.groupby("cluster").mean()[keep_cols])

print("\nTest cluster summary:")
print(test_clustered.groupby("cluster").mean()[keep_cols])

print("\n Val cluster summary:")
print(val_clustered.groupby("cluster").mean()[keep_cols])




Train cluster summary:
         mean_return  median_return  std_return    sharpe  mean_std_5
cluster                                                              
0           0.000212       0.000234    0.014160  0.014825    0.012688
1           0.000374       0.000443    0.000211  1.906636    0.000071
2           0.000324       0.000377    0.002554  0.235946    0.002075

Test cluster summary:
         mean_return  median_return  std_return    sharpe  mean_std_5
cluster                                                              
0           0.000275       0.000194    0.012168  0.025171    0.010648
1           0.000442       0.000444    0.000209  2.731470    0.000141
2           0.000343       0.000349    0.003153  0.196566    0.002653

 Val cluster summary:
         mean_return  median_return  std_return    sharpe  mean_std_5
cluster                                                              
0          -0.000747       0.000794    0.020145  0.080636    0.012862
1           0.000546

In [414]:
print(train_clustered["cluster"].value_counts())
print(test_clustered["cluster"].value_counts())
print(val_clustered["cluster"].value_counts())




cluster
2    775
0    744
1    419
Name: count, dtype: int64
cluster
2    824
0    655
1    223
Name: count, dtype: int64
cluster
2    972
0    448
1    109
Name: count, dtype: int64


In [415]:
cluster_number = 1
outlier_fund = train_clustered[train_clustered["cluster"] == cluster_number]
outlier_fund

Unnamed: 0_level_0,mean_return,median_return,std_return,sharpe,mean_std_5,cluster
fund_cnpj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00068305000135,0.000337,0.000413,0.000176,1.911670,0.000052,1
00071477000168,0.000306,0.000395,0.000158,1.943234,0.000008,1
00073041000108,0.000343,0.000428,0.000160,2.149431,0.000013,1
00180995000110,0.000326,0.000388,0.000289,1.129620,0.000185,1
00222725000124,0.000324,0.000373,0.000238,1.362139,0.000131,1
...,...,...,...,...,...,...
68599141000106,0.000309,0.000395,0.000157,1.963340,0.000010,1
68623479000156,0.000338,0.000414,0.000176,1.917015,0.000052,1
68971183000126,0.000375,0.000465,0.000295,1.271148,0.000071,1
96500715000182,0.000348,0.000435,0.000166,2.091142,0.000016,1
