In [50]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
train_path = r'..\data\splitted_features\train.parquet'
test_path = r'..\data\splitted_features\test.parquet'
val_path =  r'..\data\splitted_features\validation.parquet'

In [52]:
train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)
val_df = pd.read_parquet(val_path)
 


In [36]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(2052, 16)
(2434, 16)
(2181, 16)


In [37]:
train_df.columns

Index(['mean_return', 'std_return', 'max_drawdown', 'avg_drawdown',
       'avg_gross_by_net', 'avg_inflow', 'avg_redemption', 'avg_shareholders',
       'mean_std_5', 'mean_std_10', 'mean_std_15', 'std_std_5', 'std_std_10',
       'std_std_15', 'sharpe', 'ret_by_DD'],
      dtype='object')

In [38]:
drop_cols = ['avg_inflow',  'avg_redemption',  'avg_shareholders', 'avg_gross_by_net', 'max_drawdown']

train_df = train_df.drop(drop_cols, axis=1)
test_df = test_df.drop(drop_cols, axis=1)
val_df = val_df.drop(drop_cols, axis=1)
 

In [39]:
train_df.corr()

Unnamed: 0,mean_return,std_return,avg_drawdown,mean_std_5,mean_std_10,mean_std_15,std_std_5,std_std_10,std_std_15,sharpe,ret_by_DD
mean_return,1.0,0.565498,0.008116,0.422081,0.333811,0.291887,0.121601,0.087925,0.080228,0.02635,-0.018476
std_return,0.565498,1.0,0.706815,0.761329,0.618494,0.534712,0.36248,0.255423,0.197047,-0.042229,-0.0226
avg_drawdown,0.008116,0.706815,1.0,0.616069,0.508835,0.43162,0.291334,0.196735,0.146061,-0.055563,-0.02855
mean_std_5,0.422081,0.761329,0.616069,1.0,0.838978,0.731404,0.713683,0.467122,0.357744,-0.034503,-0.033177
mean_std_10,0.333811,0.618494,0.508835,0.838978,1.0,0.876713,0.627812,0.707824,0.5442,-0.021693,-0.028628
mean_std_15,0.291887,0.534712,0.43162,0.731404,0.876713,1.0,0.554265,0.638446,0.699033,-0.010078,-0.029036
std_std_5,0.121601,0.36248,0.291334,0.713683,0.627812,0.554265,1.0,0.6926,0.55024,-0.041783,-0.011456
std_std_10,0.087925,0.255423,0.196735,0.467122,0.707824,0.638446,0.6926,1.0,0.814226,-0.013874,-0.02232
std_std_15,0.080228,0.197047,0.146061,0.357744,0.5442,0.699033,0.55024,0.814226,1.0,0.001178,-0.020757
sharpe,0.02635,-0.042229,-0.055563,-0.034503,-0.021693,-0.010078,-0.041783,-0.013874,0.001178,1.0,0.009919


In [40]:
scaler = StandardScaler()
behavior_cols = train_df.columns
X_train = scaler.fit_transform(train_df[behavior_cols])
X_val   = scaler.transform(test_df[behavior_cols])
X_test  = scaler.transform(val_df[behavior_cols])

In [41]:
pca = PCA(n_components=X_train.shape[1])
pca.fit(train_df)

0,1,2
,n_components,11
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [42]:
train_df_pca = pca.transform(X_train)
val_df_pca = pca.transform(X_test)
test_df_pca = pca.transform(X_val)
 



In [43]:
behavior_cols = [
    "mean_return",
    "std_return",
    "avg_drawdown",
    "max_drawdown"
]


X_train = train_df_pca
X_val   = test_df_pca
X_test  = val_df_pca

In [44]:
results = {}
for k in [2, 3, 4, 5, 6]:
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(X_train)
    score = silhouette_score(X_train, labels)
    results[k] = score

print("Silhouette scores:", results)

# Pick the best K (highest silhouette)
best_k = max(results, key=results.get)
print("Best k:", best_k)

Silhouette scores: {2: 0.40024134023798846, 3: 0.320874164331956, 4: 0.24840026151288314, 5: 0.23143873360659847, 6: 0.23713929407980233}
Best k: 2


In [45]:
kmeans = KMeans(n_clusters=best_k, random_state=0, n_init="auto")
train_clusters = kmeans.fit_predict(X_train)
val_clusters   = kmeans.predict(X_val)
test_clusters  = kmeans.predict(X_test)

In [46]:
# Attach back to fund ids
train_clustered = train_df.copy()
train_clustered["cluster"] = train_clusters

val_clustered = test_df.copy()
val_clustered["cluster"] = val_clusters

test_clustered = val_df.copy()
test_clustered["cluster"] = test_clusters

print("\nTrain cluster summary:")
print(train_clustered.groupby("cluster").mean()['sharpe'])

print("\nTest cluster summary:")
print(test_clustered.groupby("cluster").mean()['sharpe'])

print("\n Val cluster summary:")
print(val_clustered.groupby("cluster").mean()['sharpe'])




Train cluster summary:
cluster
0    0.366106
1    0.314001
Name: sharpe, dtype: float64

Test cluster summary:
cluster
0    0.483628
1    0.362840
Name: sharpe, dtype: float64

 Val cluster summary:
cluster
0    1.250617
1    0.324418
Name: sharpe, dtype: float64


In [47]:
print(train_clustered["cluster"].value_counts())


cluster
0    1568
1     484
Name: count, dtype: int64


In [48]:
cluster_number = 1
outlier_fund = train_clustered[train_clustered["cluster"] == cluster_number]
outlier_fund

Unnamed: 0_level_0,mean_return,std_return,avg_drawdown,mean_std_5,mean_std_10,mean_std_15,std_std_5,std_std_10,std_std_15,sharpe,ret_by_DD,cluster
fund_cnpj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00.398.561/0001-90,0.002213,0.010604,0.006428,0.005003,0.004758,0.005362,0.004233,0.003662,0.004680,0.208674,0.344269,1
00.524.617/0001-06,0.000043,0.006948,0.011628,0.004639,0.005341,0.006278,0.002527,0.003074,0.004822,0.006135,0.003666,1
00.819.858/0001-82,0.000351,0.000440,0.000024,0.005887,0.005923,0.005382,0.009268,0.006903,0.005466,0.797396,14.903396,1
00.819.880/0001-22,0.000019,0.006996,0.011816,0.004993,0.004578,0.005701,0.003775,0.002784,0.005409,0.002713,0.001606,1
01.063.897/0001-65,0.002873,0.009296,0.002888,0.006821,0.005718,0.005781,0.006128,0.004475,0.003394,0.309120,0.994910,1
...,...,...,...,...,...,...,...,...,...,...,...,...
53.633.558/0001-70,0.002302,0.008503,0.005632,0.005852,0.005811,0.005577,0.007154,0.005085,0.003997,0.270750,0.408792,1
55.234.165/0001-10,0.002461,0.008504,0.005428,0.006039,0.005152,0.004965,0.007144,0.005418,0.004262,0.289438,0.453436,1
63.375.216/0001-51,0.002515,0.007037,0.003674,0.004261,0.005283,0.004618,0.002369,0.006309,0.005074,0.357366,0.684440,1
74.326.471/0001-20,0.000489,0.005725,0.007687,0.004597,0.005591,0.005355,0.002612,0.004965,0.004188,0.085487,0.063668,1
