In [None]:
import os
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Load the company dataset from a remote CSV into a DataFrame.
# pd.read_csv fetches the URL directly, so this cell needs network access.
url = 'https://storage.googleapis.com/anggarin_bucket/dataset/company_information.csv'
data = pd.read_csv(url)

In [None]:
# Clean the numeric columns: drop thousands separators (',') and percent
# signs ('%') from every cell, then cast the column to float.
numerical_features = [
    'Revenue (IDR)',
    'Gross Profit (IDR)',
    'Net Income (IDR)',
    'Market Cap (IDR)',
    'Annual EPS',
    'Return on Equity (%)',
    '1 Year Price Returns (%)',
    '3 Year Price Returns (%)',
    '5 Year Price Returns (%)',
    'Dividend Yield (%)',
    'Payout Ratio (%)',
]
for col in numerical_features:
    # One character class removes both symbols in a single regex pass.
    cleaned = data[col].replace(r'[,%]', '', regex=True)
    # Assign column by column so each column is replaced with a float column.
    data[col] = cleaned.astype(float)

In [None]:
# Feature matrix: only the cleaned numeric columns listed in numerical_features.
features = data.loc[:, numerical_features]

# Min-max normalisation: fit the scaler on the features, then transform the
# same rows (two explicit steps instead of fit_transform).
scaler = MinMaxScaler().fit(features)
features_scaled = scaler.transform(features)

# Keep the ticker column ('Kode Saham') around for look-ups by position.
stocks = data.loc[:, 'Kode Saham']

In [None]:
# Seed Python, NumPy and TensorFlow before any layer is created, so the weight
# initialisation (and everything derived from it: embeddings, clusters,
# recommendations) is repeatable on a Restart & Run All. 42 matches the
# random_state used for KMeans further down.
# NOTE(review): bit-exact results on GPU may additionally need
# tf.config.experimental.enable_op_determinism() -- confirm if that matters.
tf.keras.utils.set_random_seed(42)

# Input layer: one vector per stock, one entry per scaled numeric feature.
input_stock = tf.keras.layers.Input(shape=(features_scaled.shape[1],), name="stock_features")

# Encoder: 128 -> 64 -> 32 units. The 32-unit "embedding" layer is the stock
# representation used later for similarity and clustering. Note that it is
# wider than the input (11 features in the recorded summary), so it is not a
# compression bottleneck.
x = tf.keras.layers.Dense(128, activation='relu')(input_stock)
x = tf.keras.layers.Dense(64, activation='relu')(x)
embedding = tf.keras.layers.Dense(32, activation='relu', name="embedding")(x)

# Decoder: mirror of the encoder, ending in a linear layer with one output
# per input feature.
x = tf.keras.layers.Dense(64, activation='relu')(embedding)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output_reconstructed = tf.keras.layers.Dense(features_scaled.shape[1], activation='linear')(x)

# Autoencoder: trained to reproduce its input, optimised with Adam on MSE.
model = tf.keras.Model(inputs=input_stock, outputs=output_reconstructed)
model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 stock_features (InputLayer  [(None, 11)]              0         
 )                                                               
                                                                 
 dense (Dense)               (None, 128)               1536      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 embedding (Dense)           (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 64)                2112      
                                                                 
 dense_3 (Dense)             (None, 128)               8320      
                                                             

In [None]:
# Train the autoencoder: input and target are both features_scaled, so the
# network learns to reconstruct its own input. 100 epochs with batch_size=2;
# verbose=0 silences the per-epoch log. As the last expression of the cell,
# the returned History object is echoed as the cell output.
model.fit(features_scaled, features_scaled, epochs=100, batch_size=2, verbose=0)

<keras.src.callbacks.History at 0x29c3cce0550>

In [None]:
# Extract the embeddings: build a second model that reuses the autoencoder's
# input tensor and stops at the "embedding" layer, then run it on the scaled
# features to get one embedding vector per stock.
encoder = tf.keras.Model(inputs=input_stock, outputs=embedding)
embeddings = encoder.predict(features_scaled)



In [None]:
# Pairwise cosine similarity between all stock embeddings
# (one row and one column per stock).
similarity_matrix = cosine_similarity(embeddings)

In [None]:
# Evaluate the autoencoder: reconstruct the scaled features and report the
# mean squared error against the originals.
# Note: this uses the same rows the model was fitted on, so it measures
# training-set reconstruction error, not performance on held-out data.
reconstructed = model.predict(features_scaled)
mse = tf.keras.losses.MeanSquaredError()(features_scaled, reconstructed).numpy()
print(f"Reconstruction MSE: {mse:.4f}")


Reconstruction MSE: 0.0002


In [None]:
# K-Means clustering on the autoencoder embeddings.
n_clusters = 5  # number of clusters requested
# n_init is pinned explicitly: relying on the implicit default triggers the
# warning recorded in this cell's output, and the default changes to 'auto' in
# newer scikit-learn releases. 10 is the value in effect for the recorded run,
# so the clustering behaviour stays the same across library versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Attach the cluster label of each stock to the main DataFrame.
data['Cluster'] = clusters

# Silhouette score of the clustering, computed in embedding space.
sil_score = silhouette_score(embeddings, clusters)
print(f"Silhouette Score: {sil_score:.4f}")

# Restrict the cluster analysis to the numeric columns plus the label.
numerical_cluster_data = data[numerical_features + ['Cluster']]

# Per-cluster mean of every numeric feature (original, unscaled units).
cluster_summary = numerical_cluster_data.groupby('Cluster').mean()
print("Cluster Summary:")
print(cluster_summary)

  super()._check_params_vs_input(X, default_n_init=10)


Silhouette Score: 0.2033
Cluster Summary:
         Revenue (IDR)  Gross Profit (IDR)  Net Income (IDR)  \
Cluster                                                        
0         1.984337e+13        6.138118e+12      1.873947e+12   
1         1.684212e+14        4.111375e+13      2.476450e+13   
2         6.944706e+12        2.069544e+12      9.141176e+11   
3         1.558536e+13        4.811720e+12      1.371480e+12   
4         1.342967e+13        6.275333e+12      2.752333e+12   

         Market Cap (IDR)   Annual EPS  Return on Equity (%)  \
Cluster                                                        
0            1.868167e+13   263.202632             10.747895   
1            1.943788e+14  1811.407500             17.792500   
2            9.123765e+12   252.703529             17.685882   
3            1.551952e+13   166.085600              6.851200   
4            1.940300e+13   220.940000             16.743333   

         1 Year Price Returns (%)  3 Year Price Returns (%) 

In [None]:
# Recommendation function
def recommend_with_model_and_returns_and_data(target_returns, top_n=10, return_weight=0.1):
    """
    Rank stocks by embedding similarity minus a penalty for missing the
    requested price returns, and return the best ``top_n``.

    Reads the notebook-level objects ``data``, ``encoder`` and
    ``features_scaled``.

    Parameters
    ----------
    target_returns : scalar, list or array
        Target price returns in percent, ordered
        [1 Year Return, 3 Year Return, 5 Year Return]. Shorter inputs are
        allowed: a single value (or a bare number) targets only the 1-year
        return, two values target the 1- and 3-year returns.
    top_n : int, default 10
        Number of recommendations to return.
    return_weight : float, default 0.1
        Weight of the return-distance penalty in the combined score.

    Returns
    -------
    list of (pandas.Series, float)
        One tuple per recommended stock, best first: the stock's row
        restricted to 'Kode Saham' plus the numeric feature columns, and its
        combined score.

    Raises
    ------
    ValueError
        If more than three target returns are supplied.
    """
    return_columns = [
        '1 Year Price Returns (%)',
        '3 Year Price Returns (%)',
        '5 Year Price Returns (%)',
    ]
    detail_columns = [
        'Kode Saham', 'Revenue (IDR)', 'Gross Profit (IDR)', 'Net Income (IDR)',
        'Market Cap (IDR)', 'Annual EPS', 'Return on Equity (%)',
        '1 Year Price Returns (%)', '3 Year Price Returns (%)', '5 Year Price Returns (%)',
        'Dividend Yield (%)', 'Payout Ratio (%)',
    ]

    # Accept a bare number as well as a list/array of targets.
    try:
        targets = list(target_returns)
    except TypeError:
        targets = [target_returns]
    if len(targets) > len(return_columns):
        raise ValueError(
            f"target_returns accepts at most {len(return_columns)} values "
            f"(1, 3 and 5 year returns), got {len(targets)}"
        )

    # Absolute distance between each stock's returns and the targets, summed
    # over the horizons supplied. Each target is compared with its own return
    # column (previously every target was compared with the 1-year return).
    return_diff = pd.Series(0.0, index=data.index)
    for column, target in zip(return_columns, targets):
        return_diff = return_diff + (data[column] - target).abs()

    # Embedding of every stock from the trained encoder.
    stock_embeddings = encoder.predict(features_scaled)

    # Reference point in embedding space: the mean embedding over all stocks.
    # It does not depend on target_returns, so the similarity term only measures
    # how close a stock is to the "average" stock.
    avg_target_embedding = stock_embeddings.mean(axis=0)

    # Cosine similarity between the reference embedding and every stock.
    similarity_scores = cosine_similarity([avg_target_embedding], stock_embeddings).flatten()

    # Combined score: similarity minus the weighted return distance. Plain
    # NumPy arrays are used so that everything below is positional and does
    # not depend on the index labels of `data`.
    combined_score = similarity_scores - return_weight * return_diff.to_numpy()

    # Positions of the top_n highest combined scores, best first.
    similar_indices = combined_score.argsort()[::-1][:top_n]

    # Collect the displayed columns and the score of each recommended stock.
    recommendations = []
    for idx in similar_indices:
        stock_data = data.iloc[idx][detail_columns]
        recommendations.append((stock_data, combined_score[idx]))

    return recommendations

# Example: ask for recommendations with a single target return of 6.
target_returns = [6]
result = recommend_with_model_and_returns_and_data(target_returns, top_n=10)

# Columns printed for every recommended stock, and the line that separates
# one stock from the next.
display_columns = [
    'Kode Saham', 'Revenue (IDR)', 'Gross Profit (IDR)', 'Net Income (IDR)',
    'Market Cap (IDR)', 'Annual EPS', 'Return on Equity (%)',
    '1 Year Price Returns (%)', '3 Year Price Returns (%)', '5 Year Price Returns (%)',
    'Dividend Yield (%)', 'Payout Ratio (%)',
]
divider = "-" * 50

# Print each recommendation: ticker and combined score, then its data.
for stock_info, score in result:
    print(f"Stock: {stock_info['Kode Saham']}, Similarity Score: {score}")
    print(stock_info[display_columns])
    print(divider)

Stock: CLPI, Similarity Score: 0.9201551885604858
Kode Saham                            CLPI
Revenue (IDR)               892000000000.0
Gross Profit (IDR)          119000000000.0
Net Income (IDR)             53000000000.0
Market Cap (IDR)            323000000000.0
Annual EPS                          171.38
Return on Equity (%)                  9.56
1 Year Price Returns (%)              6.03
3 Year Price Returns (%)              1.93
5 Year Price Returns (%)             38.82
Dividend Yield (%)                   12.13
Payout Ratio (%)                     74.84
Name: 20, dtype: object
--------------------------------------------------
Stock: INCI, Similarity Score: 0.8752937984466553
Kode Saham                            INCI
Revenue (IDR)               377000000000.0
Gross Profit (IDR)           81000000000.0
Net Income (IDR)             19000000000.0
Market Cap (IDR)            120000000000.0
Annual EPS                          106.92
Return on Equity (%)                  4.31
1 Year P

In [None]:
# Map the reconstruction MSE onto a 0-100 scale: 100 - 100 * mse, floored at 0.
# NOTE(review): this is a rescaled reconstruction error on the training rows,
# not a classification accuracy; the "Accuracy" label should be read with that
# in mind.
accuracy = max(0, 100 - mse * 100)  # assumes a lower MSE is better
print(f"Model Accuracy (scaled): {accuracy:.2f}%")

Model Accuracy (scaled): 99.98%


In [None]:
# Create the output folder if it does not exist yet. The path is relative, so
# it is resolved against the current working directory.
folder_name = "../trainingModel"
os.makedirs(folder_name, exist_ok=True)

# Full path of the model file inside that folder (.h5 extension).
model_path = os.path.join(folder_name, "stock_recommendation_model.h5")

# Save the full autoencoder (`model`, not just `encoder`) to that path.
# NOTE(review): the fitted `scaler` is not saved by this cell; anything that
# loads the model elsewhere needs the same scaling applied to its inputs.
model.save(model_path)
print(f"Model saved to: {model_path}")

Model saved to: ../trainingModel\stock_recommendation_model.h5


  saving_api.save_model(


In [None]:
# import json
# from sklearn.preprocessing import MinMaxScaler

# # Simpan scaler sebagai file JSON
# scaler_data = {
#     'min': scaler.data_min_.tolist(),
#     'scale': scaler.scale_.tolist()
# }

# with open('scaler.json', 'w') as f:
#     json.dump(scaler_data, f)


In [None]:
# # Memuat kembali model dari folder
# loaded_model = tf.keras.models.load_model(model_path)
# print("Model loaded successfully.")

# # Ekstrak encoder dari model yang dimuat
# loaded_encoder = tf.keras.Model(inputs=loaded_model.input, outputs=loaded_model.get_layer("embedding").output)

# # Mendapatkan kembali embedding menggunakan encoder yang dimuat
# loaded_embeddings = loaded_encoder.predict(features_scaled)

# # Hitung ulang kemiripan kosinus dengan model yang dimuat
# loaded_similarity_matrix = cosine_similarity(loaded_embeddings)

In [None]:
# # Fungsi rekomendasi menggunakan model yang dimuat
# def recommend_from_loaded_model(stock_name, top_n=3):
#     idx = stocks[stocks == stock_name].index[0]
    
#     # Ambil skor kemiripan untuk saham tersebut
#     similarity_scores = loaded_similarity_matrix[idx]
    
#     # Urutkan berdasarkan skor (kecuali saham itu sendiri)
#     similar_indices = similarity_scores.argsort()[::-1][1:top_n+1]
#     similar_stocks = stocks.iloc[similar_indices]
#     similar_scores = similarity_scores[similar_indices]
    
#     # Gabungkan hasil (nama saham dan skor)
#     recommendations = list(zip(similar_stocks, similar_scores))
#     return recommendations

In [None]:
# # Contoh penggunaan
# result = recommend_from_loaded_model('AALI', top_n=3)
# print(result)

In [None]:
# # Mendapatkan prediksi rekonstruksi dari model
# predicted_features = loaded_model.predict(features_scaled)

# # Menghitung MAE antara input asli dan rekonstruksi
# mae = mean_absolute_error(features_scaled, predicted_features)

# print(f"Mean Absolute Error (MAE) of the model: {mae:.4f}")