In [None]:
# !pip install tensorflow_text
# !pip install tensorflow_hub

In [None]:
import pandas as pd


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Load the cleaned dataset of majors.
df = pd.read_csv('major_data_cleaned_300.csv')

# Build one free-text feature per row from the two text columns.
# Missing cells are replaced by '' first: `NaN + str` evaluates to NaN, and a
# float NaN cannot be lower-cased/split by the Keras Tokenizer.
df['combined_features'] = (
    df['Description'].fillna('').astype(str)
    + ' '
    + df['Mata Kuliah'].fillna('').astype(str)
)

# Tokenization: only the 5000 most frequent words are kept when converting
# texts to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['combined_features'])

# Converting text to fixed-length integer sequences (length 100).
sequences = tokenizer.texts_to_sequences(df['combined_features'])
padded_sequences = pad_sequences(sequences, maxlen=100)

# Target labels: map each value of the 'NAMA' column to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['NAMA'])


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# NOTE(review): the recorded outputs in this notebook show 294 output classes
# and 59 validation labels, which suggests roughly one row per class. If so,
# most validation labels never occur in the training split and validation
# accuracy is not a meaningful signal -- confirm against the dataset.
_split_parts = train_test_split(
    padded_sequences,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
X_train, X_val = _split_parts[0], _split_parts[1]
y_train, y_val = (np.array(labels) for labels in _split_parts[2:])

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, LayerNormalization, MultiHeadAttention, Dropout

# Transformer Encoder Layer
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Width of the last axis of the inputs. It is used as the
            ``key_dim`` of the attention layer and as the output width of the
            feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the output of each sub-layer.
        **kwargs: Standard Keras layer arguments (e.g. ``name``), forwarded to
            the base ``Layer`` constructor.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Forward base-layer kwargs so the layer can be named and rebuilt from
        # its config.
        super().__init__(**kwargs)

        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide from the calling context, so the layer can
                also be called directly without the argument.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence attends to itself (query == value).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward network with its own residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized.

        Loading a saved model still requires making this class known to Keras
        (e.g. via ``custom_objects``).
        """
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

# Building the Model
def create_model(vocab_size, embed_dim, num_heads, ff_dim, num_classes, maxlen=100):
    """Build the (uncompiled) text classifier.

    Architecture: token embedding -> one TransformerBlock -> average pooling
    over the sequence axis -> dropout -> softmax over the classes.

    Args:
        vocab_size: Number of rows of the embedding table; every token id fed
            to the model must be smaller than this value.
        embed_dim: Embedding width, also the model width of the transformer block.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the transformer block's feed-forward network.
        num_classes: Number of output classes.
        maxlen: Length of the (padded) input sequences. Defaults to 100, the
            value previously hard-coded here.

    Returns:
        A ``tf.keras.Model`` mapping ``(batch, maxlen)`` token ids to
        ``(batch, num_classes)`` class probabilities.
    """
    inputs = Input(shape=(maxlen,))

    # NOTE(review): the embedding does not mask any token id, so padding
    # positions take part in attention and in the average pooling below.
    # Short inputs are therefore dominated by padding -- consider masking.
    embedding_layer = Embedding(vocab_size, embed_dim)
    x = embedding_layer(inputs)

    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)

    # Collapse the sequence axis into a single vector per example.
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# Model parameters
# The vocabulary size and the class count are read from the fitted
# preprocessing objects instead of being repeated by hand, so the embedding
# table and the output layer always agree with the data fed to the model.
vocab_size = tokenizer.num_words          # token ids produced are < num_words
embed_dim = 128
num_heads = 4
ff_dim = 128
num_classes = len(label_encoder.classes_)  # one output unit per encoded label

model = create_model(vocab_size, embed_dim, num_heads, ff_dim, num_classes)
# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 128)          640000    
                                                                 
 transformer_block (Transfo  (None, 100, 128)          297344    
 rmerBlock)                                                      
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 294)               37926 

In [None]:
# Display the integer-encoded validation labels as a sanity check.
y_val

array([246, 165,   5,  51, 217, 119,  87, 190,  98, 221,  50, 131, 253,
       132, 255, 276, 104, 162,  63,  59,  79, 212, 145, 252, 278,  45,
        82,   2, 161,  30, 260, 271, 108,  76, 263, 188, 216, 207, 280,
       209, 121, 229,   8,  93, 289,  75, 191, 186,  26,  88, 248, 273,
       167, 151, 222,  29,   9, 155, 227])

In [None]:
# Train for a fixed number of epochs, evaluating on the held-out split after
# each one. The returned History object is kept for later inspection.
fit_options = dict(
    epochs=200,
    batch_size=16,
    validation_data=(X_val, y_val),
)
history = model.fit(X_train, y_train, **fit_options)

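# Save label tokenizer for inference
# NOTE: `label_seq` is not defined in the cells shown here; the fitted objects
# needed at inference time are `tokenizer` and `label_encoder`.
# import pickle
# with open('label_tokenizer.pkl', 'wb') as f:
#     pickle.dump(label_seq, f)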

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
def recommend_courses(input_text, model, tokenizer, label_encoder, top_n=10, maxlen=100):
    """Return the ``top_n`` most probable labels for a free-text query.

    Args:
        input_text: Query string.
        model: Trained classifier exposing ``predict``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        top_n: Maximum number of labels to return.
        maxlen: Sequence length the model was built with (default 100).

    Returns:
        A list of labels ordered from most to least probable. The list is
        empty when ``top_n`` is not positive or when the query contains no
        token known to the tokenizer.
    """
    # A non-positive top_n must yield nothing: the slice ``[-0:]`` would
    # otherwise select every class.
    if top_n <= 0:
        return []

    seq = tokenizer.texts_to_sequences([input_text])

    # With no known token the padded input is all zeros, so the prediction
    # would not depend on the query at all; report "no recommendation" instead.
    if not seq or not seq[0]:
        return []

    padded_seq = pad_sequences(seq, maxlen=maxlen)
    predictions = model.predict(padded_seq)

    # Class indices sorted by descending probability, truncated to top_n.
    top_indices = predictions[0].argsort()[::-1][:top_n]
    recommendations = label_encoder.inverse_transform(top_indices)
    return recommendations.tolist()

# Example query: ask the trained model for its ten best matches.
input_text = "ahli kimia"
top_recommendations = recommend_courses(
    input_text,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
)
print("Top-10 recommended courses:", top_recommendations)


Top-10 recommended courses: ['KEWIRAUSAHAAN', 'PEND. MATEMATIKA', 'TEKNIK INFORMATIKA', 'INFORMATIKA', 'PENDIDIKAN GURU SD', 'TEKNOLOGI INDUSTRI PERTANIAN', 'SENI KULINER DAN PENGOLAHAN JASA MAKANAN', 'TATA BOGA', 'ILMU PENDIDIKAN AGAMA ISLAM', 'SASTRA JEPANG']
