In [None]:
pip install keras-tuner



In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import zipfile
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import kerastuner as kt
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping

train_data_file =  'train_data.txt'

# Loading the training data
# Fields are separated by the multi-character token ":::"; pandas treats
# separators longer than one character as a regex, which needs engine='python'.
train_data = pd.read_csv(train_data_file, sep=":::", header=None, names=["ID", "Title", "Genre", "Description"], engine='python')

# Preprocessing
# Lower-case first so the character whitelist below only has to list a-z.
train_data['cleaned_description'] = train_data['Description'].str.lower().str.replace(r"[^a-z0-9 ]", "", regex=True)
label_encoder = LabelEncoder()
# Strip the genre strings so values that differ only by surrounding
# whitespace map to the same integer label.
train_data['genre_label'] = label_encoder.fit_transform(train_data['Genre'].str.strip())

# Tokenization and Padding
max_num_words = 5000  # Reduced vocabulary size
max_sequence_length = 150  # Reduced sequence length
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train_data['cleaned_description'])
sequences = tokenizer.texts_to_sequences(train_data['cleaned_description'])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Splitting the data
# 40% is held out first and then halved, giving a 60/20/20
# train/validation/test split.
test_val_size = 0.4
test_size = 0.5
X_train, X_test_val, y_train, y_test_val = train_test_split(padded_sequences, train_data['genre_label'], test_size=test_val_size, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=test_size, random_state=42)

# Building a simpler RNN model
# tokenizer.word_index lists every word seen during fitting regardless of
# num_words, but texts_to_sequences only emits indices below num_words.
# Cap the embedding table at what can actually appear in the sequences so it
# does not carry rows that are never looked up.
vocab_size = min(max_num_words, len(tokenizer.word_index) + 1)
embedding_dim = 50  # Reduced embedding dimension
lstm_units = 64  # Reduced number of LSTM units

def build_model(hp):
    """Build and compile the LSTM genre classifier for one tuner trial.

    Args:
        hp: keras-tuner hyperparameter container. Two choices are sampled
            from it: ``units`` (LSTM width, 32 or 64) and ``learning_rate``
            (Adam step size, 1e-2 or 1e-3).

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM -> softmax Dense,
        trained with sparse categorical cross-entropy and reporting accuracy.
    """
    # Size the output layer from the full label set known to the encoder,
    # not from the labels that happen to land in the training split; a rare
    # genre missing from y_train would otherwise leave the softmax too narrow
    # for labels that appear in the validation/test data.
    num_classes = len(label_encoder.classes_)

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(units=hp.Choice('units', values=[32, 64]), dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))
    # Labels are integer class ids (from LabelEncoder), hence the sparse loss.
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3])),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Hyperparameter tuning with constraints
# NOTE(review): build_model exposes 2 x 2 = 4 hyperparameter combinations, so
# the search presumably ends before max_trials is reached - confirm.
tuner = kt.RandomSearch(build_model,
                        objective='val_accuracy',
                        max_trials=10,  # Reduced number of trials
                        executions_per_trial=1,
                        directory='my_dir',
                        project_name='intro_to_kt')

tuner.search(X_train, y_train, epochs=5,  # Reduced epochs
             validation_data=(X_val, y_val),
             callbacks=[EarlyStopping(monitor='val_accuracy', patience=2)])  # Using TensorFlow's EarlyStopping

# Retrieve the best model
# A fresh model is built from the winning hyperparameters and retrained from
# scratch. Use the same stopping rule as the search; restore_best_weights asks
# Keras to roll back to the best-scoring epoch when training is stopped early.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)
best_model.fit(X_train, y_train, epochs=5,  # Reduced epochs
               validation_data=(X_val, y_val),
               callbacks=[EarlyStopping(monitor='val_accuracy', patience=2,
                                        restore_best_weights=True)])

# Classification report
y_pred_prob = best_model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
# Report every genre by name. Passing the full label list keeps the rows
# aligned with target_names even if a genre is absent from y_test, and
# zero_division=0 scores never-predicted genres as 0 without emitting
# undefined-metric warnings.
class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred,
                            labels=class_ids,
                            target_names=label_encoder.classes_,
                            zero_division=0))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Trial 4 Complete [00h 15m 26s]
val_accuracy: 0.5224568843841553

Best val_accuracy So Far: 0.5330628156661987
Total elapsed time: 01h 02m 12s
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.29      0.13      0.18       266
           1       0.50      0.02      0.04        97
           2       0.35      0.08      0.13       144
           3       0.00      0.00      0.00       118
           4       0.00      0.00      0.00        64
           5       0.44      0.60      0.51      1495
           6       0.00      0.00      0.00       107
           7       0.67      0.82      0.74      2607
           8       0.56      0.68      0.61      2790
           9       0.00      0.00      0.00       147
          10       0.00      0.00      0.00        67
          11       0.37      0.28      0.31        40
          12       0.00      0.00      0.00        51
          13       0.50      0.60      0.55       4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
