In [1]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/128.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.6 kt-legacy-1.0.5


In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import zipfile
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import kerastuner as kt
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import Adam


train_data_file = 'train_data.txt'

# Loading the training data
# The file has no header row and separates its four fields with ":::"; a
# multi-character separator is handled by pandas' python parsing engine.
train_data = pd.read_csv(
    train_data_file,
    sep=":::",
    header=None,
    names=["ID", "Title", "Genre", "Description"],
    engine='python',
)

# Preprocessing
# Lower-case each description, then drop every character that is not a-z, 0-9 or a space.
train_data['cleaned_description'] = (
    train_data['Description'].str.lower().str.replace("[^a-z0-9 ]", "", regex=True)
)
# Encode the whitespace-stripped genre names as integer class ids.
label_encoder = LabelEncoder()
train_data['genre_label'] = label_encoder.fit_transform(train_data['Genre'].str.strip())

# Tokenization and Padding
max_num_words = 5000          # tokenizer vocabulary cap
max_sequence_length = 150     # every sequence is padded/truncated to this length
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train_data['cleaned_description'])
sequences = tokenizer.texts_to_sequences(train_data['cleaned_description'])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Model dimensions read by build_cnn_model. They were previously not defined
# anywhere in this notebook, so a fresh kernel failed with NameError.
# Tokenizer(num_words=N) only emits word indices below N (0 is the padding
# value), so N is a sufficient Embedding input_dim.
vocab_size = max_num_words
num_classes = len(label_encoder.classes_)

# Splitting the data
# 40% is held out first and then halved, giving a 60/20/20 train/val/test split.
test_val_size = 0.4
test_size = 0.5
X_train, X_test_val, y_train, y_test_val = train_test_split(
    padded_sequences,
    train_data['genre_label'],
    test_size=test_val_size,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=test_size,
    random_state=42,
)

# Model building function for hyperparameter tuning
def build_cnn_model(hp):
    """Assemble and compile one candidate 1-D CNN text classifier for the tuner.

    Hyperparameters drawn from ``hp`` (in this order):
      * ``embedding_dim``  - integer, 32 to 128 in steps of 32
      * ``filters``        - integer, 32 to 128 in steps of 32
      * ``kernel_size``    - one of 3, 5, 7
      * ``learning_rate``  - one of 1e-2, 1e-3, 1e-4

    Reads the module-level names ``vocab_size``, ``max_sequence_length`` and
    ``num_classes``.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    # Layers are listed in stacking order; the list is evaluated left to right,
    # so the hyperparameters are registered in the order documented above.
    layer_stack = [
        Embedding(
            input_dim=vocab_size,
            output_dim=hp.Int('embedding_dim', min_value=32, max_value=128, step=32),
            input_length=max_sequence_length,
        ),
        Conv1D(
            filters=hp.Int('filters', min_value=32, max_value=128, step=32),
            kernel_size=hp.Choice('kernel_size', values=[3, 5, 7]),
            activation='relu',
        ),
        # Collapse the sequence axis by taking the max of each filter's activations.
        GlobalMaxPooling1D(),
        # One softmax output per class.
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    # Integer class ids are used as targets, hence the sparse loss.
    optimizer = Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]))
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Hyperparameter tuning
# Random search over the build_cnn_model space: up to 10 trials, one training
# run per trial, ranked by validation accuracy.
tuner = kt.RandomSearch(
    build_cnn_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='cnn_tuning',
    project_name='keras_tuner_cnn',
)
tuner.search(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_val, y_val),
    # Stop a trial once val_accuracy has gone 2 epochs without improving.
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=2)],
)

# The best model
best_model = tuner.get_best_models(num_models=1)[0]
# NOTE(review): this trains the tuner's selected model for a further fixed
# 5 epochs with no stopping criterion -- confirm that is intended.
best_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
)

# Evaluation on the held-out test split
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

Trial 10 Complete [00h 04m 59s]
val_accuracy: 0.4372406303882599

Best val_accuracy So Far: 0.5417320132255554
Total elapsed time: 01h 12m 29s
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.5106520056724548
