In [None]:
import os
import shutil
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
#import tensorflow_hub as hub
#import tensorflow_text as text

# Report the installed TensorFlow version so a run can be tied to a specific release.
print("TF Version: ", tf.__version__)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Plot a training metric per epoch, plus its validation curve when available.

  Args:
    history: Object with a ``history`` mapping of metric name to a per-epoch
      list of values (e.g. the value returned by ``model.fit``).
    metric: Name of the metric to plot, such as ``'accuracy'`` or ``'loss'``.

  Raises:
    KeyError: If ``metric`` itself is not present in ``history.history``.
  """
  plt.plot(history.history[metric])
  legend_labels = [metric]

  # The 'val_' series only exists when validation data was supplied to fit();
  # skip it instead of failing with a KeyError when it is absent.
  val_metric = 'val_' + metric
  if val_metric in history.history:
    plt.plot(history.history[val_metric], '')
    legend_labels.append(val_metric)

  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(legend_labels)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Workbook holding the labelled titles; read_excel is called without a
# sheet_name, so its default sheet is used.
data_loc = "C:\\Users\\mgf-l\\Desktop\\structured_data.xlsx"

# Keep only rows that have both a category and a title, and only those two columns.
df = pd.read_excel(data_loc).dropna(subset=['category', 'title'])[['category', 'title']]

# Features are the titles; targets are the categories.
X, y = df['title'], df['category']

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap each split as a tf.data.Dataset of (title, category) pairs.
train_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((train_X, train_y), (test_X, test_y))
)
train_dataset


In [None]:
# Show the first raw (title, category) pair of the training dataset.
for example, label in train_dataset.take(1):
  text_value, label_value = example.numpy(), label.numpy()
  print('text: ', text_value)
  print('label: ', label_value)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training titles only (at most 10000 word
# indices are used; unknown words map to the "<OOV>" token).
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_X)

# Encode every title as a list of integer word indices.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(titles) for titles in (train_X, test_X)
)

# Pad at the end ('post') up to the longest training sequence; the same
# length is applied to the test split so both arrays share one width.
max_length = max(map(len, train_sequences))
train_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, test_sequences)
)


In [None]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Learn the category -> integer mapping from the training labels only.
encoder = LabelEncoder()
encoder.fit(train_y)

# Transform categories to integers.
# NOTE(review): a category that appears only in test_y was never seen by the
# encoder, so transform() is expected to reject it — confirm the split covers
# every class.
train_labels = encoder.transform(train_y)
test_labels = encoder.transform(test_y)

# Convert to one-hot encoding with an explicit width. Without num_classes,
# to_categorical sizes the matrix from the largest label present, so a test
# split that happens to lack the highest class would come out narrower than
# the training matrix and the model's softmax layer.
num_classes = len(encoder.classes_)
train_labels_one_hot = tf.keras.utils.to_categorical(train_labels, num_classes=num_classes)
test_labels_one_hot = tf.keras.utils.to_categorical(test_labels, num_classes=num_classes)


In [None]:
# Load the "scraped" sheet of the workbook (the unlabelled titles to classify).
scrape_df = pd.read_excel(
    "C:\\Users\\mgf-l\\Desktop\\structured_data.xlsx",
    sheet_name="scraped",
)

In [None]:
# Show the first three training examples. train_dataset yields one
# (title, category) pair per element, so it is batched here; slicing a single
# element with [:3] would only cut the first three bytes out of one title.
for example, label in train_dataset.batch(3).take(1):
  print('texts: ', example.numpy())
  print()
  print('labels: ', label.numpy())

In [None]:
# Small averaging classifier, assembled layer by layer:
#   1. Embedding: 10000-entry vocabulary (matches the tokenizer's num_words),
#      16-dimensional vectors, sequences of max_length tokens.
#   2. GlobalAveragePooling1D: collapses the sequence axis by averaging.
#   3. Dense(24, relu): hidden layer.
#   4. Dense(softmax): one output unit per category known to the encoder.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(len(encoder.classes_), activation='softmax'))

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# First training run: 30 epochs, with the held-out split used as validation data.
history = model.fit(
    x=train_padded,
    y=train_labels_one_hot,
    epochs=30,
    validation_data=(test_padded, test_labels_one_hot),
)


In [None]:
# Second run on the same model object: 40 more epochs on top of whatever
# training it has already received; the new curves are kept in history2.
history2 = model.fit(
    x=train_padded,
    y=train_labels_one_hot,
    epochs=40,
    validation_data=(test_padded, test_labels_one_hot),
)

In [None]:
# Score the trained model on the held-out split.
loss, accuracy = model.evaluate(test_padded, test_labels_one_hot)
print(f"Test accuracy: {accuracy}")

# One row of class probabilities per test title; take the index of the
# largest value in each row and map it back to the original category label.
predictions = model.predict(test_padded)
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_categories = encoder.inverse_transform(predicted_class_ids)


In [None]:


# Classify the titles on the "scraped" sheet with the trained model and
# write the sheet back out with a 'Predicted Category' column.
scrape_df = pd.read_excel("C:\\Users\\mgf-l\\Desktop\\structured_data.xlsx", sheet_name="scraped")

# This sheet is not cleaned the way the training sheet is (no dropna on
# 'title'), so empty cells arrive as NaN and numeric-looking titles may arrive
# as numbers. Normalise everything to str before tokenising, and remember
# which rows had no title at all.
missing_title = scrape_df['title'].isna()
scrape_titles = scrape_df['title'].fillna('').astype(str)

# Same tokenizer and padding length as used for training.
scrape_sequences = tokenizer.texts_to_sequences(scrape_titles)
scrape_padded = pad_sequences(scrape_sequences, maxlen=max_length, padding='post')

# Index of the most probable class per row -> original category label.
scrape_predictions = model.predict(scrape_padded)
scrape_predicted_categories = encoder.inverse_transform(np.argmax(scrape_predictions, axis=1))

# Leave the prediction blank where there was no title to classify, rather
# than reporting a category derived from an all-padding input.
scrape_df['Predicted Category'] = pd.Series(
    scrape_predicted_categories, index=scrape_df.index
).where(~missing_title)

scrape_df.to_excel("C:\\Users\\mgf-l\\Desktop\\title_prediction_output.xlsx", sheet_name="scraped_results", index=False)


In [None]:
# Print the layer-by-layer summary of `model`.
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Render the layer graph of `model` to a PNG file, annotating each layer
# with its name and tensor shapes.
plot_model(
    model,
    to_file='C:\\Users\\mgf-l\\Desktop\\model_plot.png',
    show_shapes=True,
    show_layer_names=True,
)


In [None]:
# Keras TensorBoard callback configured to log under ./logs; it has no effect
# until it is passed to model.fit(..., callbacks=[...]).
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")



In [None]:
# A further 30 epochs on the same model, this time with the TensorBoard
# callback attached. The result is not assigned: as the last expression of
# the cell, the returned History object is what the notebook displays.
model.fit(
    x=train_padded,
    y=train_labels_one_hot,
    epochs=30,
    validation_data=(test_padded, test_labels_one_hot),
    callbacks=[tensorboard_callback],
)