In [2]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

# Log the installed TensorFlow version so the run can be reproduced/debugged later.
print(tf.__version__)

# Step 1: Prepare directories and create varied dummy data
# Accumulators for the raw samples and their integer class ids; they are
# filled when the generated CSV files are read back in.
texts = []
labels = []

# Root folder that receives one sub-directory per category.
base_dir = "data"

# Class names. The position of a name in this list is its integer label.
categories = ["technology", "sports", "finance", "design_arts", "engineering", "health_medicine", "volunteering", "career_advice", "entrepreneurship", "internships_jobs", "study"]

# Keywords used to synthesize posts for each category. Every keyword is listed
# once per category so that each one is sampled with equal probability (the
# "engineering" list previously contained "machine" twice).
category_words = {
    "technology": ["computer", "smartphone", "app", "software", "hardware", "internet", "AI", "robot", "gadget", "coding", "programming", "blockchain", "cybersecurity", "data", "cloud"],
    "sports": ["cricket", "football", "basketball", "tennis", "swimming", "running", "match", "game", "player", "team", "coach", "stadium", "olympics", "fitness", "exercise"],
    "finance": ["stock", "market", "investment", "bond", "currency", "bank", "loan", "budget", "economy", "trade", "saving", "cost", "planning", "retirement", "crypto"],
    "design_arts": ["art", "design", "painting", "sculpture", "graphic", "architecture", "fashion", "drawing", "photography", "illustration", "creative", "museum", "gallery"],
    "engineering": ["engineer", "build", "machine", "circuit", "bridge", "robotics", "mechanical", "electrical", "civil", "chemical", "aerospace", "software", "project"],
    "health_medicine": ["health", "medicine", "doctor", "hospital", "disease", "treatment", "fitness", "nutrition", "exercise", "wellness", "mental", "vaccine", "surgery", "diet"],
    "volunteering": ["volunteer", "help", "community", "charity", "donate", "service", "aid", "support", "nonprofit", "event", "cause", "impact", "organization"],
    "career_advice": ["career", "job", "resume", "interview", "promotion", "salary", "skill", "networking", "mentorship", "development", "goal", "advice", "path"],
    "entrepreneurship": ["startup", "business", "entrepreneur", "idea", "venture", "innovation", "funding", "market", "product", "growth", "strategy", "leadership"],
    "internships_jobs": ["internship", "job", "application", "position", "hiring", "employer", "experience", "opportunity", "role", "company", "training", "placement"],
    "study": ["study", "learn", "education", "school", "university", "exam", "homework", "course", "knowledge", "research", "book", "lecture", "degree"]
}

# Sentence skeletons; "{word}" is replaced by a category keyword.
templates = [
    "I am interested in {word}",
    "Discussing {word} with friends",
    "My experience with {word}",
    "Attended an event about {word}",
    "Learned new things on {word}",
    "Shared thoughts on {word}",
    "The importance of {word}",
    "How to get started with {word}",
    "Tips for {word}",
    "Challenges in {word}",
    "Latest news on {word}",
    "My favorite {word}",
    "Exploring {word}",
    "Questions about {word}"
]

# Write one dummy CSV per category.
# A fixed seed makes the generated posts identical on every run, in line with
# the fixed random_state used for the train/test split later in the script.
rng = np.random.default_rng(42)

for category in categories:
    category_dir = os.path.join(base_dir, category)
    os.makedirs(category_dir, exist_ok=True)

    words = category_words[category]
    # 1000 posts per category; the count is also baked into the file name
    # that the loading step reads, so keep the two in sync.
    posts = [
        rng.choice(templates).format(word=rng.choice(words))
        for _ in range(1000)
    ]

    csv_path = os.path.join(category_dir, f"{category}_posts_1000.csv")
    pd.DataFrame({'post': posts}).to_csv(csv_path, index=False)

print("Varied dummy data directories and files created.")

# Step 2: Read every category CSV back and collect the samples
# The position of a category in `categories` is used as its integer label.
for label_id, category_name in enumerate(categories):
    posts_file = os.path.join(base_dir, category_name, f"{category_name}_posts_1000.csv")
    category_posts = pd.read_csv(posts_file)["post"].tolist()
    texts.extend(category_posts)
    labels.extend([label_id] * len(category_posts))

print(f"Loaded {len(texts)} texts with {len(set(labels))} labels")

# Step 3: Hold out 20% of the samples as a test set
# The split is stratified on the labels and uses a fixed random_state.
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split_parts

# Step 4: Configure the TextVectorization layer
max_features = 10000  # upper bound on the vocabulary size
sequence_length = 50  # number of token ids produced per sample

# Maps raw strings to fixed-length sequences of integer token ids.
vectorize_layer = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

# Step 5: Adapt vectorizer and prepare datasets
# The vocabulary is learned from the training texts only.
vectorize_layer.adapt(train_texts)

# Raw (text, label) datasets; vectorization is applied in a later map step.
make_dataset = tf.data.Dataset.from_tensor_slices
train_texts_ds = make_dataset((train_texts, train_labels))
test_texts_ds = make_dataset((test_texts, test_labels))

def vectorize_text(text, label):
    """Turn a raw text into token ids via `vectorize_layer`; the label passes through unchanged."""
    token_ids = vectorize_layer(text)
    return token_ids, label

batch_size = 32

# Training pipeline: vectorize, shuffle (buffer of 10000 elements), batch, prefetch.
train_ds = (
    train_texts_ds
    .map(vectorize_text)
    .shuffle(10000)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: same steps without shuffling.
test_ds = (
    test_texts_ds
    .map(vectorize_text)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Step 6: Build and compile the model
# The output width follows the category list instead of a hard-coded 11, so
# adding or removing a category cannot silently desynchronize the classifier.
num_classes = len(categories)

# NOTE(review): the Embedding layer does not mask padding (id 0), so the
# average pooling also averages the padded positions. Enabling mask_zero might
# sharpen the features, but an all-padding input would then need explicit
# handling -- confirm before changing.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features + 1, 16),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes)  # raw logits, one per category
])

# The final Dense layer has no activation, hence from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)

# Step 7: Train the model
# The held-out test batches are used as validation data during the 10 epochs.
training_options = {"validation_data": test_ds, "epochs": 10}
history = model.fit(train_ds, **training_options)

# Step 8: Create export model with softmax
# Chains raw-string vectorization, the trained classifier and a softmax so the
# export model goes straight from text to class probabilities.
softmax_head = layers.Activation("softmax")
export_model = tf.keras.Sequential([vectorize_layer, model, softmax_head])

# The softmax is part of the model now, so the loss works on probabilities.
export_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

# Step 9: Define prediction function
def predict_category(text):
    """Classify one raw text and print the per-category probabilities.

    Args:
        text: The raw (un-vectorized) string to classify.

    Returns:
        The name of the most probable category. Previously nothing was
        returned, so printing the result of a call showed "None".
    """
    # export_model works on batches, so wrap the single string in a batch of one.
    input_tensor = tf.constant([text])
    # verbose=0 keeps Keras' progress bar out of the printed report.
    probs = export_model.predict(input_tensor, verbose=0)[0]

    predicted_category = categories[int(np.argmax(probs))]

    print(f"Predicted Category: {predicted_category}")
    print("Probabilities:")
    for cat, prob in zip(categories, probs):
        print(f" - {cat}: {prob * 100:.2f}%")

    return predicted_category

# Step 10: Test predictions
# predict_category prints its own report, so the calls are not wrapped in
# print(); the extra print() used to add a stray "None" line to the output.
predict_category("Played a great match of cricket with friends in the main ground.")

predict_category("Attended event on Budget planning cost saving as a student")

# Step 11: Prepare for TFLite conversion
# The input width is tied to sequence_length (instead of a literal 50) so the
# exported signature always matches what vectorize_layer produces.
@tf.function(input_signature=[tf.TensorSpec(shape=[None, sequence_length], dtype=tf.int32)])
def prob_model_wrapper(input):
    """Return softmax class probabilities for a batch of int32 token-id sequences.

    The parameter keeps its original name `input` (although it shadows the
    builtin) because renaming it may change the input tensor name of the
    converted model -- confirm before renaming.
    """
    logits = model(input)
    return tf.nn.softmax(logits)

# Trace the wrapper with its declared signature and convert the traced
# function to a TFLite flatbuffer.
concrete_func = prob_model_wrapper.get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
tflite_model = converter.convert()

# Step 12: Save TFLite model
tflite_path = "model_with_softmax.tflite"
with open(tflite_path, "wb") as tflite_file:
    tflite_file.write(tflite_model)

# Step 13: Load and test TFLite model
interpreter = tf.lite.Interpreter(model_path="model_with_softmax.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Only the first input and output tensors are used below.
first_input, first_output = input_details[0], output_details[0]

print("Input shape:", first_input['shape'])
print("Input dtype:", first_input['dtype'])

input_index = first_input['index']
output_index = first_output['index']

# Vectorize one sample outside the TFLite model (it only accepts token ids)
# and cast to int32 to match the exported input signature.
sample_text = "Attended event on Budget planning cost saving as a student"
vec = tf.cast(vectorize_layer(tf.constant([sample_text])), tf.int32)

interpreter.set_tensor(input_index, vec.numpy())
interpreter.invoke()
probs = interpreter.get_tensor(output_index)[0]

for category_name, probability in zip(categories, probs):
    print(f"{category_name}: {probability * 100:.2f}%")

print("Predicted Category:", categories[np.argmax(probs)])

# Step 14: Save vocabulary and labels
# Both files are written as UTF-8 explicitly, so the result does not depend on
# the platform's default encoding.
vocab = vectorize_layer.get_vocabulary()

# One token per line, in the order returned by get_vocabulary().
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{token}\n" for token in vocab)

# One label per line, in the order of `categories` (index == class id).
with open("labels.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{label}\n" for label in categories)

2.19.0
Varied dummy data directories and files created.
Loaded 11000 texts with 11 labels
Epoch 1/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.0953 - loss: 2.3978 - val_accuracy: 0.0909 - val_loss: 2.3908
Epoch 2/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.1248 - loss: 2.3895 - val_accuracy: 0.1423 - val_loss: 2.3801
Epoch 3/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2054 - loss: 2.3763 - val_accuracy: 0.2836 - val_loss: 2.3618
Epoch 4/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2305 - loss: 2.3564 - val_accuracy: 0.3800 - val_loss: 2.3318
Epoch 5/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3712 - loss: 2.3226 - val_accuracy: 0.2864 - val_loss: 2.2880
Epoch 6/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.

'NoneType' object has no attribute 'name'


Predicted Category: finance
Probabilities:
 - technology: 3.64%
 - sports: 6.00%
 - finance: 57.62%
 - design_arts: 2.57%
 - engineering: 0.96%
 - health_medicine: 1.48%
 - volunteering: 7.81%
 - career_advice: 6.59%
 - entrepreneurship: 4.23%
 - internships_jobs: 3.73%
 - study: 5.36%
None
Input shape: [ 1 50]
Input dtype: <class 'numpy.int32'>
technology: 3.64%
sports: 6.00%
finance: 57.62%
design_arts: 2.57%
engineering: 0.96%
health_medicine: 1.48%
volunteering: 7.81%
career_advice: 6.59%
entrepreneurship: 4.23%
internships_jobs: 3.73%
study: 5.36%
Predicted Category: finance


    Warning: tf.lite.Interpreter is deprecated and is scheduled for deletion in
    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    
