### 1. Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import pipeline
import joblib

TypeError: Unable to convert function return value to a Python type! The signature was
	() -> handle

### 2. Loading the Dataset

In [None]:
# Load the cleaned reviews, keep only the joined-text column, and drop rows
# where that column is missing.
data = (
    pd.read_csv('data_cleaned.csv')
    .loc[:, ['review_cleaned_joined']]
    .dropna()
)

### 3. Zero-Shot Classification to Assign Initial Labels

In [None]:
def zero_shot_classification(texts, labels, classifier=None):
    """Assign each text the candidate label ranked first by a zero-shot classifier.

    Args:
        texts: Iterable of strings to classify (a list or a pandas Series both work).
        labels: Candidate label names, passed to the classifier as ``candidate_labels``.
        classifier: Optional callable invoked as ``classifier(text, candidate_labels=labels)``
            that returns a mapping whose ``'labels'`` entry lists labels with the top
            prediction first. When omitted, ``pipeline("zero-shot-classification")`` is
            built once for this call. Pass an already-loaded pipeline to avoid
            reloading the model every time this function is called.

    Returns:
        list: One predicted label per input text, in input order.
    """
    texts = list(texts)
    if not texts:
        # Nothing to classify, so skip building the pipeline altogether.
        return []
    if classifier is None:
        classifier = pipeline("zero-shot-classification")
    # Take the top predicted label for each text.
    return [classifier(text, candidate_labels=labels)['labels'][0] for text in texts]

# Topics a review can be assigned to; each review receives the single
# top-ranked label returned by zero_shot_classification.
candidate_labels = [
    'Pricing',
    'Coverage',
    'Enrollment',
    'Customer Service',
    'Claims Processing',
    'Cancellation',
]
data['category'] = zero_shot_classification(
    data['review_cleaned_joined'],
    candidate_labels,
)

### 4. Saving Labeled Data and Label Encoder

In [None]:
# Fit an encoder on the category names, then store the integer id of each
# row's category alongside the text.
label_encoder = LabelEncoder()
label_encoder.fit(data['category'])
data['category_encoded'] = label_encoder.transform(data['category'])

# Persist the labeled frame and the fitted encoder for the Streamlit app.
data.to_csv('labeled_data.csv', index=False)
joblib.dump(label_encoder, 'label_encoder.pkl')

### 5. Splitting Data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data['review_cleaned_joined'],
    data['category_encoded'],
    test_size=0.2,
    random_state=42,
)

### 6. Tokenizing and Padding Text

In [None]:
# Tokenization settings
vocab_size = 10000   # passed to the tokenizer as num_words
max_length = 100     # every sequence is padded/truncated to this many tokens
oov_token = "<OOV>"  # placeholder token for out-of-vocabulary words

# The vocabulary is learned from the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token=oov_token,
)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer sequences -> fixed-length arrays padded at the end.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_train_seq,
    maxlen=max_length,
    padding='post',
)

# Test split: same transformation, reusing the tokenizer fitted above.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_test_seq,
    maxlen=max_length,
    padding='post',
)

# Persist the fitted tokenizer for the Streamlit app.
joblib.dump(tokenizer, 'tokenizer.pkl')

### 7. Building the Model

In [None]:
embedding_dim = 16  # size of each learned token vector

# Small averaging classifier: embed tokens, average over the sequence,
# then one hidden layer and a softmax over the encoded categories.
model = Sequential([
    # Declare the input shape with an explicit Input layer instead of the
    # Embedding `input_length` argument, which is deprecated in Keras 3.
    tf.keras.Input(shape=(max_length,)),
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    # One output unit per class known to the label encoder.
    Dense(len(label_encoder.classes_), activation='softmax'),
])

### 8. Compiling and Training the Model

In [None]:
# Targets are the integer-encoded categories, hence the sparse
# categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): validation_data is the held-out test split, so the per-epoch
# validation metrics are computed on the same rows used for final evaluation.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=10,
    validation_data=(X_test_padded, y_test),
    verbose=2,
)

# Persist the trained model for the Streamlit app.
model.save('text_classification_model.h5')

### 9. Model Evaluation

In [None]:
# The model was compiled with a single metric, so evaluate() yields the loss
# followed by the accuracy on the test split.
loss, accuracy = model.evaluate(
    x=X_test_padded,
    y=y_test,
    verbose=0,
)
print(f"Test Accuracy: {accuracy:.2f}")

### 10. Generating a Classification Report

In [None]:
# Predicted class id = index of the largest softmax output for each review.
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

# Pass every encoded class id explicitly. Without `labels`, the report infers
# the classes from y_test/y_pred_classes, and if any category is absent from
# both, `target_names` no longer lines up (scikit-learn raises or mislabels rows).
# zero_division=0 reports 0 for classes with no predictions/support without warnings.
all_class_ids = np.arange(len(label_encoder.classes_))
report_options = dict(
    labels=all_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(classification_report(y_test, y_pred_classes, **report_options))

# Save classification report for Streamlit app
classification_report_dict = classification_report(
    y_test, y_pred_classes, output_dict=True, **report_options
)
pd.DataFrame(classification_report_dict).transpose().to_csv('classification_report.csv')

### 11. Embedding Visualization

In [None]:
log_dir = "logs/embedding"
# embeddings_freq defaults to 0, which disables embedding logging entirely;
# set it to 1 so the embedding layer is written out every epoch and can be
# inspected in TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    embeddings_freq=1,
)

# NOTE(review): this trains the same `model` object for one more epoch, so its
# in-memory weights will differ from the saved 'text_classification_model.h5'
# and from the metrics reported in the earlier evaluation cells.
model.fit(
    X_train_padded, y_train, epochs=1, validation_data=(X_test_padded, y_test), verbose=2,
    callbacks=[tensorboard_callback]
)