In [2]:
# Genereate a new file with no punctuation

import os
import re
import string
import pandas as pd

def process_folder(folder_path, label):
    """Reads all text files in a folder, cleans lines, removes punctuation, and returns labeled data."""
    cleaned_data = []
    
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)

        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.read()
        
        # Remove "SampleX: " in any case (Sample, sample, SAMPLE, etc.)
        cleaned_text = re.sub(r"(?i)\bsample\d+:\s*", "", lines).strip()
        
        # Split into lines, remove punctuation, and store non-empty ones with labels
        for line in cleaned_text.split("\n"):
            if line.strip():
                # Remove all punctuation
                line_no_punct = line.translate(str.maketrans('', '', string.punctuation))
                cleaned_data.append((line_no_punct.strip(), label))
    
    return cleaned_data

# Define folder paths
addressing_folder = r"D:\main project\Addressing"
non_addressing_folder = r"D:\main project\Non Addressing"

# Process both folders
addressing_data = process_folder(addressing_folder, 0)  # Label 0 for Addressing
non_addressing_data = process_folder(non_addressing_folder, 1)  # Label 1 for Non-Addressing

# Combine and create DataFrame
data = addressing_data + non_addressing_data
df = pd.DataFrame(data, columns=["Text", "Label"])

# Shuffle the DataFrame for variety
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows
print(df.head())

# Save to CSV (optional)
df.to_csv("cleaned_mixed_data_no_punct.csv", index=False, encoding="utf-8")
df

                                                Text  Label
0  Ive been working on improving my photography s...      1
1  Have you seen the new exhibit at the art museu...      1
2  I noticed that my email account is running out...      0
3  Today I received some news that made my day un...      1
4  Ive been meaning to get into hiking more often...      1


Unnamed: 0,Text,Label
0,Ive been working on improving my photography s...,1
1,Have you seen the new exhibit at the art museu...,1
2,I noticed that my email account is running out...,0
3,Today I received some news that made my day un...,1
4,Ive been meaning to get into hiking more often...,1
...,...,...
7995,I recently joined a book club and its been a w...,1
7996,The art of storytelling is a powerful tool for...,1
7997,Can you help me set up a reminder for my docto...,0
7998,I can’t believe it’s already been a year since...,1


In [None]:
# Training Code for the Model

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Define EarlyStopping
early_stopping = EarlyStopping(
    monitor="val_loss",  # Monitor validation loss
    patience=3,          # Stop after 3 epochs of no improvement
    restore_best_weights=True  # Restore the best model weights
)

# Load data
def load_data(filepath):
    df = pd.read_csv(filepath)
    # Assuming your CSV has columns 'text' and 'label'
    # If different, adjust the column names below
    texts = df['Text'].values
    labels = df['Label'].values
    return texts, labels

# Preprocess the text
def preprocess_data(texts, labels, max_words=10000, max_sequence_length=1000, validation_split=0.2):
    # Tokenize text
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)

    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences(texts)

    # Pad sequences
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(
        padded_sequences, labels, test_size=validation_split, random_state=42
    )

    return X_train, X_val, y_train, y_val, tokenizer

# Build RNN model
def build_model(vocab_size, embedding_dim=128, max_sequence_length=1000):
    model = Sequential([
        # Embedding layer
        Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),

        # Bidirectional LSTM layers
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(32)),
        Dropout(0.2),

        # Output layer
        Dense(1, activation='sigmoid')
    ])

    # Compile model
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    return model

# Train model
def train_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=1,
        callbacks = [early_stopping]
    )
    return history

# Evaluate model
def evaluate_model(model, X_val, y_val):
    # Get predictions
    y_pred_probs = model.predict(X_val)
    y_pred = (y_pred_probs > 0.5).astype(int).flatten()

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, target_names=['Addressing Robot (0)', 'Not Addressing Robot (1)']))

    # Print confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Addressing Robot (0)', 'Not Addressing Robot (1)'],
                yticklabels=['Addressing Robot (0)', 'Not Addressing Robot (1)'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    return y_pred

# Plot training history
def plot_history(history):
    plt.figure(figsize=(12, 5))

    # Plot accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='lower right')

    # Plot loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.tight_layout()
    plt.show()

# Example usage
def main():
    # Configuration
    csv_filepath = '/content/cleaned_mixed_data_no_punct.csv'  # Replace with your CSV file path
    max_words = 10000
    max_sequence_length = 100
    embedding_dim = 128
    epochs = 4
    batch_size = 32
    validation_split = 0.2

    # Load and preprocess data
    print("Loading and preprocessing data...")
    texts, labels = load_data(csv_filepath)
    X_train, X_val, y_train, y_val, tokenizer = preprocess_data(
        texts, labels, max_words, max_sequence_length, validation_split
    )

    # Build model
    print("Building model...")
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)
    model = build_model(vocab_size, embedding_dim, max_sequence_length)
    model.summary()

    # Train model
    print("Training model...")
    history = train_model(model, X_train, y_train, X_val, y_val, epochs, batch_size)

    # Evaluate model
    print("Evaluating model...")
    y_pred = evaluate_model(model, X_val, y_val)

    # Plot training history
    plot_history(history)

    # Save model
    model.save('robot_addressing_classifier.h5')
    print("Model saved as 'robot_addressing_classifier.h5'")

    # Save tokenizer
    import pickle
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Tokenizer saved as 'tokenizer.pickle'")

    return model, tokenizer

if __name__ == "__main__":
    main()

In [1]:
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model and tokenizer
model = tf.keras.models.load_model('robot_addressing_classifier.h5')

# Load the tokenizer
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

def classify_text(text, max_sequence_length=100):
    """
    Classify a single text input to determine if it's addressing a robot.
    
    Args:
        text: Text string to classify
        max_sequence_length: Maximum length for padding (should match training)
        
    Returns:
        Dictionary with prediction results
    """
    # Convert to sequence
    sequences = tokenizer.texts_to_sequences([text])
    
    # Pad sequence
    padded_sequence = pad_sequences(
        sequences,
        maxlen=max_sequence_length,
        padding='post'
    )
    
    # Make prediction
    prediction_prob = model.predict(padded_sequence)[0][0]
    predicted_class = 1 if prediction_prob > 0.5 else 0
    
    # Return result
    is_addressing_robot = (predicted_class == 0)
    
    return {
        'text': text,
        'is_addressing_robot': is_addressing_robot,
        'confidence': float(max(prediction_prob, 1 - prediction_prob))
    }

# Example usage
if __name__ == "__main__":
    # Test with different examples
    test_examples = [
        "Hey robot, what's the weather today?",
        "I need to finish my homework soon.",
        "Robot, can you help me with this?",
        "The meeting starts at 2 PM."
    ]
    
    for text in test_examples:
        result = classify_text(text)
        status = "IS" if result['is_addressing_robot'] else "is NOT"
        print(f"Text: \"{text}\"")
        print(f"Result: This {status} addressing the robot")
        print(f"Confidence: {result['confidence']:.2f}")
        print()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Text: "Hey robot, what's the weather today?"
Result: This IS addressing the robot
Confidence: 1.00

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Text: "I need to finish my homework soon."
Result: This is NOT addressing the robot
Confidence: 1.00

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Text: "Robot, can you help me with this?"
Result: This IS addressing the robot
Confidence: 1.00

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Text: "The meeting starts at 2 PM."
Result: This is NOT addressing the robot
Confidence: 1.00



In [8]:
classify_text("The Eiffel Tower is one of the most iconic landmarks in the world, located in Paris, France. It was designed by Gustave Eiffel and completed in 1889 as the entrance arch for the 1889 Exposition Universelle (World’s Fair).What do you think.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


{'text': 'The Eiffel Tower is one of the most iconic landmarks in the world, located in Paris, France. It was designed by Gustave Eiffel and completed in 1889 as the entrance arch for the 1889 Exposition Universelle (World’s Fair).What do you think.',
 'is_addressing_robot': True,
 'confidence': 0.9999315142631531}

In [16]:
classify_text("I am an intelligent boy. I am a good student. I am a good citizen. I am a good person. I am a good human being.What do you think")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


{'text': 'I am an intelligent boy. I am a good student. I am a good citizen. I am a good person. I am a good human being.What do you think',
 'is_addressing_robot': True,
 'confidence': 0.9999369382858276}