# Importing Libraries

In [11]:
! pip install wn

import re
import json
import numpy as np
import pandas as pd
import math
import random
import wn
from collections import Counter, defaultdict



# Downloading Pre-Processing Data

In [12]:
# Download WordNet data
try:
    wn.download("omw-en")
except:
    print("WordNet data already downloaded or couldn't be downloaded.")

# Load contractions dictionary
try:
    with open("contractions_dict.json", "r") as contractions:
        contractions_dict = json.load(contractions)
except FileNotFoundError:
    print("Contractions dictionary not found. Creating a basic one.")
    contractions_dict = {
        "can't": "cannot", "won't": "will not", "don't": "do not",
        "doesn't": "does not", "i'm": "i am", "you're": "you are",
        "we're": "we are", "they're": "they are", "it's": "it is",
        "he's": "he is", "she's": "she is", "that's": "that is",
        "what's": "what is", "where's": "where is", "when's": "when is",
        "why's": "why is", "how's": "how is", "i've": "i have",
        "you've": "you have", "we've": "we have", "they've": "they have",
        "i'd": "i would", "you'd": "you would", "he'd": "he would",
        "she'd": "she would", "we'd": "we would", "they'd": "they would",
        "i'll": "i will", "you'll": "you will", "he'll": "he will",
        "she'll": "she will", "we'll": "we will", "they'll": "they will"
    }
    with open("contractions_dict.json", "w") as f:
        json.dump(contractions_dict, f)

[KCached file found: /root/.wn_data/downloads/3334cfd8709f5032fe246261d73528528c2542fa
[KSkipping omw-en:1.4 (OMW English Wordnet based on WordNet 3.0); already added



# Pre Processing Data

In [13]:
class PreProcessing:
    """Text preprocessing module with advanced tokenization"""

    class Tokenizer:
        """Advanced tokenizer with MWE recognition and contraction handling"""

        def __init__(self):
            try:
                self.wordnet = wn.Wordnet("omw-en")
                self.MWEs = self.list_MWEs()
            except:
                print("WordNet initialization failed. Using basic tokenization.")
                self.MWEs = []
            self.compile_regex_patterns()

        def list_MWEs(self):
            """Extract multi-word expressions (MWEs) from WordNet."""
            MWEs = []

            try:
                # Get multi-word nouns
                nouns = self.wordnet.synsets(pos="n")
                MWEs.extend([syn.lemmas()[0] for syn in nouns if " " in syn.lemmas()[0]])

                # Get multi-word verbs
                verbs = self.wordnet.synsets(pos="v")
                MWEs.extend([syn.lemmas()[0] for syn in verbs if " " in syn.lemmas()[0]])
            except:
                print("Error extracting MWEs. Using empty list.")

            return MWEs

        def compile_regex_patterns(self):
            """Compile all required regex patterns."""
            # Multi-word expressions handling
            if self.MWEs:
                mwe_patterns = [rf"\b{re.escape(mwe)}\b" for mwe in self.MWEs]
                self.regex_pattern = re.compile("|".join(mwe_patterns))
            else:
                self.regex_pattern = re.compile(r"")

            # Hyphen handling (e.g., "high-quality" → "high quality")
            self.hyphen_pattern = re.compile(r"\b(\w+)-(\w+)\b")

            # Preserve numbers with units (e.g., "10kg" → "10_kg", "$100" → "$100")
            self.number_unit_pattern = re.compile(r"(\d+)([a-zA-Z]+)")

            # Punctuation removal (except in preserved cases)
            self.punctuation_pattern = re.compile(r"[^\w\s\-_]")

            # Contractions patterns
            self.contraction_pattern = re.compile(r"\b(" + "|".join(map(re.escape, contractions_dict.keys())) + r")\b", re.IGNORECASE)

        def tokenize(self, text):
            """Tokenize the input text with preprocessing steps."""
            # Convert to lowercase
            text = text.lower()

            # Replace multi-word expressions with underscores
            if self.MWEs:
                text = self.regex_pattern.sub(lambda match: match.group(0).replace(" ", "_"), text)

            # Handle contractions
            text = self.contraction_pattern.sub(lambda match: contractions_dict.get(match.group(0).lower(), match.group(0)), text)

            # Handle hyphens (convert to spaces)
            text = self.hyphen_pattern.sub(r"\1 \2", text)

            # Preserve numbers with units
            text = self.number_unit_pattern.sub(r"\1_\2", text)

            # Remove other punctuation
            text = self.punctuation_pattern.sub("", text)

            # Tokenize by splitting on whitespace
            tokens = text.split()
            return tokens

    class tf_idf_Vectorizer:
        """TF-IDF Vectorizer implementation with advanced features"""

        def __init__(self, max_features=None):
            self.vocabulary = {}
            self.idf = {}
            self.max_features = max_features
            self.fitted = False

        def fit(self, corpus):
            """Build vocabulary with unique indices and compute IDF values."""
            if isinstance(corpus[0], str):
                # If corpus is a list of strings, tokenize them
                tokenizer = PreProcessing.Tokenizer()
                corpus = [tokenizer.tokenize(doc) for doc in corpus]
            elif not isinstance(corpus[0], list):
                raise ValueError("Corpus must be a list of strings or tokenized documents (list of lists).")

            # Count document frequency (DF) for each word
            df = Counter()
            for doc in corpus:
                unique_words = set(doc)
                df.update(unique_words)

            # Sort words by DF in descending order
            sorted_words = [word for word, _ in df.most_common(self.max_features)] if self.max_features else list(df.keys())

            # Create vocabulary
            self.vocabulary = {word: idx for idx, word in enumerate(sorted_words)}

            # Compute IDF with smoothing: log((N + 1) / (df + 1)) + 1
            N = len(corpus)
            self.idf = {word: np.log((N + 1) / (df[word] + 1)) + 1 for word in self.vocabulary}

            self.fitted = True
            return self

        def transform(self, documents):
            """Convert new documents into TF-IDF vectors using learned vocabulary."""
            if not self.fitted:
                raise ValueError("Vectorizer needs to be fitted before transform")

            if isinstance(documents[0], str):
                # If documents is a list of strings, tokenize them
                tokenizer = PreProcessing.Tokenizer()
                documents = [tokenizer.tokenize(doc) for doc in documents]
            elif not isinstance(documents[0], list):
                raise ValueError("Input documents must be strings or tokenized documents (list of lists).")

            tfidf_matrix = np.zeros((len(documents), len(self.vocabulary)))

            for i, doc in enumerate(documents):
                # Term frequency for the document
                tf = Counter(doc)
                total_words = len(doc)

                for word, count in tf.items():
                    if word in self.vocabulary:  # Ignore unseen words
                        word_idx = self.vocabulary[word]
                        tfidf_matrix[i][word_idx] = (count / total_words) * self.idf.get(word, 0)

            return tfidf_matrix

        def fit_transform(self, documents):
            """Fit and transform documents"""
            self.fit(documents)
            return self.transform(documents)


# Models

In [14]:
class SVMFromScratch:
    """Support Vector Machine classifier implemented from scratch"""

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None
        self.classes = None

    def _init_weights_bias(self, X):
        """Initialize weights and bias"""
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        self.b = 0

    def fit(self, X, y):
        """Fit SVM using gradient descent"""
        # Save original class labels
        self.classes = np.unique(y)

        # If binary classification, convert to {-1, 1}
        if len(self.classes) == 2:
            y_binary = np.where(y == self.classes[0], -1, 1)
        else:
            raise ValueError("This SVM implementation only supports binary classification")

        # Initialize weights and bias
        self._init_weights_bias(X)

        # Gradient descent
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                # Calculate hinge loss condition
                condition = y_binary[idx] * (np.dot(x_i, self.w) - self.b) >= 1

                # Update weights
                if condition:
                    self.w = self.w - self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w = self.w - self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_binary[idx]))
                    self.b = self.b - self.lr * y_binary[idx]

        return self

    def predict_raw(self, X):
        """Predict raw values"""
        return np.dot(X, self.w) - self.b

    def predict(self, X):
        """Predict class labels"""
        raw_preds = self.predict_raw(X)
        y_pred = np.where(raw_preds <= 0, self.classes[0], self.classes[1])
        return y_pred

    def predict_proba(self, X):
        """Predict class probabilities using sigmoid function"""
        raw_preds = self.predict_raw(X)
        # Apply sigmoid function
        sigmoid = lambda x: 1 / (1 + np.exp(-x))
        probs = sigmoid(raw_preds)

        # Return probabilities for both classes
        return np.column_stack((1 - probs, probs))


class NaiveBayesFromScratch:
    """Multinomial Naive Bayes classifier implemented from scratch"""

    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = None

    def fit(self, X, y):
        """Fit Naive Bayes model"""
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # Calculate class priors
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples

        # Calculate feature probabilities for each class
        for c in self.classes:
            # Get samples for this class
            X_c = X[y == c]

            # Calculate feature probabilities with Laplace smoothing
            # Sum of each feature for samples in this class
            feature_counts = np.sum(X_c, axis=0) + self.alpha
            # Total counts for this class (with smoothing)
            total_counts = np.sum(feature_counts)

            # Store probabilities
            self.feature_probs[c] = feature_counts / total_counts

        return self

    def predict_proba(self, X):
        """Predict class probabilities"""
        n_samples = X.shape[0]
        n_classes = len(self.classes)

        # Initialize probability matrix
        probs = np.zeros((n_samples, n_classes))

        # Calculate log probabilities for each class
        for i, c in enumerate(self.classes):
            # Class prior (in log space)
            class_prior = np.log(self.class_priors[c])
            # Feature probabilities (in log space)
            feature_probs = np.log(self.feature_probs[c])

            # Calculate log probability for each sample
            for j, x in enumerate(X):
                # Only consider non-zero features (TF-IDF scores)
                non_zero_indices = x > 0
                if np.any(non_zero_indices):
                    # Multiply feature values by log probabilities
                    log_prob = class_prior + np.sum(x[non_zero_indices] * feature_probs[non_zero_indices])
                else:
                    log_prob = class_prior

                probs[j, i] = log_prob

        # Convert log probabilities to probabilities
        # Subtract max for numerical stability
        probs_exp = np.exp(probs - np.max(probs, axis=1, keepdims=True))
        probs_normalized = probs_exp / np.sum(probs_exp, axis=1, keepdims=True)

        return probs_normalized

    def predict(self, X):
        """Predict class labels"""
        probs = self.predict_proba(X)
        return self.classes[np.argmax(probs, axis=1)]


class DecisionTreeFromScratch:
    """Decision Tree classifier implemented from scratch"""

    class Node:
        """Node in decision tree"""
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature      # Index of feature to split on
            self.threshold = threshold  # Threshold value for split
            self.left = left            # Left subtree (True branch)
            self.right = right          # Right subtree (False branch)
            self.value = value          # Class label (for leaf nodes)

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None
        self.classes = None

    def fit(self, X, y):
        """Build decision tree"""
        self.classes = np.unique(y)
        self.root = self._grow_tree(X, y)
        return self

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow tree"""
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            n_classes == 1):
            # Create leaf node
            leaf_value = self._most_common_label(y)
            return self.Node(value=leaf_value)

        # Find best split
        best_feature, best_threshold = self._best_split(X, y, n_features)

        # Create subtrees
        left_idxs = X[:, best_feature] < best_threshold
        right_idxs = ~left_idxs

        # Recursively build subtrees
        left_subtree = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return self.Node(best_feature, best_threshold, left_subtree, right_subtree)

    def _best_split(self, X, y, n_features):
        """Find best feature and threshold for split"""
        best_gain = -float('inf')
        best_feature = None
        best_threshold = None

        # Try a limited number of features (for efficiency)
        features_to_try = random.sample(range(n_features), min(n_features, 20))

        for feature in features_to_try:
            # Get unique values for this feature
            thresholds = np.unique(X[:, feature])

            # Try different thresholds
            for threshold in thresholds:
                # Calculate information gain
                gain = self._information_gain(y, X[:, feature], threshold)

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, y, feature_values, threshold):
        """Calculate information gain for a split"""
        # Calculate parent entropy
        parent_entropy = self._entropy(y)

        # Create children
        left_idxs = feature_values < threshold
        right_idxs = ~left_idxs

        # If split creates empty node, return no gain
        if np.sum(left_idxs) == 0 or np.sum(right_idxs) == 0:
            return 0

        # Calculate weighted entropy of children
        n = len(y)
        n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)

        entropy_left = self._entropy(y[left_idxs])
        entropy_right = self._entropy(y[right_idxs])

        child_entropy = (n_left / n) * entropy_left + (n_right / n) * entropy_right

        # Calculate information gain
        information_gain = parent_entropy - child_entropy
        return information_gain

    def _entropy(self, y):
        """Calculate entropy of a set"""
        hist = np.bincount(y)
        ps = hist / len(y)
        ps = ps[ps > 0]  # Remove zeros
        return -np.sum(ps * np.log2(ps))

    def _most_common_label(self, y):
        """Find most common label in a set"""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """Predict class labels"""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Traverse tree to make prediction"""
        if node.value is not None:
            return node.value

        if x[node.feature] < node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

    def predict_proba(self, X):
        """Predict class probabilities"""
        n_samples = X.shape[0]
        n_classes = len(self.classes)
        probas = np.zeros((n_samples, n_classes))

        for i, x in enumerate(X):
            # Traverse tree to find leaf node
            leaf_value = self._traverse_tree(x, self.root)
            # Set probability to 1.0 for predicted class
            class_idx = np.where(self.classes == leaf_value)[0][0]
            probas[i, class_idx] = 1.0

        return probas


class RandomForestFromScratch:
    """Random Forest classifier implemented from scratch"""

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, sample_ratio=0.8):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.sample_ratio = sample_ratio
        self.trees = []
        self.classes = None

    def fit(self, X, y):
        """Build random forest"""
        self.classes = np.unique(y)

        # Create bootstrap samples and train trees
        for _ in range(self.n_trees):
            # Bootstrap sampling
            n_samples = X.shape[0]
            sample_size = int(n_samples * self.sample_ratio)

            # Sample with replacement
            idxs = np.random.choice(n_samples, size=sample_size, replace=True)
            X_sample, y_sample = X[idxs], y[idxs]

            # Train decision tree
            tree = DecisionTreeFromScratch(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            tree.fit(X_sample, y_sample)

            # Add tree to forest
            self.trees.append(tree)

        return self

    def predict(self, X):
        """Predict class labels by majority vote"""
        # Get predictions from all trees
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        # Transpose to get one row per sample, with columns as tree predictions
        tree_preds = tree_preds.T

        # Apply majority vote for each sample
        final_preds = np.array([self._majority_vote(pred) for pred in tree_preds])

        return final_preds

    def _majority_vote(self, predictions):
        """Apply majority voting"""
        counter = Counter(predictions)
        return counter.most_common(1)[0][0]

    def predict_proba(self, X):
        """Predict class probabilities"""
        n_samples = X.shape[0]
        n_classes = len(self.classes)

        # Initialize probabilities
        probas = np.zeros((n_samples, n_classes))

        # Get predictions from all trees
        for tree in self.trees:
            tree_proba = tree.predict_proba(X)
            probas += tree_proba

        # Average probabilities
        probas /= self.n_trees

        return probas


def cosine_similarity_from_scratch(A, B):
    """Calculate cosine similarity between matrices"""
    # Compute dot product
    dot_product = np.dot(A, B.T)

    # Compute norms
    norm_A = np.sqrt(np.sum(A**2, axis=1, keepdims=True))
    norm_B = np.sqrt(np.sum(B**2, axis=1, keepdims=True))

    # Avoid division by zero
    norm_A[norm_A == 0] = 1e-10
    norm_B[norm_B == 0] = 1e-10

    # Compute similarity
    similarity = dot_product / (norm_A * norm_B.T)

    return similarity

# Intent Classifier

In [15]:
class EnhancedIntentClassifier:
    """Intent classification with advanced preprocessing"""

    def __init__(self, model_type='naive_bayes', max_features=1000):
        self.model_type = model_type
        self.tokenizer = PreProcessing.Tokenizer()
        self.vectorizer = PreProcessing.tf_idf_Vectorizer(max_features=max_features)

        # Choose model based on type
        if model_type == 'svm':
            self.model = SVMFromScratch(learning_rate=0.01, lambda_param=0.01, n_iters=1000)
        elif model_type == 'naive_bayes':
            self.model = NaiveBayesFromScratch(alpha=1.0)
        elif model_type == 'random_forest':
            self.model = RandomForestFromScratch(n_trees=10, max_depth=5)
        else:
            raise ValueError(f"Unsupported model type: {model_type}")

    def train(self, texts, labels, test_size=0.2):
        """Train the intent classifier"""
        print(f"Tokenizing {len(texts)} training examples...")
        # Tokenize texts
        tokenized_texts = [self.tokenizer.tokenize(text) for text in texts]

        print("Vectorizing text...")
        # Vectorize text
        X_vec = self.vectorizer.fit(tokenized_texts)
        X_vec = self.vectorizer.transform(tokenized_texts)

        # Convert labels to numpy array if needed
        y = np.array(labels)

        # Split data for evaluation
        n_samples = len(texts)
        indices = np.arange(n_samples)
        np.random.shuffle(indices)

        train_size = int((1 - test_size) * n_samples)
        train_indices = indices[:train_size]
        test_indices = indices[train_size:]

        X_train = X_vec[train_indices]
        y_train = y[train_indices]
        X_test = X_vec[test_indices]
        y_test = y[test_indices]

        print(f"Training {self.model_type} model...")
        # Train model
        self.model.fit(X_train, y_train)

        # Evaluate
        y_pred = self.model.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        print(f"Intent Classifier Accuracy: {accuracy:.4f}")

        return self

    def predict(self, text):
        """Predict intent from text"""
        # Tokenize and vectorize
        tokens = self.tokenizer.tokenize(text)
        X_vec = self.vectorizer.transform([tokens])

        # Predict
        intent = self.model.predict(X_vec)[0]

        # Get probability
        prob_matrix = self.model.predict_proba(X_vec)

        # Find probability for predicted class
        class_idx = np.where(self.model.classes == intent)[0][0]
        prob = prob_matrix[0, class_idx]

        return intent, prob


# Entity Exctractor

In [16]:
class EntityExtractorFromScratch:
    """Rule-based entity extraction"""

    def __init__(self):
        self.entity_patterns = {}

    def add_entity_pattern(self, entity_name, pattern):
        """Add regex pattern for entity extraction"""
        self.entity_patterns[entity_name] = re.compile(pattern, re.IGNORECASE)

    def extract_entities(self, text):
        """Extract entities from text based on patterns"""
        entities = {}
        for entity_name, pattern in self.entity_patterns.items():
            matches = pattern.findall(text)
            if matches:
                entities[entity_name] = matches
        return entities

# Response Generator

In [17]:
class ResponseGeneratorFromScratch:
    """Response generation using retrieval-based approach"""

    def __init__(self):
        self.tokenizer = PreProcessing.Tokenizer()
        self.vectorizer = PreProcessing.tf_idf_Vectorizer()
        self.responses_df = None
        self.response_vectors = None

    def train(self, intents_responses_df):
        """Train response generator on intent-response pairs"""
        self.responses_df = intents_responses_df

        # Tokenize intent templates
        tokenized_templates = [self.tokenizer.tokenize(template)
                               for template in self.responses_df['intent_template'].tolist()]

        # Vectorize intent templates
        self.vectorizer.fit(tokenized_templates)
        self.response_vectors = self.vectorizer.transform(tokenized_templates)

        return self

    def generate_response(self, user_input, intent=None, entities=None, threshold=0.3):
        """Generate response based on user input"""
        # If intent is provided, use it
        if intent and intent in set(self.responses_df['intent']):
            # Filter responses by intent
            intent_responses = self.responses_df[self.responses_df['intent'] == intent]
            # Select random response for this intent
            response_idx = random.randint(0, len(intent_responses) - 1)
            response = intent_responses.iloc[response_idx]['response']
            return response, 1.0

        # Otherwise, use retrieval-based approach
        tokens = self.tokenizer.tokenize(user_input)
        input_vector = self.vectorizer.transform([tokens])

        # Calculate similarities
        similarities = cosine_similarity_from_scratch(input_vector, self.response_vectors)

        # Get best match
        best_match_idx = np.argmax(similarities[0])
        best_match_score = similarities[0][best_match_idx]

        # Check if similarity is above threshold
        if best_match_score >= threshold:
            response = self.responses_df.iloc[best_match_idx]['response']
            return response, best_match_score
        else:
            return "I'm not sure how to respond to that.", 0.0

# Dialogue Manager

In [18]:
class DialogueManagerFromScratch:
    """Simple state-based dialogue manager"""

    def __init__(self):
        self.context = {}
        self.current_state = "greeting"
        self.state_transitions = {}

    def add_state_transition(self, current_state, intent, next_state):
        """Define state transition based on intent"""
        if current_state not in self.state_transitions:
            self.state_transitions[current_state] = {}
        self.state_transitions[current_state][intent] = next_state

    def update_state(self, intent):
        """Update dialogue state based on intent"""
        if self.current_state in self.state_transitions and intent in self.state_transitions[self.current_state]:
            self.current_state = self.state_transitions[self.current_state][intent]
        return self.current_state

    def update_context(self, entities):
        """Update context with extracted entities"""
        if entities:
            for entity_type, values in entities.items():
                self.context[entity_type] = values

    def get_context(self):
        """Get current dialogue context"""
        return self.context

    def reset(self):
        """Reset dialogue manager"""
        self.context = {}
        self.current_state = "greeting"

# Chat Bot

In [19]:
class EnhancedChatbot:
    """Chatbot with advanced preprocessing and dialogue management - DailyDialog Dataset Support"""

    def __init__(self, bot_name="EnhancedBot", model_type):
        self.name = bot_name
        self.intent_classifier = EnhancedIntentClassifier(model_type=model_type)
        self.entity_extractor = EntityExtractorFromScratch()
        self.response_generator = ResponseGeneratorFromScratch()
        self.dialogue_manager = DialogueManagerFromScratch()

        # Topic mapping
        self.topic_mapping = {
            1: "Ordinary Life",
            2: "School Life",
            3: "Culture & Education",
            4: "Attitude & Emotion",
            5: "Relationship",
            6: "Tourism",
            7: "Health",
            8: "Work",
            9: "Politics",
            10: "Finance"
        }

        # Dialog act mapping
        self.act_mapping = {
            1: "inform",
            2: "question",
            3: "directive",
            4: "commissive"
        }

        # Emotion mapping
        self.emotion_mapping = {
            0: "neutral",
            1: "anger",
            2: "disgust",
            3: "fear",
            4: "happiness",
            5: "sadness",
            6: "surprise"
        }

    def train(self, dialogues_file="dialogues_text.txt", topics_file="dialogues_topic.txt",
              acts_file="dialogues_act.txt", emotions_file="dialogues_emotion.txt", test_size=0.2):
        """Train the chatbot on DailyDialog dataset"""
        print(f"Loading DailyDialog dataset...")

        try:
            # Load the dialogue files
            with open(dialogues_file, 'r', encoding='utf-8') as f:
                dialogues = f.readlines()

            with open(topics_file, 'r', encoding='utf-8') as f:
                topics = f.readlines()

            with open(acts_file, 'r', encoding='utf-8') as f:
                acts = f.readlines()

            with open(emotions_file, 'r', encoding='utf-8') as f:
                emotions = f.readlines()

            # Process the data into structured format
            processed_data = self._process_dailydialog_data(dialogues, topics, acts, emotions)

            # Train intent classifier (using dialog acts as intents)
            print(f"Training intent classifier on {len(processed_data['utterances'])} examples")
            self.intent_classifier.train(
                texts=processed_data['utterances'],
                labels=processed_data['acts_labels'],
                test_size=test_size
            )

            # Add entity patterns
            self._setup_entity_extractor()

            # Prepare response generator data
            responses_df = self._prepare_response_data(processed_data)

            # Train response generator
            self.response_generator.train(responses_df)

            # Setup dialogue flow based on dialog acts
            self._setup_dailydialog_flow()

            print(f"{self.name} is now trained and ready to chat!")
            return self

        except Exception as e:
            print(f"Error loading DailyDialog dataset: {e}")
            print("Creating sample data instead...")
            # Fall back to sample data if files not found
            return self._train_with_sample_data(test_size)

    def _process_dailydialog_data(self, dialogues, topics, acts, emotions):
        """Process DailyDialog dataset into structured format"""
        utterances = []
        topic_labels = []
        acts_labels = []
        emotion_labels = []
        dialogue_ids = []
        responses = []

        for i, (dialogue, topic, act_seq, emotion_seq) in enumerate(zip(dialogues, topics, acts, emotions)):
            # Split the dialogue into utterances
            dialogue_utterances = dialogue.strip().split('__eou__')

            # Clean up the last element which might be empty
            if dialogue_utterances and dialogue_utterances[-1].strip() == '':
                dialogue_utterances = dialogue_utterances[:-1]

            # Get topic
            topic_id = int(topic.strip())
            topic_name = self.topic_mapping.get(topic_id, "Unknown")

            # Get acts and emotions
            acts_ids = [int(a) for a in act_seq.strip().split()]
            acts_names = [self.act_mapping.get(a, "Unknown") for a in acts_ids]

            emotion_ids = [int(e) for e in emotion_seq.strip().split()]
            emotion_names = [self.emotion_mapping.get(e, "Unknown") for e in emotion_ids]

            # Check if the lengths match
            min_len = min(len(dialogue_utterances), len(acts_ids), len(emotion_ids))

            for j in range(min_len):
                utterance = dialogue_utterances[j].strip()
                if not utterance:
                    continue

                # Store the utterance data
                utterances.append(utterance)
                topic_labels.append(topic_name)
                acts_labels.append(acts_names[j])
                emotion_labels.append(emotion_names[j])
                dialogue_ids.append(i)

                # For each utterance, find its response (next utterance in dialogue)
                if j < min_len - 1:
                    response = dialogue_utterances[j+1].strip()
                    responses.append(response)
                else:
                    # If it's the last utterance, use a generic response
                    responses.append("I understand.")

        return {
            'utterances': utterances,
            'topic_labels': topic_labels,
            'acts_labels': acts_labels,
            'emotion_labels': emotion_labels,
            'dialogue_ids': dialogue_ids,
            'responses': responses
        }

    def _prepare_response_data(self, processed_data):
        """Prepare response data for training the response generator"""
        # Create a dataframe with intent (act), template, and response
        data = {
            'intent': processed_data['acts_labels'],
            'intent_template': processed_data['utterances'],
            'response': processed_data['responses'],
            'emotion': processed_data['emotion_labels'],
            'topic': processed_data['topic_labels']
        }

        # Create DataFrame
        responses_df = pd.DataFrame(data)

        # Add fallback responses
        fallback_responses = [
            "I'm not sure I understand. Could you rephrase your question?",
            "I didn't catch that. Can you try asking in a different way?",
            "I'm a bit confused. Could you elaborate?",
            "I'm still learning. Could you try expressing that differently?",
            f"I'm {self.name}, and I'm trying to understand. Could you provide more context?"
        ]

        for resp in fallback_responses:
            new_row = {
                'intent': 'fallback',
                'intent_template': 'unknown query',
                'response': resp,
                'emotion': 'neutral',
                'topic': 'Unknown'
            }
            responses_df = pd.concat([responses_df, pd.DataFrame([new_row])], ignore_index=True)

        return responses_df

    def _train_with_sample_data(self, test_size=0.2):
        """Train with sample data when DailyDialog data is not available"""
        print("Training with sample data...")
        intents_df, responses_df = self._create_sample_data()

        # Train intent classifier
        self.intent_classifier.train(
            texts=intents_df['text'].tolist(),
            labels=intents_df['intent'].tolist(),
            test_size=test_size
        )

        # Add entity patterns
        self._setup_entity_extractor()

        # Train response generator
        self.response_generator.train(responses_df)

        # Setup dialogue flow
        self._setup_dialogue_flow()

        print(f"{self.name} is now trained with sample data and ready to chat!")
        return self

    def _create_sample_data(self):
        """Create sample training data"""
        # Sample intents
        intents_data = {
            'text': [
                "hello", "hi there", "hey", "howdy",
                "goodbye", "bye", "see you later", "see ya",
                "what time is it", "tell me the time", "what's the current time",
                "what's the weather like", "how's the weather today", "tell me about the weather",
                "what can you do", "what are your features", "help me", "what are your capabilities",
                "tell me a joke", "say something funny", "do you know any jokes",
                "who are you", "what are you", "tell me about yourself"
            ],
            'intent': [
                "greeting", "greeting", "greeting", "greeting",
                "goodbye", "goodbye", "goodbye", "goodbye",
                "time_query", "time_query", "time_query",
                "weather_query", "weather_query", "weather_query",
                "help", "help", "help", "help",
                "joke", "joke", "joke",
                "bot_identity", "bot_identity", "bot_identity"
            ]
        }

        # Sample responses
        responses_data = {
            'intent': [
                "greeting", "greeting", "greeting",
                "goodbye", "goodbye", "goodbye",
                "time_query", "time_query",
                "weather_query", "weather_query",
                "help", "help",
                "joke", "joke", "joke",
                "bot_identity", "bot_identity",
                "fallback", "fallback"
            ],
            'intent_template': [
                "hello", "hi there", "hey",
                "goodbye", "bye", "see you later",
                "what time is it", "tell me the time",
                "what's the weather like", "how's the weather today",
                "what can you do", "help me",
                "tell me a joke", "say something funny", "do you know any jokes",
                "who are you", "what are you",
                "unknown query", "I don't understand"
            ],
            'response': [
                "Hello! How can I help you today?",
                "Hi there! What can I do for you?",
                "Hey! What's up?",
                "Goodbye! Have a great day!",
                "Bye! Talk to you later!",
                "See you soon! Take care!",
                "I'm sorry, I don't have access to the current time.",
                "I can't tell you the exact time right now.",
                "I don't have access to weather information currently.",
                "I can't check the weather for you at the moment.",
                f"I'm {self.name}, a chatbot that can understand your intents and respond accordingly.",
                "I can help with basic conversation, answer questions about myself, and more!",
                "Why don't scientists trust atoms? Because they make up everything!",
                "What do you call fake spaghetti? An impasta!",
                "I would tell you a joke about UDP, but you might not get it.",
                f"I'm {self.name}, an AI assistant built with Python from scratch!",
                "I'm a custom-built chatbot designed to understand intents and respond naturally.",
                "I'm not sure I understand. Could you rephrase your question?",
                "I didn't catch that. Can you try asking in a different way?"
            ],
            'emotion': ['neutral'] * 19,  # Add neutral emotion for all sample responses
            'topic': ['Ordinary Life'] * 19  # Add default topic for all sample responses
        }

        # Create DataFrames
        intents_df = pd.DataFrame(intents_data)
        responses_df = pd.DataFrame(responses_data)

        return intents_df, responses_df

    def _setup_entity_extractor(self):
        """Setup entity extraction rules"""
        # Add common patterns for entity extraction
        self.entity_extractor.add_entity_pattern(
            "date",
            r"\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b|\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b|\b(?:today|tomorrow|yesterday)\b"
        )

        self.entity_extractor.add_entity_pattern(
            "time",
            r"\b(?:\d{1,2}:\d{2}(?::\d{2})?(?:\s*[ap]\.?m\.?)?|\d{1,2}\s*[ap]\.?m\.?)\b"
        )

        self.entity_extractor.add_entity_pattern(
            "location",
            r"\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s+(?:[A-Z]{2}|[A-Z][a-z]+)\b"
        )

        self.entity_extractor.add_entity_pattern(
            "number",
            r"\b\d+(?:\.\d+)?\b"
        )

        self.entity_extractor.add_entity_pattern(
            "email",
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
        )

        # Add patterns for topic-specific entities
        self.entity_extractor.add_entity_pattern(
            "health_terms",
            r"\b(?:doctor|hospital|clinic|symptoms|treatment|medicine|therapy|disease|illness|patient|diagnosis|healthcare|medical|prescription|allergy|vaccination|surgery)\b"
        )

        self.entity_extractor.add_entity_pattern(
            "finance_terms",
            r"\b(?:money|bank|account|loan|credit|debit|investment|stocks|bonds|mortgage|interest|finance|financial|budget|savings|debt|payment|transaction|invoice|currency|dollar|euro|pound|yen|yuan)\b"
        )

    def _setup_dailydialog_flow(self):
        """Setup dialogue flow based on DailyDialog acts"""
        # Clear existing transitions
        self.dialogue_manager.state_transitions = {}

        # Initial state
        self.dialogue_manager.current_state = "initial"

        # Add transitions based on dialog acts
        self.dialogue_manager.add_state_transition("initial", "inform", "responding_to_inform")
        self.dialogue_manager.add_state_transition("initial", "question", "responding_to_question")
        self.dialogue_manager.add_state_transition("initial", "directive", "responding_to_directive")
        self.dialogue_manager.add_state_transition("initial", "commissive", "responding_to_commissive")

        # From each state, allow transitions to any other state
        for source_state in ["responding_to_inform", "responding_to_question",
                             "responding_to_directive", "responding_to_commissive"]:
            for act in ["inform", "question", "directive", "commissive"]:
                target_state = f"responding_to_{act}"
                self.dialogue_manager.add_state_transition(source_state, act, target_state)

    def _setup_dialogue_flow(self):
        """Setup dialogue flow for sample data"""
        # Define state transitions
        self.dialogue_manager.add_state_transition("greeting", "greeting", "awaiting_query")
        self.dialogue_manager.add_state_transition("greeting", "help", "help")
        self.dialogue_manager.add_state_transition("greeting", "bot_identity", "identity")

        self.dialogue_manager.add_state_transition("awaiting_query", "time_query", "time_response")
        self.dialogue_manager.add_state_transition("awaiting_query", "weather_query", "weather_response")
        self.dialogue_manager.add_state_transition("awaiting_query", "joke", "joke_response")
        self.dialogue_manager.add_state_transition("awaiting_query", "help", "help")
        self.dialogue_manager.add_state_transition("awaiting_query", "goodbye", "farewell")
        self.dialogue_manager.add_state_transition("awaiting_query", "bot_identity", "identity")

        self.dialogue_manager.add_state_transition("time_response", "greeting", "awaiting_query")
        self.dialogue_manager.add_state_transition("time_response", "goodbye", "farewell")

        self.dialogue_manager.add_state_transition("weather_response", "greeting", "awaiting_query")
        self.dialogue_manager.add_state_transition("weather_response", "goodbye", "farewell")

        self.dialogue_manager.add_state_transition("joke_response", "greeting", "awaiting_query")
        self.dialogue_manager.add_state_transition("joke_response", "goodbye", "farewell")

        self.dialogue_manager.add_state_transition("help", "greeting", "awaiting_query")
        self.dialogue_manager.add_state_transition("help", "goodbye", "farewell")

        self.dialogue_manager.add_state_transition("identity", "greeting", "awaiting_query")
        self.dialogue_manager.add_state_transition("identity", "goodbye", "farewell")

        self.dialogue_manager.add_state_transition("farewell", "greeting", "awaiting_query")

    def process(self, user_input):
        """Process user input and generate response"""
        # Classify intent
        intent, confidence = self.intent_classifier.predict(user_input)

        # Extract entities
        entities = self.entity_extractor.extract_entities(user_input)

        # Update dialogue context with entities
        self.dialogue_manager.update_context(entities)

        # Update dialogue state
        current_state = self.dialogue_manager.update_state(intent)

        # Generate response
        response, response_confidence = self.response_generator.generate_response(
            user_input,
            intent=intent if confidence > 0.5 else None,
            entities=entities
        )

        # Format response with entity information if applicable
        response = self._format_response_with_entities(response, entities)

        return {
            "response": response,
            "intent": intent,
            "intent_confidence": confidence,
            "entities": entities,
            "state": current_state
        }

    def _format_response_with_entities(self, response, entities):
        """Format response with extracted entity information"""
        formatted_response = response

        if entities:
            entity_info = []
            for entity_type, values in entities.items():
                if values:
                    entity_info.append(f"{entity_type}: {', '.join(values)}")

            if entity_info:
                # Only add entity information if relevant to the response
                if "I detected" not in formatted_response and "identified" not in formatted_response:
                    entity_str = ", ".join(entity_info)
                    formatted_response += f"\n\nI detected: {entity_str}"

        return formatted_response

    def chat(self):
        """Interactive chat session"""
        print(f"\n{self.name}: Hello! How can I help you today? (type 'exit' to quit)")

        while True:
            user_input = input("You: ").strip()

            if user_input.lower() in ['exit', 'quit', 'bye']:
                print(f"{self.name}: Goodbye! Have a great day!")
                break

            result = self.process(user_input)
            print(f"{self.name}: {result['response']}")

            # Optionally show more details about processing
            # print(f"Debug: Intent: {result['intent']} ({result['intent_confidence']:.2f}), State: {result['state']}")

    def reset(self):
        """Reset chatbot state"""
        self.dialogue_manager.reset()
        print(f"{self.name}: I've reset my conversation memory.")

In [20]:
# Create and train chatbot with DailyDialog dataset
chatbot = EnhancedChatbot(bot_name="DailyBot", model_type="naive_bayes")

# Train with DailyDialog data
chatbot.train(
    dialogues_file="dialogues_text.txt",
    topics_file="dialogues_topic.txt",
    acts_file="dialogues_act.txt",
    emotions_file="dialogues_emotion.txt"
)


Loading DailyDialog dataset...
Training intent classifier on 102979 examples
Tokenizing 102979 training examples...
Vectorizing text...
Training naive_bayes model...
Intent Classifier Accuracy: 0.6618
DailyBot is now trained and ready to chat!


<__main__.EnhancedChatbot at 0x7f9121c5de90>

In [21]:
# Start interactive chat
chatbot.chat()


DailyBot: Hello! How can I help you today? (type 'exit' to quit)
You: Hi
DailyBot: Sure.He has already married , a father of two boys .
You: exit
DailyBot: Goodbye! Have a great day!


In [None]:
# Create and train chatbot with DailyDialog dataset
chatbot2 = EnhancedChatbot(bot_name="DailyBot", model_type="svm")

# Train with DailyDialog data
chatbot.train(
    dialogues_file="dialogues_text.txt",
    topics_file="dialogues_topic.txt",
    acts_file="dialogues_act.txt",
    emotions_file="dialogues_emotion.txt"
)


Error extracting MWEs. Using empty list.
Loading DailyDialog dataset...
Training intent classifier on 102979 examples
Tokenizing 102979 training examples...
Vectorizing text...
Training naive_bayes model...
Intent Classifier Accuracy: 0.6652


In [None]:
chatbot2.chat()

In [None]:
# Create and train chatbot with DailyDialog dataset
chatbot3 = EnhancedChatbot(bot_name="DailyBot", model_type="random_forest")

# Train with DailyDialog data
chatbot.train(
    dialogues_file="dialogues_text.txt",
    topics_file="dialogues_topic.txt",
    acts_file="dialogues_act.txt",
    emotions_file="dialogues_emotion.txt"
)

In [None]:
chatbot3.chat()