# In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

# Preprocessing
# Stop-word sets already loaded, keyed by language name. Filled lazily so the
# corpus is read on first use only, not once per preprocess() call.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess(text):
    """Normalize *text* for TF-IDF matching.

    The text is lowercased, tokenized with ``word_tokenize``, stripped of
    English stop words, lemmatized with ``WordNetLemmatizer`` and joined
    back into one space-separated string.

    Only stop words are filtered out; any other token the tokenizer emits
    (punctuation tokens included, if it produces them) is kept.

    Args:
        text: The string to normalize.

    Returns:
        The lemmatized, stop-word-free tokens joined by single spaces.
    """
    # Tokenization
    tokens = word_tokenize(text.lower())

    # Remove stopwords. The set is cached because this function is called
    # once per corpus line and once per user query.
    stop_words = _english_stop_words()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Return preprocessed text as string
    return " ".join(lemmas)

# Load conversation dataset
def load_dataset(filename):
    """Read *filename* as UTF-8 text and return its lines as a list.

    Each element is one line of the file exactly as read, so line
    terminators are still attached.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        return list(handle)

# Preprocess dataset
def preprocess_dataset(dataset):
    """Apply ``preprocess`` to every entry of *dataset*.

    Returns a new list holding the preprocessed entries in input order.
    """
    return [preprocess(entry) for entry in dataset]

# Generate response
def generate_response(user_input, vectorizer, vectorized_corpus, corpus,
                      *, threshold=None, fallback_responses=None):
    """Return the corpus entry most similar to *user_input*, or a fallback.

    The input is preprocessed, vectorized with *vectorizer* and compared by
    cosine similarity against every row of *vectorized_corpus*. If the best
    score is below *threshold*, a random fallback reply is returned instead.

    Args:
        user_input: Raw text typed by the user.
        vectorizer: Fitted object exposing ``transform()``; the script in
            this file passes a ``TfidfVectorizer``.
        vectorized_corpus: Matrix to compare against. Row ``i`` is assumed
            to correspond to ``corpus[i]``.
        corpus: Indexable collection of reply strings.
        threshold: Minimum similarity for a corpus match to be used.
            Defaults to the module-level ``accuracy_rate``.
        fallback_responses: Replies to choose from when no match is good
            enough. Defaults to ``chatbot_responses["fallback"]``.

    Returns:
        The selected reply.
    """
    # Resolve defaults at call time: the module-level values are defined
    # further down the file, after this function.
    if threshold is None:
        threshold = accuracy_rate
    if fallback_responses is None:
        fallback_responses = chatbot_responses["fallback"]

    preprocessed_input = preprocess(user_input)
    vectorized_input = vectorizer.transform([preprocessed_input])
    cosine_similarities = cosine_similarity(vectorized_input, vectorized_corpus).flatten()
    most_similar_index = np.argmax(cosine_similarities)

    # Best match is too weak to trust: answer with a canned fallback.
    if cosine_similarities[most_similar_index] < threshold:
        return random.choice(fallback_responses)
    return corpus[most_similar_index]

# Minimum cosine similarity the best corpus match must reach;
# generate_response() returns a fallback reply for anything lower.
accuracy_rate = 0.85

# Canned replies, keyed by situation.
chatbot_responses = {
    "fallback": [
        "I'm sorry, I didn't understand. Can you please rephrase your query?",
        "Apologies, I'm not programmed to respond to that. Can you ask something else?",
        "I'm afraid I can't provide an answer to that.",
    ]
}

# Raw conversation lines and their preprocessed counterparts (same order).
conversation_dataset = load_dataset('training_dataset.txt')
preprocessed_dataset = preprocess_dataset(conversation_dataset)

# Fit TF-IDF on the preprocessed lines; one matrix row per conversation line.
vectorizer = TfidfVectorizer()
vectorized_corpus = vectorizer.fit_transform(preprocessed_dataset)

# Deploy the datadex_chatbot model on Hugging Face or any other platform

# Chat with the datadex_chatbot
# Keeps prompting until stdin is closed (EOF) or the user interrupts (Ctrl-C).
while True:
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        # End the session cleanly instead of crashing with a traceback.
        print()
        break
    response = generate_response(user_input, vectorizer, vectorized_corpus, conversation_dataset)
    print("datadex_chatbot:", response)
