## Importing files

In [129]:
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from nltk import word_tokenize, sent_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from random import choice
from nltk import bigrams, FreqDist
from nltk.util import ngrams

## Data Pre-processing function

In [119]:
# Data preprocessing

def preprocess_text_file(file_path):
    # Read the text file
    with open(file_path, 'r') as file:
        text = file.read()

    # Convert the text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenize the text into words or tokens
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove numbers or digits
    tokens = [token for token in tokens if not token.isdigit()]

    # Remove short words
    tokens = [token for token in tokens if len(token) > 2]

    # Rejoin tokens into a normalized text
    normalized_text = ' '.join(tokens)

    return normalized_text

normalized_text =preprocess_text_file('data.txt.txt')

In [120]:
normalized_text

'natural language processing nlp field artificial intelligence focus interaction computer human language combine technique linguistics computer science machine learning enable machine understand interpret generate human language nlp revolutionized various application including machine translation sentiment analysis chatbots information retrieval here 500word text providing overview nlp natural language processing nlp branch artificial intelligence deal interaction computer human language aim enable computer understand interpret respond human language meaningful way one fundamental challenge nlp ambiguity complexity natural language human language exhibit wide range variation including different dialect slang idiom cultural reference additionally word often multiple meaning context play crucial role understanding intended sense tackle challenge nlp employ various technique algorithm one key component text preprocessing involves task like tokenization stemming partofspeech tagging tokeni

## Data Preperation 

In [122]:
#Prepare the dataset

old_text = normalized_text
new_text = "your_new_text_data"
combined_text = old_text + " " + new_text

In [123]:
# Tokenize the text

sent_tokens = sent_tokenize(combined_text)
word_tokens = [word_tokenize(t) for t in sent_tokens]

## Trigram Model

In [124]:
#Create a trigram model

n = 3
train_data, padded_sents = padded_everygram_pipeline(n, word_tokens)

In [125]:
#Train the model with more text

model = MLE(n)
model.fit(train_data, padded_sents)

In [126]:
#Generate text with various questions

def generate_text(prompt, num_words, model):
    word_list = model.generate(num_words, text_seed=prompt.split())
    response = ' '.join(word_list)
    return response

In [127]:
# Example questions
questions = [
    "What is the importance",
    "How does it work",
    "What are the benefits",
    "How can I improve",
    "What should I consider"
]

In [128]:
for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {generate_text(question, 20, model)}")
    print("\n")

Question: What is the importance
Answer: based user query text classification information retrieval system employ technique like keyword matching relevance ranking retrieve relevant document based user


Question: How does it work
Answer: modeling machine translation sentiment analysis employ machine learning algorithm classify text positive negative neutral sentiment machine translation sentiment analysis aim


Question: What are the benefits
Answer: challenge nlp employ various technique algorithm one key component text preprocessing involves task like information extraction question answering system leverage


Question: How can I improve
Answer: language data enabling accurate nuanced language processing nlp branch artificial intelligence focus interaction computer human language aim enable computer understand


Question: What should I consider
Answer: involves automatic translation text generation model capture complex pattern dependency language data enabling accurate translation 

## Bi-gram Model

In [130]:
# Create a bigram model:

# Generate bigrams and their frequency distribution
bigrams = list(ngrams(tokens, 2))
bigram_freq_dist = FreqDist(bigrams)

# Prepare the dataset for training
train_data, padded_sents = padded_everygram_pipeline(2, word_tokens)


In [131]:
# Train the bigram model
model = MLE(2)
model.fit(train_data, padded_sents)

In [132]:
# Generate text based on user inputs:
def generate_sentence(model, num_words, seed_word):
    sentence = [seed_word]
    for _ in range(num_words - 1):
        next_word = model.generate(1, text_seed=sentence)
        sentence.append(next_word)

    return ' '.join(sentence)

In [133]:
# Example questions to the model
questions = [
    "What is natural language processing?",
    "How does artificial intelligence relate to linguistics?",
    "Can computers understand human language?",
]

In [135]:
# Generate answers for the questions
for question in questions:
    tokens = nltk.word_tokenize(question)
    seed_word = choice(tokens)
    generated_sentence = generate_sentence(model, 30, seed_word)
    print(f"Q: {question}\nA: {generated_sentence}\n")

Q: What is natural language processing?
A: What focus interaction computer human language pair question answering system your_new_text_data </s> neural machine translation sentiment analysis chatbots information large volume text application social medium monitoring brand reputation analysis employ

Q: How does artificial intelligence relate to linguistics?
A: artificial intelligence focus enabling accurate translation across different dialect slang idiom cultural reference additionally word token stemming partofspeech tagging tokenization break text positive negative neutral sentiment analysis large knowledge base

Q: Can computers understand human language?
A: computers machine translation sentiment analysis customer feedback analysis another prominent application including machine learning enable computer human language nlp technique identify extract important area nlp continues advance hold potential revolutionize

