# WEEK 7: VI. TEXT SUMMARIZATION
## a. BASIC TEXT SUMMARIZATION USING TF-IDF AND COSINE SIMILARITY


In [1]:
# 1. Import Required Libraries
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary datasets for tokenization and stopwords
nltk.download('punkt')
nltk.download('stopwords')

# 2. Define Sample Text
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data.
Challenges in natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.
"""

# 3. Preprocess the Text
# Split the text into sentences
sentences = nltk.sent_tokenize(text)

# Get the set of stopwords in English
stop_words = set(stopwords.words('english'))

# Function to preprocess each sentence by removing stopwords
def preprocess_sentence(sentence):
    return ' '.join([word for word in sentence.split() if word.lower() not in stop_words])

# Preprocess all the sentences
preprocessed_sentences = [preprocess_sentence(sentence) for sentence in sentences]

# 4. Compute TF-IDF Matrix
# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Transform the preprocessed sentences into TF-IDF features
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

# 5. Compute Cosine Similarity
# Compute cosine similarity between TF-IDF vectors of the sentences
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 6. Generate Summary
# Function to generate a summary by ranking sentences based on their similarity scores
def generate_summary(sentences, sim_matrix, top_n=2):
    # Compute the sum of similarity scores for each sentence
    scores = sim_matrix.sum(axis=1)
    
    # Rank sentences based on the scores and select the top 'n' sentences
    ranked_sentences = [sentences[i] for i in scores.argsort()[-top_n:]]
    
    # Return the summary as a string
    return ' '.join(ranked_sentences)

# Generate and print the summary
summary = generate_summary(sentences, cosine_sim_matrix)
print("Summary:")
print(summary)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Summary:

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. Challenges in natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.


## b. ABSTRACTIVE TEXT SUMMARIZATION WITH TRANSFORMERS

In [2]:
! pip install transformers datasets



In [3]:
# complte code 7 week 2 question

# 1. Install required libraries (run this in your environment first)
# !pip install transformers datasets

# 2. Import Required Libraries
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset

# 3. Load the Dataset
# Load the CNN/DailyMail dataset (test split, 1% for demonstration purposes)
dataset = load_dataset('cnn_dailymail', '3.0.0', split='test[:1%]')

# 4. Load Pre-trained BART Model and Tokenizer
# Load pre-trained BART model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# 5. Summarize Text
# Function to summarize text
def summarize(text):
    # Tokenize the input text
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    
    # Generate the summary
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    
    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Sample Input and Output
# Summarize a few sample articles from the dataset
for i in range(3):  # Loop through first 3 samples for demonstration
    article = dataset[i]['article']
    print(f"Original Text {i+1}: {article}\n")
    
    # Generate and print the summary
    summary = summarize(article)
    print(f"Summary {i+1}: {summary}\n")


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Original Text 1: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesd

# WEEK 8: VII. TEXT ENTAILMENT APPLICATIONS IN PYTHON
## a. BASIC TEXT ENTAILMENT USING SIMPLE RULE-BASED METHODS

In [5]:
# 1. Import necessary libraries and load dataset
from datasets import load_dataset
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score

# Download NLTK tokenizers
nltk.download('punkt')

# Sample dataset (as CNN/DailyMail isn't well-suited for entailment tasks, we'll create sample pairs)
data = pd.DataFrame({
    'sentence1': ["The cat is on the mat.", "The sun is shining brightly.", "The game is over."],
    'sentence2': ["The mat has a cat.", "The sky is bright.", "The players are done playing."],
    'label': [False, True, True]  # Labels for entailment (True/False)
})

# 2. Preprocess the data: tokenize and convert to lowercase
def preprocess(text):
    return nltk.word_tokenize(text.lower())

# Apply preprocessing to both sentences
data['sentence1_tokens'] = data['sentence1'].apply(preprocess)
data['sentence2_tokens'] = data['sentence2'].apply(preprocess)

# 3. Define simple rule-based method for text entailment
def simple_rule_based_entailment(s1, s2):
    return set(s2).issubset(set(s1))

# Apply the rule-based entailment check
data['prediction'] = data.apply(lambda row: simple_rule_based_entailment(row['sentence1_tokens'], row['sentence2_tokens']), axis=1)

# 4. Evaluate the model
accuracy = accuracy_score(data['label'], data['prediction'])
print(f'Accuracy: {accuracy}')

# Output the data for reference
print(data[['sentence1', 'sentence2', 'label', 'prediction']])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.3333333333333333
                      sentence1                      sentence2  label  \
0        The cat is on the mat.             The mat has a cat.  False   
1  The sun is shining brightly.             The sky is bright.   True   
2             The game is over.  The players are done playing.   True   

   prediction  
0       False  
1       False  
2       False  


## b.NATURAL LANGUAGE INFERENCE WITH BERT

In [6]:
# Step 1: Import Required Libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Step 2: Load the Dataset
dataset = load_dataset('snli')

# Check the first few examples to understand the structure
print(dataset['train'].features)  # Check the features of the training dataset
print(dataset['train'][0:5])       # Print the first 5 examples from the training dataset

# Step 3: Preprocess the Data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

# Apply preprocessing to the dataset (train, validation, and test splits)
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Check the structure of the dataset again
print(encoded_dataset)

# Step 4: Inspect the label column directly to understand its structure
print("Label examples:")
print(encoded_dataset['train']['label'][0:5])  # Print the first 5 labels

# Step 5: Identify unique labels
unique_labels = set(encoded_dataset['train']['label'])
print(f"Unique labels in the dataset: {unique_labels}")

# Step 6: Define label mapping and handle unexpected labels
label_dict = {0: 0, 1: 1, 2: 2}  # Adjust this as necessary based on your labels

# Step 7: Map the labels correctly, handle unexpected labels
def map_labels(example):
    # Use the label_dict for mapping, and set a default for unexpected labels
    label = example['label']
    return {'labels': label_dict.get(label, -1)}  # Map to -1 if the label is unexpected

encoded_dataset = encoded_dataset.map(map_labels)

# Set the format for PyTorch
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 8: Load the Pre-Trained BERT Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Step 9: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy='epoch',     # Evaluation during each epoch
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Strength of L2 regularization
    logging_dir='./logs',            # Directory for logs
)

# Initialize the Trainer with the model, training arguments, and datasets
trainer = Trainer(
    model=model,                         # The BERT model for training
    args=training_args,                  # Training arguments
    train_dataset=encoded_dataset['train'],  # Training dataset
    eval_dataset=encoded_dataset['validation'],  # Validation dataset
)

# Step 10: Train the Model
trainer.train()

# Step 11: Evaluate the Model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Step 12: Make Predictions
premise = "A man inspects the uniform of a figure in some East Asian country."
hypothesis = "The man is sleeping."

# Tokenize the input example
inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True, max_length=128)

# Get model prediction
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits).item()

# Convert prediction to human-readable label
label_map = {0: 'entailment', 1: 'contradiction', 2: 'neutral'}
print(f"Predicted Label: {label_map[predicted_label]}")

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}
{'premise': ['A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'Children smiling and waving at camera', 'Children smiling and waving at camera'], 'hypothesis': ['A person is training his horse for a competition.', 'A person is at a diner, ordering an omelette.', 'A person is outdoors, on a horse.', 'They are smiling at their parents', 'There are children present'], 'label': [1, 2, 0, 1, 0]}


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 550152
    })
})
Label examples:
[1, 2, 0, 1, 0]
Unique labels in the dataset: {0, 1, 2, -1}


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111341198889022, max=1.0)…

Epoch,Training Loss,Validation Loss


/usr/local/src/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## WEEK 9: VIII. WORD AND SENTENCE EMBEDDING 
## a. BASIC WORD EMBEDDINGS WITH TF-IDF

In [7]:
#a
# Step 1: Import Required Libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 2: Load the Dataset
newsgroups = fetch_20newsgroups(subset='train')
texts = newsgroups.data  # Extract the document texts

# Step 3: Preprocess the Text Data
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = vectorizer.fit_transform(texts)  # Fit and transform the text data

# Step 4: Explore the TF-IDF Matrix
print("TF-IDF matrix shape:", X_tfidf.shape)  # Display shape of the matrix
X_dense = X_tfidf.todense()  # Convert to dense format for better visualization
print(X_dense[0])  # Print the first document's TF-IDF vector


TF-IDF matrix shape: (11314, 1000)
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.12190754 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0. 

## b. GENERATING WORD EMBEDDINGS USING WORD2VEC AND GLOVE

In [8]:
# Import necessary libraries
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import re

# Load a sample corpus (For demonstration, we'll use some random sentences)
corpus = [
    "This is a sample document.",
    "Another example document.",
    "Word embeddings capture semantic relationships.",
    "GloVe and Word2Vec are popular embedding methods."
]

# Preprocess the text data (Tokenize and remove stop words)
stop_words = stopwords.words('english')

def preprocess(text):
    # Remove special characters, convert to lowercase, and tokenize
    return [word for word in simple_preprocess(text) if word not in stop_words]

# Apply preprocessing to the corpus
tokenized_corpus = [preprocess(doc) for doc in corpus]

# Display tokenized text
print(tokenized_corpus)


[['sample', 'document'], ['another', 'example', 'document'], ['word', 'embeddings', 'capture', 'semantic', 'relationships'], ['glove', 'word', 'vec', 'popular', 'embedding', 'methods']]


In [3]:
# Train word2vex model

# Import necessary library
from gensim.models import Word2Vec

# Train Word2Vec model (Skip-gram model, vector_size=100, window=5)
word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, min_count=1)

# Display the vector for a sample word (e.g., "document")
word_vector = word2vec_model.wv['document']
print(f"Word2Vec vector for 'document': {word_vector}")

# Find similar words to 'document'
similar_words = word2vec_model.wv.most_similar('document')
print(f"Words similar to 'document': {similar_words}")


Word2Vec vector for 'document': [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826

In [9]:
import numpy as np

# Load pre-trained GloVe vectors (assuming 'glove.6B.100d.txt' is downloaded and available)
def load_glove_model(glove_file):
    glove_model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            glove_model[word] = embedding
    return glove_model

# Load GloVe model (Provide path to your downloaded GloVe file)
glove_file = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'  # Ensure this file is downloaded and placed in the working directory
glove_model = load_glove_model(glove_file)

# Display GloVe vector for a word (e.g., "document")
print(f"GloVe vector for 'document': {glove_model.get('document')}")

# Calculate similarity between words (cosine similarity)
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

word1, word2 = 'document', 'sample'
similarity = cosine_similarity(glove_model.get(word1), glove_model.get(word2))
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")


GloVe vector for 'document': [-2.7285e-01 -9.6449e-02  4.1131e-01  3.7925e-01  8.9352e-01  4.5227e-01
  1.9478e-01 -3.6985e-01  5.9704e-01  1.3387e-01  4.2878e-01 -2.8012e-01
  2.0141e-01 -1.9995e-02 -6.2983e-02  7.1399e-01  8.9025e-01 -3.1009e-01
 -1.9911e-01 -4.6591e-01 -8.8145e-01 -5.4318e-01 -5.2839e-01  7.0794e-02
 -3.1042e-01 -9.8677e-01  1.0283e-01  1.6911e-01 -4.4878e-01  1.6171e-01
  3.9394e-01  1.2655e-01 -1.2540e-01 -6.6462e-02 -1.2977e-01 -3.9406e-02
  4.4811e-02 -4.2534e-01  2.6742e-02 -3.8609e-01 -8.4547e-01 -6.4412e-02
  6.8974e-01  2.4521e-01 -7.3434e-01 -7.7389e-01 -1.5336e-01 -2.9057e-01
 -6.8358e-01 -3.8785e-01  1.2230e+00  1.7723e-01  1.6004e-01  8.3723e-01
 -3.1238e-01 -1.3138e+00 -2.6000e-01 -4.8754e-01  1.6751e+00  1.7320e-01
 -2.9494e-01  1.6038e-01 -5.3087e-01 -9.0950e-01  6.7436e-01 -5.2625e-01
 -3.0406e-01  8.5552e-01 -2.6879e-01 -9.0492e-01  3.0380e-01  2.0591e-01
  3.3439e-01 -6.2308e-01  6.4306e-02  2.2179e-01 -9.2076e-02  2.1894e-01
 -1.4015e+00 -4.4588e-

## WEEK 10: IX. QUESTION ANSWERING
## a. BASIC Q&A SYSTEM USING KEYWORD MATCHING

data set was created manually 

filte name = week 10 .json
What is Python
content:

"root":{5 items
"What is Python?":string"Python is a high-level programming language."
"What is Machine Learning?":string"Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data."
"What is the capital of France?":string"The capital of France is Paris."
"Who is the president of the United States?":string"The current president of the United States is Joe Biden."
"How does the internet work?":string"The internet is a global network of computers that communicate using standardized protocols like TCP/IP."
}

In [12]:
import json

# Step 1: Load the Dataset
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)["root"]

# Step 2: Exact Matching Function with Normalization
def normalize_text(text):
    # Convert text to lowercase and strip leading/trailing spaces
    return text.lower().strip()

def find_answer(question, qa_data):
    normalized_question = normalize_text(question)
    
    # Iterate through the predefined questions and check for an exact match
    for q, a in qa_data.items():
        normalized_q = normalize_text(q)
        if normalized_q == normalized_question:  # Check if the question matches exactly
            return a
    
    return "Sorry, I don't know the answer to that question."

# Step 3: Main Interaction Loop
def main():
    # Load the Q&A dataset
    qa_data = load_dataset('/kaggle/input/week10-dataset/week_10.json')
    
    # Start a loop for user interaction
    while True:
        user_question = input("Ask a question: ").strip()  # Get input from the user
        if user_question.lower() in ['exit', 'quit']:  # Exit if user types 'exit' or 'quit'
            print("Goodbye!")
            break
        answer = find_answer(user_question, qa_data)  # Find the best answer
        print("Answer:", answer)

# Step 4: Run the Q&A system
if __name__ == "__main__":
    main()



Ask a question:  What is Python?


Answer: Python is a high-level programming language.


Ask a question:  What is Machine Learning?


Answer: Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data.


Ask a question:  quit


Goodbye!


## b. BUILDING A Q&A SYSTEM WITH BERT

In [13]:
! pip install transformers torch tokenizers

  pid, fd = os.forkpty()




In [15]:
#: Importing the Necessary Libraries

from transformers import BertForQuestionAnswering, BertTokenizer
import torch
# Loading the Pre-trained BERT Model and Tokenizer

# Load the pre-trained BERT tokenizer and model for question answering
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
#  Function to Answer Questions Using BERT

def answer_question(question, context):
    # Tokenize the input question and context
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    
    # Get model's predicted start and end positions of the answer
    with torch.no_grad():
        outputs = model(**inputs)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    # Get the most likely start and end token positions
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    # Convert token indices to tokens and then join them into a single string answer
    answer_tokens = inputs['input_ids'][0][start_index: end_index + 1]
    answer = tokenizer.decode(answer_tokens)

    return answer
def main():
    # Get user input for context once
    context = input("\nProvide the context (paragraph): ")

    while True:
        # Get user input for the question
        question = input("Ask a question (or type 'exit' to quit): ")

        # Exit the system if the user types 'exit'
        if question.lower() in ['exit', 'quit']:
            print("Exiting the Q&A system.")
            break

        # Get the answer from the BERT model
        answer = answer_question(question, context)
        print(f"Answer: {answer}")

if __name__ == "__main__":
    main()


Provide the context (paragraph):  Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Quick progress in the field of deep learning, beginning in 2010s, allowed neural networks to surpass many previous approaches in performance
Ask a question (or type 'exit' to quit):  what is machine learning


Answer: a field of study in artificial intelligence


Ask a question (or type 'exit' to quit):  quit


Exiting the Q&A system.


## WEEK 11: X. MACHINE TRANSLATION
## a. BASIC MACHINE TRANSLATION USING RULE-BASED METHODS


In [17]:
#Step 1: Define the Bilingual Dictionary

# Bilingual dictionary (English to French)
dictionary = {
    'hello': 'bonjour',
    'world': 'monde',
    'my': 'mon',
    'name': 'nom',
    'is': 'est',
    'good': 'bon',
    'morning': 'matin',
    'thank': 'merci',
    'you': 'vous',
    'goodbye': 'au revoir'
}

# step 2 : Define Basic Grammar Rules

# Basic grammar rule: Subject-Verb-Object (SVO)
grammar_rules = {
    'SVO': ['subject', 'verb', 'object']
}

#Step 3: Translation Function


def translate(sentence):
    # Convert sentence to lowercase and split it into words
    words = sentence.lower().split()
    
    # Translate each word using the dictionary; if the word is not in the dictionary, keep it unchanged
    translated_words = [dictionary.get(word, word) for word in words]
    
    # Join the translated words back into a sentence
    return ' '.join(translated_words)



In [18]:
# Example usage
sentence = "Hello world"
print(translate(sentence))  # Output: bonjour monde

bonjour monde


In [19]:
sentence2 = "Good morning"
print(translate(sentence2))

bon matin


In [20]:
sentence3 = "Thank you"
print(translate(sentence3)) 

merci vous


## b. ENGLISH TO FRENCH TRANSLATION USING SEQ2SEQ WITH ATTENTION

In [21]:
! pip install --upgrade tensorflow-datasets



In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds

# Step 1: Load dataset from CSV using Pandas
data_path = '/kaggle/input/wmt-2014-english-french/wmt14_translate_fr-en_test.csv'
data = pd.read_csv(data_path)

# Check the first few rows and the column names of the dataframe
print(data.head())
print("Columns in the DataFrame:", data.columns.tolist())  # Print the actual column names

# Ensure the dataframe contains the required columns
expected_columns = ['en', 'fr']
assert all(col in data.columns for col in expected_columns), f"CSV must contain {expected_columns} columns"

# Step 2: Convert the DataFrame to a TensorFlow Dataset
# Create a TensorFlow dataset from the DataFrame
train_dataset = tf.data.Dataset.from_tensor_slices((data['en'].values, data['fr'].values))

# Print the first example to verify conversion
for english, french in train_dataset.take(1):
    print(f'English: {english.numpy().decode("utf-8")}, French: {french.numpy().decode("utf-8")}')

# Optional: Define constants for batch size and max length
BATCH_SIZE = 64
MAX_LENGTH = 40

# Optional: Tokenization process
# Tokenizer setup for input (English) and output (French)
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for en, fr in train_dataset), target_vocab_size=2**13)
tokenizer_fr = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (fr.numpy() for en, fr in train_dataset), target_vocab_size=2**13)

# Encoding function
def encode(en_t, fr_t):
    en_t = [tokenizer_en.vocab_size] + tokenizer_en.encode(en_t.numpy().decode('utf-8')) + [tokenizer_en.vocab_size + 1]
    fr_t = [tokenizer_fr.vocab_size] + tokenizer_fr.encode(fr_t.numpy().decode('utf-8')) + [tokenizer_fr.vocab_size + 1]
    return en_t, fr_t

def tf_encode(en_t, fr_t):
    return tf.py_function(encode, [en_t, fr_t], [tf.int64, tf.int64])

# Prepare the dataset with encoding
train_dataset = train_dataset.map(tf_encode)

# Filter sequences longer than MAX_LENGTH
def filter_max_length(en, fr, max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(en) <= max_length, tf.size(fr) <= max_length)

train_dataset = train_dataset.filter(filter_max_length)

# Shuffle and batch the dataset
train_dataset = train_dataset.shuffle(20000).padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Print the first training example after processing
for en, fr in train_dataset.take(1):
    print(f'Encoded English: {en.numpy()}')
    print(f'Encoded French: {fr.numpy()}')

                                                  en  \
0              Spectacular Wingsuit Jump Over Bogota   
1  Sportsman Jhonathan Florez jumped from a helic...   
2  Wearing a wingsuit, he flew past over the famo...   
3                           A black box in your car?   
4  As America's road planners struggle to find th...   

                                                  fr  
0  Spectaculaire saut en "wingsuit" au-dessus de ...  
1  Le sportif Jhonathan Florez a sauté jeudi d'un...  
2  Equipé d'un wingsuit (une combinaison munie d'...  
3               Une boîte noire dans votre voiture ?  
4  Alors que les planificateurs du réseau routier...  
Columns in the DataFrame: ['en', 'fr']
English: Spectacular Wingsuit Jump Over Bogota, French: Spectaculaire saut en "wingsuit" au-dessus de Bogota
Encoded English: [[7639 2326   14 ... 7498 7429 7640]
 [7639   61    1 ...    0    0    0]
 [7639 6971 3706 ...    0    0    0]
 ...
 [7639   74  150 ...    0    0    0]
 [7639 1536   1

## WEEK 12: XI.DIALOGUE SYSTEM
## a. BASIC RULE-BASED CHATBOT USING PYTHON NLTK

In [25]:
# Step 2: Import Libraries
import nltk
from nltk.chat.util import Chat, reflections

# Step 3: Define Rules (Predefined pairs)
pairs = [
    (r"my name is (.*)", ["Hello %1, How are you today?"]),
    (r"hi|hey|hello", ["Hello", "Hey there"]),
    (r"what is your name?", ["I am a bot created by [Your Name]."]),
    (r"how are you?", ["I'm doing good. How about you?"]),
    (r"sorry (.*)", ["No problem", "It's okay", "You don't need to be sorry"]),
    (r"quit", ["Bye! Take care."])
]

# Step 4: Create the Chatbot
def chatbot():
    print("Hi, I'm the chatbot you created. Type 'quit' to exit.") 
    chat = Chat(pairs, reflections)
    chat.converse()
    
# Step 5: Run the Chatbot
if __name__ == "__main__":
    chatbot()

Hi, I'm the chatbot you created. Type 'quit' to exit.


> hello


Hello


> how are you?


I'm doing good. How about you?


> quit


Bye! Take care.


## b. BUILDING A CHATBOT USING SEQ2SEQ MODELS

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Load and Preprocess the Dataset
def load_data(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"Error reading the file: {e}")
        return []

    conversations = []
    for line in lines:
        line_parts = line.strip().split(' +++$+++ ')
        if len(line_parts) == 5:
            conversations.append(line_parts[4])  # Store only the dialogue part

    print(f"Loaded {len(conversations)} conversations.")  # Debug info
    return conversations

def create_pairs(conversations):
    input_texts = []
    target_texts = []

    for i in range(len(conversations) - 1):
        input_text = conversations[i]
        target_text = conversations[i + 1]
        target_text = '\t' + target_text + '\n'  # Add start and end tokens
        input_texts.append(input_text)
        target_texts.append(target_text)

    print(f"Created {len(input_texts)} input-target pairs.")  # Debug info
    return input_texts, target_texts

# Load the dataset (replace with the correct path to movie_lines.txt)
conversations = load_data('/kaggle/input/week12-dataset/movie_lines_rev 2.txt')  # Make sure this file exists
input_texts, target_texts = create_pairs(conversations)

# Check if input_texts and target_texts are populated
if not input_texts or not target_texts:
    raise ValueError("No input or target texts were created. Please check the dataset.")

# Step 2: Tokenize and Pad the Data
# Tokenize the input and output data
input_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Pad sequences to ensure uniform length
max_encoder_seq_length = max(len(seq) for seq in input_sequences) if input_sequences else 0
max_decoder_seq_length = max(len(seq) for seq in target_sequences) if target_sequences else 0

encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

# Prepare decoder output data
decoder_output_data = np.zeros((len(target_sequences), max_decoder_seq_length, len(target_tokenizer.word_index) + 1), dtype='float32')

for i, seq in enumerate(target_sequences):
    for t, word_idx in enumerate(seq):
        if t > 0:
            decoder_output_data[i, t - 1, word_idx] = 1.0

# Step 3: Build the Seq2Seq Model
num_encoder_tokens = len(input_tokenizer.word_index) + 1
num_decoder_tokens = len(target_tokenizer.word_index) + 1

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=256)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Save the encoder states to pass to the decoder
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Step 4: Compile and Train the Model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (adjust epochs and batch size as needed)
model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=64, epochs=100)

# Step 5: Inference Setup (for generating responses)
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Step 6: Decode a Sequence (Generate a Response)
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate an empty target sequence with only the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index['\t']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample the next token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_tokenizer.index_word.get(sampled_token_index, '')
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence and states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()  # Trim any extra whitespace

# Step 7: Test the Chatbot
def chat():
    print("Chatbot is ready! Type 'quit' to exit.")
    while True:
        input_text = input("You: ")
        if input_text.lower() == 'quit':
            print("Exiting the chat. Goodbye!")
            break

        input_sequence = input_tokenizer.texts_to_sequences([input_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_encoder_seq_length, padding='post')
        response = decode_sequence(input_sequence)
        print(f"Bot: {response}")

if __name__ == "__main__":
    chat()

Loaded 1153 conversations.
Created 1152 input-target pairs.
Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step - accuracy: 7.4284e-04 - loss: 0.4994
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.0015 - loss: 0.4596
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.0030 - loss: 0.4852
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.0026 - loss: 0.4477
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.0028 - loss: 0.4324
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.0028 - loss: 0.4557
Epoch 7/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.0029 - loss: 0.4740
Epoch 8/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy

You:  hi


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step


KeyError: '\t'