# import data and process it

In [None]:
import pandas as pd

In [None]:
# Load the FAQ dataset from a CSV file in the working directory.
# Later cells read its 'Question' and 'Answer' columns.

data = pd.read_csv('QAs.csv')

In [None]:
# Show the first 30 rows as a quick visual sanity check of the loaded data

data.head(30)

In [None]:
import re

# Build a list of (question, answer) tuples from the raw, uncleaned columns;
# find_best_answer() later iterates over these pairs.
faq_pairs = list(zip(data['Question'], data['Answer']))

# Print the first five question-answer pairs to verify correctness
print(faq_pairs[:5])

# Count the distinct question strings; NUM_CLASSES is later passed as
# num_labels when the BERT classification head is created.

unique_questions = data['Question'].unique()
NUM_CLASSES = len(unique_questions)

print(f"Number of unique questions: {NUM_CLASSES}")

def preprocess_text(text):
    """Normalize a piece of text for the models in this notebook.

    Every character that is not an ASCII letter or whitespace is removed
    (digits and punctuation included), the result is lower-cased and
    leading/trailing whitespace is stripped.

    Args:
        text: The raw string to clean. Non-string values (for example
            ``None`` or the float NaN pandas uses for empty CSV cells)
            are treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-strings, so missing values are mapped
    # to an empty string instead of aborting a whole column-wide .apply().
    if not isinstance(text, str):
        return ''
    # Remove special characters, numbers, etc., and retain only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase and strip surrounding spaces
    return text.lower().strip()

# Clean the 'Question' and 'Answer' columns with preprocess_text and store the
# results in new columns; the original columns are left untouched.
data['cleaned_question'] = data['Question'].apply(preprocess_text)
data['cleaned_answer'] = data['Answer'].apply(preprocess_text)

# Print the first rows of the cleaned columns to check the result
print(data[['cleaned_question', 'cleaned_answer']].head())

# Logistic regression with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Extract the cleaned questions (model input) and cleaned answers (target).
# Each distinct cleaned answer string acts as one class label.
questions = data['cleaned_question'].values
answers = data['cleaned_answer'].values

# Convert questions to numerical features using TF-IDF (vocabulary capped at 1000 terms).
# The vectorizer is fitted on all questions, i.e. before the train/test split below.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(questions)
y = answers

# Split data into training and testing sets (80% train, 20% test); fixed seed for repeatable splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model with scikit-learn's default settings
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model: overall accuracy plus the per-class report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

In [None]:
import pickle

# Save the trained Logistic Regression model to a file.
# predict_answer_log() reads this file back on every call.
with open('log_reg_model.pkl', 'wb') as model_file:
    pickle.dump(log_reg, model_file)

# Save the fitted TF-IDF vectorizer to a file; both predict_answer_log() and
# predict_answer_rand() load it to turn new questions into features.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
def predict_answer_log(new_question):
    """Predict an answer for ``new_question`` with the saved Logistic Regression model.

    The model and the TF-IDF vectorizer are loaded from
    ``log_reg_model.pkl`` and ``tfidf_vectorizer.pkl`` on every call.

    Args:
        new_question: The user's question as free text.

    Returns:
        The class label predicted by the model for the question (the
        answer text the model was trained with).

    Raises:
        FileNotFoundError: If either pickle file does not exist.
    """
    # NOTE: pickle can execute arbitrary code while loading; only load the
    # files written by this notebook.
    with open('log_reg_model.pkl', 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # Load the saved TF-IDF vectorizer
    with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)

    # The vectorizer was fitted on questions cleaned with preprocess_text, so
    # the incoming question must go through the same cleaning to produce
    # comparable features.
    cleaned_question = preprocess_text(new_question)

    # Convert the cleaned question into TF-IDF features
    new_question_tfidf = loaded_vectorizer.transform([cleaned_question])

    # Predict the answer using the loaded model
    predicted_answer = loaded_model.predict(new_question_tfidf)

    return predicted_answer[0]

In [None]:
# Try the Logistic Regression predictor on a sample question
new_question = "How can I detect vulnerabilities in my assets?"
predicted_answer = predict_answer_log(new_question)
print(f"Predicted Answer: {predicted_answer}")

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest classifier on the same TF-IDF train split used for
# Logistic Regression. No random_state is set, so results can vary between runs.
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

# Predict on the shared test split and report accuracy plus the per-class report
y_pred_rf = rf_clf.predict(X_test)
print("Accuracy (Random Forest):", accuracy_score(y_test, y_pred_rf))
print("Classification Report (Random Forest):", classification_report(y_test, y_pred_rf))

In [None]:
import pickle

# Save the trained Random Forest model to a file.
# The object written must be rf_clf (not log_reg); predict_answer_rand()
# loads this file expecting the Random Forest classifier.
with open('rand_for_model.pkl', 'wb') as model_file:
    pickle.dump(rf_clf, model_file)

In [None]:
def predict_answer_rand(new_question):
    """Predict an answer for ``new_question`` with the saved Random Forest model.

    The model and the TF-IDF vectorizer are loaded from
    ``rand_for_model.pkl`` and ``tfidf_vectorizer.pkl`` on every call.

    Args:
        new_question: The user's question as free text.

    Returns:
        The class label predicted by the model for the question (the
        answer text the model was trained with).

    Raises:
        FileNotFoundError: If either pickle file does not exist.
    """
    # NOTE: pickle can execute arbitrary code while loading; only load the
    # files written by this notebook.
    with open('rand_for_model.pkl', 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # Load the saved TF-IDF vectorizer
    with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)

    # The vectorizer was fitted on questions cleaned with preprocess_text, so
    # the incoming question must go through the same cleaning to produce
    # comparable features.
    cleaned_question = preprocess_text(new_question)

    # Convert the cleaned question into TF-IDF features
    new_question_tfidf = loaded_vectorizer.transform([cleaned_question])

    # Predict the answer using the loaded model
    predicted_answer = loaded_model.predict(new_question_tfidf)

    return predicted_answer[0]

In [None]:
# Try the Random Forest predictor on the same sample question
new_question = "How can I detect vulnerabilities in my assets?"
predicted_answer = predict_answer_rand(new_question)
print(f"Predicted Answer: {predicted_answer}")

# pretrained BERT

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the 'cleaned_question' data; every question is padded or
# truncated to exactly 128 tokens.
tokens = tokenizer.batch_encode_plus(
    data['cleaned_question'].tolist(),  # One cleaned question string per row
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True
)

# Pull out the encoded fields. No return_tensors argument was given, so these
# are not tensors yet; the next cell converts them with torch.tensor.
input_ids = tokens['input_ids']  # Token IDs for each question
attention_masks = tokens['attention_mask']  # Attention masks for each question


In [None]:
from sklearn.model_selection import train_test_split
import torch

# Convert lists to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

# Dummy labels (You need to replace this with actual labels from your dataset)
labels = torch.tensor([0] * len(input_ids))  # Replace with actual labels

# Split ids, masks and labels in ONE call so all three are shuffled with the
# same permutation. Splitting the masks in a separate, unseeded call pairs
# each question with the attention mask of a different row.
# random_state makes the split reproducible, as in the other splits in this notebook.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

# Create the DataLoader for our training set: each item is an
# (input_ids, attention_mask, label) triple, drawn in random order in batches of 64
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=64)

# Create the DataLoader for our test set: same triple layout, read in
# sequential order in batches of 64
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=64)

In [None]:
from transformers import BertForSequenceClassification
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so the optimizer is taken from torch.optim.
from torch.optim import AdamW

# Load BERT with a classification head (output layer).
# NUM_CLASSES is the number of unique questions computed earlier in the notebook.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_CLASSES)

# Define optimizer.
# NOTE(review): eps and weight_decay are pinned to what the deprecated
# transformers AdamW is believed to default to (1e-6 and 0.0), so switching
# implementation does not silently change training - confirm, or drop them to
# adopt the torch.optim defaults.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm

# Fine-tune the BERT classifier for a fixed number of passes over the training set
epochs = 4
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    model.train()

    # Summed batch loss for the current epoch; reset at the start of every
    # epoch, so after the loop it holds the last epoch's total only.
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Each batch is the (input_ids, attention_mask, labels) triple from the TensorDataset
        batch_input_ids, batch_input_mask, batch_labels = batch

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Passing labels makes the model compute the loss itself
        outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

In [None]:
# Evaluate the model on the held-out test batches
model.eval()

# eval_accuracy first accumulates the count of correct predictions;
# eval_loss is initialised here but never updated below.
eval_loss, eval_accuracy = 0, 0
for batch in test_dataloader:
    batch_input_ids, batch_input_mask, batch_labels = batch

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_input_mask)

    # Predicted class = index of the largest logit
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    eval_accuracy += (predictions == batch_labels).sum().item()

# Turn the correct-prediction count into a fraction of the test set
eval_accuracy /= len(test_labels)
print(f'Test Accuracy: {eval_accuracy:.2f}')
# This is the mean *training* loss per batch of the final epoch
# (total_loss comes from the training loop), not a test loss.
print(f"Loss: {total_loss / len(train_dataloader)}")

In [None]:
# Save the fine-tuned model to the 'models/bert' directory
model.save_pretrained('models/bert')

# Save the tokenizer to the same directory so both can be reloaded together
tokenizer.save_pretrained('models/bert')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Reload the fine-tuned model and tokenizer from the directory they were saved to
model = BertForSequenceClassification.from_pretrained('models/bert')  # Use the path where you saved your model
tokenizer = BertTokenizer.from_pretrained('models/bert')

# Put the model in evaluation mode for inference
model.eval()

def find_best_answer(question, faq_pairs):
    """Return the FAQ answer whose question scores highest against ``question``.

    For every ``(faq_question, faq_answer)`` pair the cleaned user question
    and the cleaned FAQ question are joined with " [SEP] ", tokenized to 128
    tokens and passed through the module-level ``model``. The largest logit
    of that forward pass is used as the pair's score.

    NOTE(review): the score is the maximum class logit of a sequence
    classifier rather than a dedicated relevance score - confirm that this
    ranking is what is intended.

    Args:
        question: The user's question as free text.
        faq_pairs: Iterable of ``(faq_question, faq_answer)`` tuples.

    Returns:
        The answer of the first pair that reaches the highest score, or
        ``None`` if ``faq_pairs`` is empty.
    """
    query_text = preprocess_text(question)

    top_score = -float('inf')
    top_answer = None

    for known_question, known_answer in faq_pairs:
        # Text fed to the model: user question and FAQ question in one sequence
        candidate_text = query_text + " [SEP] " + preprocess_text(known_question)
        encoded = tokenizer.encode_plus(
            candidate_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            logits = model(
                encoded['input_ids'],
                attention_mask=encoded['attention_mask']
            ).logits

        pair_score = torch.max(logits).item()

        # Only a strictly higher score replaces the current best
        if not pair_score > top_score:
            continue
        top_score = pair_score
        top_answer = known_answer

    return top_answer

In [None]:
# Try the BERT-based lookup on a sample question
question = "How do I track application licenses?"
best_answer = find_best_answer(question, faq_pairs)

print(f"Best Answer: {best_answer}")

# pretrained SBERT

In [None]:
from sklearn.model_selection import train_test_split
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, InputExample, losses,util
from datasets import Dataset
import numpy as np
import torch

In [None]:
# Start from a pretrained sentence-embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Split data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert training data into InputExample format: every row is a matching
# (question, answer) pair, hence the constant label 1.0
train_examples = [InputExample(texts=[row['Question'], row['Answer']], label=1.0) for _, row in train_data.iterrows()]

# Convert validation data into InputExample format
validation_examples = [InputExample(texts=[row['Question'], row['Answer']], label=1.0) for _, row in val_data.iterrows()]

# Create the DataLoader for the training data (validation examples are
# scored one by one in evaluate_model, so they need no loader)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Define the loss function for similarity learning.
# The dataset contains positive pairs only. A cosine-similarity regression
# loss given nothing but label 1.0 gets no negative signal and just pushes
# all embeddings together. MultipleNegativesRankingLoss is designed for
# positive-pair data: the other answers in a batch serve as negatives, and
# the label stored on each InputExample is not used by this loss.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

def evaluate_model(model, validation_data, threshold=0.5):
    """Score question/answer pairs by cosine similarity and threshold them.

    Each example's two texts are encoded with ``model``; the cosine
    similarity of the two embeddings is the raw prediction. A similarity
    strictly above ``threshold`` counts as class 1, anything else as class 0.

    Args:
        model: Encoder exposing ``eval()`` and ``encode(list_of_texts)``.
        validation_data: Iterable of examples with ``texts`` (two strings)
            and ``label`` attributes.
        threshold: Similarity cut-off for predicting class 1.

    Returns:
        A tuple ``(accuracy, similarities, predicted_labels, labels)`` where
        ``accuracy`` is the fraction of predicted labels equal to the
        example labels.
    """
    model.eval()
    similarities, gold_labels = [], []

    with torch.no_grad():
        for pair in validation_data:
            vectors = model.encode([pair.texts[0], pair.texts[1]])
            # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here
            score_matrix = cosine_similarity([vectors[0]], [vectors[1]])
            similarities.append(score_matrix[0][0])
            gold_labels.append(pair.label)

    # Binarise the similarities with the requested threshold
    predicted = [int(score > threshold) for score in similarities]
    hits = [int(guess == truth) for guess, truth in zip(predicted, gold_labels)]
    accuracy = np.mean(hits)

    # Return the accuracy together with the raw material for further metrics
    return accuracy, similarities, predicted, gold_labels

# Fine-tune the model for three rounds, printing the round number.
# Each round is a separate fit() call with its own warmup_steps=100;
# no evaluation metrics are computed inside this loop.
for epoch in range(3):
    print(f'Epoch : {epoch+1}')

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,  # Train for one epoch at a time
        warmup_steps=100
    )

In [None]:
# Evaluate the model on validation data and extract metrics.
# A pair counts as predicted-positive when its cosine similarity exceeds 0.7.

accuracy, predictions, predicted_labels, labels = evaluate_model(model, validation_examples, threshold=0.7)

# Recompute accuracy with scikit-learn (overwrites the value returned above)
# and add recall and F1. Every validation example carries label 1.0, so the
# metrics describe how many true pairs clear the threshold.
accuracy = accuracy_score(labels, predicted_labels)
recall = recall_score(labels, predicted_labels)
f1 = f1_score(labels, predicted_labels)

print(f'Accuracy: {accuracy}')
print(f'recall: {recall}')
print(f'f1 : {f1}')

In [None]:
# Save the fine-tuned model to the 'sbert_finetuned_model' directory
model.save('sbert_finetuned_model')

# Load the fine-tuned model when needed (here: reload it straight away from
# the directory just written)
model = SentenceTransformer('sbert_finetuned_model')

In [None]:
from sentence_transformers import util
import torch

# Clean every FAQ answer (in dataset row order) and embed the whole list in
# one encode() call; the result is kept as a tensor for similarity search.
answers = list(map(preprocess_text, data['Answer']))
answer_embeddings = model.encode(answers, convert_to_tensor=True)

def find_best_answer_sbert(question):
    """Return the FAQ answer most similar to ``question`` using the SBERT model.

    The question is cleaned with ``preprocess_text``, embedded with the
    module-level ``model`` and compared by cosine similarity against the
    precomputed ``answer_embeddings``.

    Args:
        question: The user's question as free text.

    Returns:
        The original (uncleaned) text of the best-matching answer from
        ``data['Answer']``.
    """
    # Preprocess and encode the input question
    question_cleaned = preprocess_text(question)
    question_embedding = model.encode(question_cleaned, convert_to_tensor=True)

    # Compute cosine similarities between the question and FAQ answers
    cosine_scores = util.pytorch_cos_sim(question_embedding, answer_embeddings)

    # Position of the answer with the highest similarity score
    best_score_idx = torch.argmax(cosine_scores).item()

    # answer_embeddings follow the row order of data['Answer'], so the
    # position maps straight back to the original answer. Returning that text
    # (instead of the cleaned copy used for embedding) keeps digits,
    # punctuation and casing, like find_best_answer does.
    return data['Answer'].iloc[best_score_idx]

In [None]:
# Try the SBERT-based lookup on a sample question
question = "How can I detect vulnerabilities in my assets?"
best_answer = find_best_answer_sbert(question)
print(f"Best Answer: {best_answer}")