In [21]:
# Mount Google Drive at /content/drive; later cells %cd into folders under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Work from the BART project folder on Drive so relative paths such as ./400k.csv resolve.
%cd /content/drive/MyDrive/Colab Notebooks/NCAI Chatbot/BART

/content/drive/MyDrive/Colab Notebooks/NCAI Chatbot/BART


In [23]:
# !pip install virtualenv
# !virtualenv venv
!pip install accelerate -U
!pip install transformers
# !source venv/bin/activate



In [24]:
import pandas as pd
import re

In [25]:
# Load the raw export with every value read as a string, then keep only the
# first two columns and give them stable names.
df = pd.read_csv('./400k.csv', dtype=str, low_memory=False)
df = df.iloc[:, :2]
columns = ['Question', 'Answer']
df.columns = columns

# Strip bracketed ID tags such as [Q614] and [A614] from every cell.
# The vectorised `.str.replace` replaces the deprecated `DataFrame.applymap`
# and, like the original per-cell lambda, leaves missing values (NaN) untouched.
ID_TAG_PATTERN = r'\[\w+\d+\]'
df = df.apply(lambda column: column.str.replace(ID_TAG_PATTERN, '', regex=True))

df


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



Unnamed: 0,Question,Answer
0,How would you define anomaly detection?,Anomaly detection refers to finding patterns i...
1,Can you give an overview of anomaly detection?,Anomaly detection involves spotting unusual in...
2,Describe the concept of anomaly detection.,Anomaly detection entails detecting outliers o...
3,What is the purpose of anomaly detection?,Anomaly detection aims to pinpoint irregularit...
4,Why is anomaly detection important in data ana...,Anomaly detection is crucial for identifying p...
...,...,...
400046,How does stemming impact the recognition of s...,Stemming may simplify the recognition of sent...
400047,Can lemmatization be applied to maintain the ...,Lemmatization may face challenges in maintain...
400048,How does stemming contribute to the efficienc...,Stemming improves the efficiency of informati...
400049,Can lemmatization be adapted for languages wi...,"Yes, lemmatization can be adapted for languag..."


In [26]:
# Summarise column dtypes and non-null counts to see how much data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400051 entries, 0 to 400050
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Question  367720 non-null  object
 1   Answer    367720 non-null  object
dtypes: object(2)
memory usage: 6.1+ MB


In [27]:
# Flag whether at least one cell anywhere in the DataFrame is missing.
any_null = df.isna().to_numpy().any()

# Report the outcome.
print(f"Are there any null values in the DataFrame? {any_null}")


Are there any null values in the DataFrame? True


In [28]:
# Select every row in which the question, the answer, or both are missing.
null_rows = df.loc[df[['Question', 'Answer']].isna().any(axis=1)]

# Show the affected rows.
print("Rows with null values:")
print(null_rows)

Rows with null values:
                                                 Question Answer
958                                                   NaN    NaN
959                                                   NaN    NaN
960                                                   NaN    NaN
961                                                   NaN    NaN
962                                                   NaN    NaN
...                                                   ...    ...
126519                                                  <    NaN
186007                                                       NaN
271635   How can Porter Stemmer be implemented in Python?    NaN
271900  How can we implement the Snowball Stemmer in P...    NaN
271930  How can we implement Lancaster Stemmer in Python?    NaN

[32342 rows x 2 columns]


In [29]:
# Keep only the rows where both the question and the answer are present.
df = df.dropna(subset=['Question', 'Answer'])
df

Unnamed: 0,Question,Answer
0,How would you define anomaly detection?,Anomaly detection refers to finding patterns i...
1,Can you give an overview of anomaly detection?,Anomaly detection involves spotting unusual in...
2,Describe the concept of anomaly detection.,Anomaly detection entails detecting outliers o...
3,What is the purpose of anomaly detection?,Anomaly detection aims to pinpoint irregularit...
4,Why is anomaly detection important in data ana...,Anomaly detection is crucial for identifying p...
...,...,...
400046,How does stemming impact the recognition of s...,Stemming may simplify the recognition of sent...
400047,Can lemmatization be applied to maintain the ...,Lemmatization may face challenges in maintain...
400048,How does stemming contribute to the efficienc...,Stemming improves the efficiency of informati...
400049,Can lemmatization be adapted for languages wi...,"Yes, lemmatization can be adapted for languag..."


In [30]:
# Keep just the first 100 cleaned pairs; every later cell (tokenization,
# training) sees only these rows.
df = df.iloc[:100]
df

Unnamed: 0,Question,Answer
0,How would you define anomaly detection?,Anomaly detection refers to finding patterns i...
1,Can you give an overview of anomaly detection?,Anomaly detection involves spotting unusual in...
2,Describe the concept of anomaly detection.,Anomaly detection entails detecting outliers o...
3,What is the purpose of anomaly detection?,Anomaly detection aims to pinpoint irregularit...
4,Why is anomaly detection important in data ana...,Anomaly detection is crucial for identifying p...
...,...,...
95,How do network anomalies and application perfo...,Network anomalies impact network traffic and c...
96,What differentiates network anomalies and appl...,Network anomalies are monitored to detect irre...
97,Why is it important to distinguish between net...,Distinguishing between network anomalies and a...
98,Can you provide examples of web application se...,Examples of web application security anomalies...


In [31]:
from transformers import BartTokenizer

# Load the pretrained tokenizer that belongs to the facebook/bart-large checkpoint.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Define the tokenize function
def tokenize_function(examples):
    """Tokenize the ``'Question'`` entry of one record with the global ``tokenizer``.

    Args:
        examples: Mapping-like record (for instance a DataFrame row) that has a
            ``'Question'`` entry.

    Returns:
        Whatever ``tokenizer`` returns for the question text, or ``None`` when
        the entry is not a ``str`` (a notice is printed in that case).
    """
    question = examples['Question']

    # Guard: anything that is not text (e.g. NaN) is reported and skipped.
    if not isinstance(question, str):
        print(f"Non-string or NaN value found: {question}")
        return None

    return tokenizer(question, padding="longest", truncation=True, return_tensors="pt")

# Tokenize row by row: axis=1 hands each DataFrame row to tokenize_function.
# NOTE(review): in the cells shown, tokenized_dataset is only inspected here and
# later deleted; CustomDataset tokenizes the rows itself.
tokenized_dataset = df.apply(tokenize_function, axis=1)

tokenized_dataset

Unnamed: 0,0
0,"[input_ids, attention_mask]"
1,"[input_ids, attention_mask]"
2,"[input_ids, attention_mask]"
3,"[input_ids, attention_mask]"
4,"[input_ids, attention_mask]"
...,...
95,"[input_ids, attention_mask]"
96,"[input_ids, attention_mask]"
97,"[input_ids, attention_mask]"
98,"[input_ids, attention_mask]"


In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load both the pretrained BART tokenizer and the seq2seq model (facebook/bart-large).
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

class CustomDataset(Dataset):
    """Question/answer rows exposed as a map-style dataset.

    Each access tokenizes one row with the module-level ``tokenizer``: the
    ``'Question'`` text supplies ``input_ids`` / ``attention_mask`` and the
    ``'Answer'`` text supplies ``labels``.
    """

    def __init__(self, data):
        # `data` is read positionally through `.iloc` and by the 'Question' /
        # 'Answer' keys, i.e. it is used like a pandas DataFrame.
        self.data = data

    def __len__(self):
        """Return the number of rows held by the dataset."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string with the settings shared by questions and answers."""
        return tokenizer(text, padding="longest", truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        """Return flattened ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        row = self.data.iloc[idx]
        source = self._encode(row['Question'])
        target_ids = self._encode(row['Answer']).input_ids

        # Every tensor is flattened to 1-D before being handed to the collator.
        sample = {}
        sample['input_ids'] = source.input_ids.flatten()
        sample['attention_mask'] = source.attention_mask.flatten()
        sample['labels'] = target_ids.flatten()
        return sample
# Wrap the cleaned DataFrame in the dataset class and build the seq2seq collator
# (given the tokenizer and the currently loaded model) that batches its items.
dataset = CustomDataset(df)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [33]:
# Training hyper-parameters. A per-device batch of 2 with 2 accumulation steps
# means gradients from 4 examples per device feed each optimizer step.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Mixed precision needs a CUDA device; fall back to full precision elsewhere
    # so this cell does not fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

# Start every run of this cell from the pretrained checkpoint.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Re-create the collator for the model that is about to be trained. The collator
# built in the previous cell still references the earlier model instance, which
# would otherwise keep a second full copy of BART alive during training.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss
10,15.6318
20,10.7991
30,5.5175
40,4.3581
50,3.7918
60,3.0993
70,3.1653






TrainOutput(global_step=75, training_loss=6.339216219584147, metrics={'train_runtime': 75.3039, 'train_samples_per_second': 3.984, 'train_steps_per_second': 0.996, 'total_flos': 9493780512768.0, 'train_loss': 6.339216219584147, 'epoch': 3.0})

In [34]:
import plotly.express as px

# Collect everything the Trainer logged during the run.
history = trainer.state.log_history

# Convert history to DataFrame
history_df = pd.DataFrame(history)

# Plot each logged loss, if present. Rows that do not carry the metric (for
# example the end-of-training summary row) are dropped first so NaNs do not
# break the line, and the x-axis uses the logged training step when available
# instead of the row number of the log table.
for metric, title in (
    ("loss", 'Training Loss over Time'),
    ("eval_loss", 'Evaluation Loss over Time'),
):
    if metric not in history_df.columns:
        continue
    points = history_df.dropna(subset=[metric])
    x_axis = "step" if "step" in points.columns else points.index
    fig = px.line(points, x=x_axis, y=metric, title=title)
    fig.show()


In [35]:
# Switch to the Drive folder where the next cell writes ./fine-tuned-bart.
%cd /content/drive/MyDrive/Colab Notebooks/NCAI Chatbot/Model Params

/content/drive/MyDrive/Colab Notebooks/NCAI Chatbot/Model Params


In [36]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# folder can be reloaded later with from_pretrained('./fine-tuned-bart').
model.save_pretrained('./fine-tuned-bart')
tokenizer.save_pretrained('./fine-tuned-bart')

('./fine-tuned-bart/tokenizer_config.json',
 './fine-tuned-bart/special_tokens_map.json',
 './fine-tuned-bart/vocab.json',
 './fine-tuned-bart/merges.txt',
 './fine-tuned-bart/added_tokens.json')

In [37]:
# Remove every training-time object from the notebook namespace before the
# saved model is reloaded for inference.
del (
    model,
    tokenizer,
    trainer,
    dataset,
    data_collator,
    training_args,
    df,
    tokenized_dataset,
)

In [38]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Reload the tokenizer and model from the ./fine-tuned-bart folder
# (a path relative to the current working directory).
tokenizer = BartTokenizer.from_pretrained('./fine-tuned-bart')
model = BartForConditionalGeneration.from_pretrained('./fine-tuned-bart')

def generate_response(question, max_length=300, num_beams=5):
    """Generate an answer for ``question`` with the module-level ``model`` and ``tokenizer``.

    Args:
        question: Prompt text to send to the model.
        max_length: Upper bound passed to ``model.generate`` as ``max_length``.
        num_beams: Beam width passed to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence decoded to text with special tokens removed.
    """
    # Truncate over-long prompts instead of letting them exceed the model input size.
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    # Keep the inputs on the same device as the model so generation also works
    # after the model has been moved to a GPU.
    inputs = inputs.to(model.device)

    # Pass the attention mask explicitly rather than leaving generate() to infer it.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Smoke test: query the reloaded model with a short, fragmentary prompt.
question = "anomaly detection?"
print(generate_response(question))

Anomaly detection techniques include unsupervised, supervised, and supervised methods.


In [41]:
# Ask a full question. This exact wording appears among the training rows
# previewed earlier, so it is not a held-out test of the model.
question = "What is the purpose of anomaly detection?"
print(generate_response(question))

The purpose of anomaly detection is to differentiate between normal, abnormal, and abnormal instances.


In [40]:
# Probe the model with a bare phrase rather than a question.
question = "concept of anomaly detection"
print(generate_response(question))

concept of anomaly detection is that anomalies are rare events that deviate significantly from expected patterns.
