In [1]:
import math
import os, sys
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFacePipeline, HuggingFaceHub, PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
os.environ["HUGGINGFACEUG_API_TOKEN"] = "Your Token"

In [3]:

article_ulrs = ["https://www.ibm.com/topics/machine-learning",
                "https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained",
                "https://www.ibm.com/topics/neural-networks",
                "https://www.ibm.com/topics/convolutional-neural-networks", 
                "https://machinelearningmastery.com/what-are-large-language-models/",
                "https://www.techtarget.com/whatis/definition/support-vector-machine-SVM",
                "https://www.ibm.com/topics/recurrent-neural-networks", 
                "https://blogs.nvidia.com/blog/2022/03/25/what-is-a-transformer-model/",
                "https://machinelearningmastery.com/a-brief-introduction-to-bert/",
                "https://kaitchup.substack.com/p/a-gentle-introduction-to-gpt-models-e02b093a495b"
               ]

loader = WebBaseLoader(article_ulrs)
data   = loader.load()

In [4]:
llm = HuggingFacePipeline.from_model_id(model_id="facebook/opt-125m",
                                        task="text-generation", 
                                        model_kwargs={"temperature": 0, "max_length": 2048})

index = VectorstoreIndexCreator(embedding = HuggingFaceEmbeddings()).from_loaders([loader])
index.query("What is answering system?", llm=llm)



Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


'\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:\nAnswer:

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 50)
all_splits = text_splitter.split_documents(data)

In [6]:
def batch(docs, b_size=100):
    length = len(docs)
    for num in range(0, length, b_size):
        yield docs[num:min(num+b_size, length)]

ids_prev = 0
vectorstore = ''
full_ids = [str(i) for i in range(0, len(all_splits) + 1)]

for b in batch(all_splits, 160):
    ids = [str(i) for i in range(ids_prev, (len(b) + ids_prev), 1)]
    ids_prev += len(ids)
    vectorstore_initial = Chroma.from_documents(documents=b, embedding=HuggingFaceEmbeddings(),ids=ids)
    if not vectorstore:
        vectorstore = vectorstore_initial
    else:
        vec = vectorstore_initial._collection.get(ids=ids)
        vectorstore._collection.add(documents=vec['documents'], embeddings=vec['embeddings'], 
                                    metadatas=vec['metadatas'], ids=vec['ids'])

Insert of existing embedding ID: 160
Insert of existing embedding ID: 161
Insert of existing embedding ID: 162
Insert of existing embedding ID: 163
Insert of existing embedding ID: 164
Insert of existing embedding ID: 165
Insert of existing embedding ID: 166
Insert of existing embedding ID: 167
Insert of existing embedding ID: 168
Insert of existing embedding ID: 169
Insert of existing embedding ID: 170
Insert of existing embedding ID: 171
Insert of existing embedding ID: 172
Insert of existing embedding ID: 173
Insert of existing embedding ID: 174
Insert of existing embedding ID: 175
Insert of existing embedding ID: 176
Insert of existing embedding ID: 177
Insert of existing embedding ID: 178
Insert of existing embedding ID: 179
Insert of existing embedding ID: 180
Insert of existing embedding ID: 181
Insert of existing embedding ID: 182
Insert of existing embedding ID: 183
Insert of existing embedding ID: 184
Insert of existing embedding ID: 185
Insert of existing embedding ID: 186
I

In [7]:
to_cluster = {}

for id in full_ids:
    temp_vec = vectorstore._collection.get(ids=[full_ids[int(id)]],include=['embeddings'])
    to_cluster[id] = temp_vec['embeddings']

print(to_cluster.keys())


dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', 

In [8]:
def kmeans(data, k):
    np.random.seed(12345)
    rand_init = np.random.choice(len(data), k, replace=False)
    centroids = {}
    dists = {}
    same = True
    for i in range(k):
        centroids[i] = data[rand_init[i]]
    while same == True:
        for i in range(len(data)):
            cntr_arr = []
            for key in centroids:
                cntr_arr.append(math.dist(data[i],centroids[key]))
            dists[i] = cntr_arr
       
        groupID = {}
        for key, value in enumerate(dists):
            groups = {}
            grouping = dists[key].index(min(dists[key]))
            groups[key] = [data[key]]
            if groupID.get(grouping) is not None:
                groupID[grouping].append(groups)
            else:
                groupID[grouping] = [groups]
        oldcentr = centroids
        for i in range(k):
            centroids[i] = np.sum(np.vstack([list(groupID[i][j].values())[0][0] for j in range(len(groupID[i]))]), 
                                  axis=0)/len(groupID[i])
        if centroids == oldcentr:
            same = False
    return groups, groupID, centroids

In [9]:
values = to_cluster.values()
arrayData = list(values)

newray = []
for i in range(len(arrayData)-1):
    newray.append(arrayData[i][0])
array = np.array(list(newray))
k=10
np.random.seed(1234)
groups, groupID, centroids = kmeans(array, k)

In [11]:
rep_centroid = {}
for i in range(len(groupID)):
    for j in groupID[i]:
        emb_dist = {}
        emb_dist[list(j.items())[0][0]] = [math.dist(list(j.values())[0][0],centroids[i])]
        if rep_centroid.get(i) is not None:
            rep_centroid[i].append(emb_dist)
        else:
            rep_centroid[i] = [emb_dist]

In [12]:
mins = {}
for key, value in enumerate(rep_centroid):
    current_min = 999
    current_id = 0
    min_dict = {}
    for keyi, vali in enumerate(rep_centroid[key]):
        new_val = list(rep_centroid[key][keyi].values())[0][0]
        new_id = list(rep_centroid[key][keyi].items())[0][0]
        if new_val < current_min:
            current_min = new_val
            current_id = new_id
    min_dict[current_id] = current_min
    mins[key] = min_dict

print(mins)

{0: {133: 0.45277704980530126}, 1: {40: 0.5141776415263648}, 2: {78: 0.5933098983677147}, 3: {160: 0.5331631567225045}, 4: {51: 0.4995249203541028}, 5: {169: 0.6587031175758578}, 6: {73: 0.5842050937326108}, 7: {186: 0.0}, 8: {62: 0.5503525579043758}, 9: {89: 0.6165880000536104}}


In [13]:
cent_docs = []
for i in range(k):
    cent_docs.append(vectorstore._collection.get(ids=str(list(mins[i].items())[0][0]),
                                                 include=['documents'])['documents'][0])
print(cent_docs)

['A recurrent neural network (RNN) is a type of artificial neural network which uses sequential data or time series data. These deep learning algorithms are commonly used for ordinal or temporal problems, such as language translation, natural language processing (nlp), speech recognition, and image captioning; they are incorporated into popular applications such as Siri, voice search, and Google Translate. Like feedforward and convolutional neural networks (CNNs), recurrent neural networks utilize training data to learn. They are distinguished by their “memory” as they take information from prior inputs to influence the current input and output. While traditional deep neural networks assume that inputs and outputs are independent of each other, the output of recurrent neural networks depend on the prior elements within the sequence. While future events would also be helpful in determining the output of a given sequence, unidirectional recurrent neural networks cannot account for these'

In [14]:
tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=10, num_beams=10):
    inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
    outputs = model.generate(**inputs, num_beams=num_beams, num_return_sequences=num_return_sequences)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [15]:
for document in range(k):
    sentence = get_paraphrased_sentences(model, tokenizer, cent_docs[document], num_beams=10, num_return_sequences=10)
    print("Document" , document, "is paraphrased as: ", sentence)



Document 0 is paraphrased as:  ['A recurrent neural network (RNN) is a type of artificial neural network that', 'A recurrent neural network (RNN) is a type of artificial neural network which', 'Recurrent neural networks (RNNs) are a type of artificial neural network that uses', 'The recurrent neural network (RNN) is a type of artificial neural network that', 'A recurrent neural network (RNN) is a type of artificial neural network using', 'Recurrent neural networks (RNNs) are a type of artificial neural network which uses', 'Unlike feedforward and convolutional neural networks (CNNs), recurrent neural', 'Like feedforward and convolutional neural networks (CNNs), recurrent neural networks', 'A recurrent neural network (RNN) is a kind of artificial neural network that', 'A recurrent neural network (RNN) is a type of artificial neural network ']
Document 1 is paraphrased as:  ['With the growing ubiquity of machine learning, everyone in business is likely to encounter it', 'A 2020 Deloitte 

Document 9 is paraphrased as:  ['What are convolutional neural networks? | IBM Learn how convolutional neural networks use three', 'What are convolutional neural networks? | IBM What are convolutional neural networks Learn how', 'What are convolutional neural networks? | IBM What are convolutional neural networks? Learn', 'What are Convolutional Neural Networks? | IBM Learn how convolutional neural networks', 'Learn how convolutional neural networks use three-dimensional data for image classification and object recognition tasks', 'What are Convolutional Neural Networks? | IBM What are convolutional neural networks', 'What are Convolutional Neural Networks? | IBM What are Convolutional Neural', 'What are Convolutional Neural Networks? | IBM Learn how Convolutional Neural', '| IBM What are convolutional neural networks? Learn how convolutional neural networks use three', 'Learn how convolutional neural networks use three-dimensional data to perform image classification and object recogn