In [4]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import unicodedata
import lxml_html_clean
from bs4 import BeautifulSoup
import nest_asyncio
import pandas as pd

In [5]:
stack_exchange = pd.read_csv("../model/python_qna.csv")

In [18]:
stack_exchange.loc[49996]['Title']

'Tensorflow: Stringify tensor as a whole (without creating tensor of strings)'

In [7]:
stack_exchange.loc[49996]['question']

'<p>I have the following array,</p>\n\n<pre class="lang-py prettyprint-override"><code>a = tf.random.uniform((5,2), 0, 10)\n\n&lt;tf.Tensor: shape=(5, 2), dtype=float32, numpy=\narray([[3.8656425 , 6.7514324 ],\n       [0.49138665, 3.5968459 ],\n       [4.435692  , 4.7223845 ],\n       [7.3588967 , 0.31867146],\n       [1.6837907 , 3.2266355 ]], dtype=float32)&gt;\n</code></pre>\n\n<p>What I want is array of stringified arrays like below this would return a numpy array, but I want to do tensorflow ops to return a tensor:</p>\n\n<pre class="lang-py prettyprint-override"><code>list(map(str, a.numpy()))\n\n[\'[3.8656425 6.7514324]\',\n \'[0.49138665 3.5968459 ]\',\n \'[4.435692  4.7223845]\',\n \'[7.3588967  0.31867146]\',\n \'[1.6837907 3.2266355]\']\n</code></pre>\n\n<p>When I use <code>tf.as_string()</code></p>\n\n<pre class="lang-py prettyprint-override"><code>tf.as_string(a)\n\n&lt;tf.Tensor: shape=(5, 2), dtype=string, numpy=\narray([[b\'3.865643\', b\'6.751432\'],\n       [b\'0.491

In [8]:
stack_exchange.loc[49996]['answer']

'<p>You can use <a href="https://www.tensorflow.org/api_docs/python/tf/strings/format" rel="nofollow noreferrer"><code>tf.strings.format</code></a>:</p>\n\n<pre class="lang-py prettyprint-override"><code>import tensorflow as tf\n\ntf.random.set_seed(0)\na = tf.random.uniform((5,2), 0, 10)\nb = tf.map_fn(lambda r: tf.strings.format(\'{}\', r, summarize=-1), a, tf.string)\nprint(b)\n# tf.Tensor(\n# [b\'[2.91975141 2.06566453]\' b\'[5.35390759 5.61257458]\'\n#  b\'[4.16674519 8.0782795]\' b\'[4.93225098 9.98129272]\'\n#  b\'[6.96735144 1.25373602]\'], shape=(5,), dtype=string)\n</code></pre>\n'

In [9]:
soup = BeautifulSoup(stack_exchange.loc[49996]['question'], 'html.parser')

In [15]:
' '.join(str(each_find) for each_find in soup.findAll(['p', 'code', 'b']))

"<p>I have the following array,</p> <code>a = tf.random.uniform((5,2), 0, 10)\n\n&lt;tf.Tensor: shape=(5, 2), dtype=float32, numpy=\narray([[3.8656425 , 6.7514324 ],\n       [0.49138665, 3.5968459 ],\n       [4.435692  , 4.7223845 ],\n       [7.3588967 , 0.31867146],\n       [1.6837907 , 3.2266355 ]], dtype=float32)&gt;\n</code> <p>What I want is array of stringified arrays like below this would return a numpy array, but I want to do tensorflow ops to return a tensor:</p> <code>list(map(str, a.numpy()))\n\n['[3.8656425 6.7514324]',\n '[0.49138665 3.5968459 ]',\n '[4.435692  4.7223845]',\n '[7.3588967  0.31867146]',\n '[1.6837907 3.2266355]']\n</code> <p>When I use <code>tf.as_string()</code></p> <code>tf.as_string()</code> <code>tf.as_string(a)\n\n&lt;tf.Tensor: shape=(5, 2), dtype=string, numpy=\narray([[b'3.865643', b'6.751432'],\n       [b'0.491387', b'3.596846'],\n       [b'4.435692', b'4.722384'],\n       [b'7.358897', b'0.318671'],\n       [b'1.683791', b'3.226635']], dtype=objec

In [39]:
' '.join(soup.stripped_strings)

"I have the following array, a = tf.random.uniform((5,2), 0, 10)\n\n<tf.Tensor: shape=(5, 2), dtype=float32, numpy=\narray([[3.8656425 , 6.7514324 ],\n       [0.49138665, 3.5968459 ],\n       [4.435692  , 4.7223845 ],\n       [7.3588967 , 0.31867146],\n       [1.6837907 , 3.2266355 ]], dtype=float32)> What I want is array of stringified arrays like below this would return a numpy array, but I want to do tensorflow ops to return a tensor: list(map(str, a.numpy()))\n\n['[3.8656425 6.7514324]',\n '[0.49138665 3.5968459 ]',\n '[4.435692  4.7223845]',\n '[7.3588967  0.31867146]',\n '[1.6837907 3.2266355]'] When I use tf.as_string() tf.as_string(a)\n\n<tf.Tensor: shape=(5, 2), dtype=string, numpy=\narray([[b'3.865643', b'6.751432'],\n       [b'0.491387', b'3.596846'],\n       [b'4.435692', b'4.722384'],\n       [b'7.358897', b'0.318671'],\n       [b'1.683791', b'3.226635']], dtype=object)> I also tried using tf.map_fn(tf.as_string, a, dtype=tf.string)\n\n# Same output as above tf.as_string()

In [138]:
soup(['p', 'pre'])

<p>I have the following array,</p>
<hh class="lang-py prettyprint-override"><code>a = tf.random.uniform((5,2), 0, 10)

&lt;tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[3.8656425 , 6.7514324 ],
       [0.49138665, 3.5968459 ],
       [4.435692  , 4.7223845 ],
       [7.3588967 , 0.31867146],
       [1.6837907 , 3.2266355 ]], dtype=float32)&gt;
</code></hh>
<p>What I want is array of stringified arrays like below this would return a numpy array, but I want to do tensorflow ops to return a tensor:</p>
<hhh class="lang-py prettyprint-override"><code>list(map(str, a.numpy()))

['[3.8656425 6.7514324]',
 '[0.49138665 3.5968459 ]',
 '[4.435692  4.7223845]',
 '[7.3588967  0.31867146]',
 '[1.6837907 3.2266355]']
</code></hhh>
<p>When I use <code>tf.as_string()</code></p>
<hhh class="lang-py prettyprint-override"><code>tf.as_string(a)

&lt;tf.Tensor: shape=(5, 2), dtype=string, numpy=
array([[b'3.865643', b'6.751432'],
       [b'0.491387', b'3.596846'],
       [b'4.435692', b'4.722384'

In [113]:
soup

<p>I have the following array,</p>
<pre class="lang-py prettyprint-override"><code>a = tf.random.uniform((5,2), 0, 10)

&lt;tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[3.8656425 , 6.7514324 ],
       [0.49138665, 3.5968459 ],
       [4.435692  , 4.7223845 ],
       [7.3588967 , 0.31867146],
       [1.6837907 , 3.2266355 ]], dtype=float32)&gt;
</code></pre>
<p>What I want is array of stringified arrays like below this would return a numpy array, but I want to do tensorflow ops to return a tensor:</p>
<pre class="lang-py prettyprint-override"><code>list(map(str, a.numpy()))

['[3.8656425 6.7514324]',
 '[0.49138665 3.5968459 ]',
 '[4.435692  4.7223845]',
 '[7.3588967  0.31867146]',
 '[1.6837907 3.2266355]']
</code></pre>
<p>When I use <code>tf.as_string()</code></p>
<pre class="lang-py prettyprint-override"><code>tf.as_string(a)

&lt;tf.Tensor: shape=(5, 2), dtype=string, numpy=
array([[b'3.865643', b'6.751432'],
       [b'0.491387', b'3.596846'],
       [b'4.435692', b'4.72238

### Miscellaneous

In [89]:
import re
nodes_to_censor = soup.findAll(text=re.compile('.*.*'))
print(nodes_to_censor)
for node in nodes_to_censor:
    node.replaceWith('###')

['a = tf.random.uniform((5,2), 0, 10)\n\n<tf.Tensor: shape=(5, 2), dtype=float32, numpy=\narray([[3.8656425 , 6.7514324 ],\n       [0.49138665, 3.5968459 ],\n       [4.435692  , 4.7223845 ],\n       [7.3588967 , 0.31867146],\n       [1.6837907 , 3.2266355 ]], dtype=float32)>\n', 'What I want is array of stringified arrays like below this would return a numpy array, but I want to do tensorflow ops to return a tensor:', "tf.as_string(a)\n\n<tf.Tensor: shape=(5, 2), dtype=string, numpy=\narray([[b'3.865643', b'6.751432'],\n       [b'0.491387', b'3.596846'],\n       [b'4.435692', b'4.722384'],\n       [b'7.358897', b'0.318671'],\n       [b'1.683791', b'3.226635']], dtype=object)>\n"]


  nodes_to_censor = soup.findAll(text=re.compile('.*array.*'))


In [76]:
nodes_to_censor

[]

### Articles to documents

In [21]:
nest_asyncio.apply()

In [22]:
articles = ["https://docs.python.org/3.13/whatsnew/3.13.html"]

In [23]:
# Scrapes the blogs above
loader = AsyncChromiumLoader(articles)
docs = loader.load()

In [25]:
# Converts HTML to plain text 
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

In [31]:
# Chunk text
text_splitter = CharacterTextSplitter(chunk_size=500, 
                                      chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

Created a chunk of size 1716, which is longer than the specified 500
Created a chunk of size 540, which is longer than the specified 500
Created a chunk of size 708, which is longer than the specified 500
Created a chunk of size 662, which is longer than the specified 500
Created a chunk of size 515, which is longer than the specified 500
Created a chunk of size 1716, which is longer than the specified 500


In [12]:
# Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents, 
                          HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))


In [10]:
db.search("Python", search_type='mmr')

 Document(page_content='### tkinter¶', metadata={'source': 'https://docs.python.org/3.13/whatsnew/3.13.html'}),
 Document(page_content='* index\n  * modules |\n  * next |\n  * previous |\n  *   * Python »\n  * EnglishSpanishFrenchJapaneseKoreanBrazilian PortugueseTurkishSimplified ChineseTraditional Chinese', metadata={'source': 'https://docs.python.org/3.13/whatsnew/3.13.html'}),
 Document(page_content='* Python 3.13 and later have two years of full support, followed by three years of security fixes.', metadata={'source': 'https://docs.python.org/3.13/whatsnew/3.13.html'})]