In [30]:
!pip install langchain
!pip install langchain_community
!pip install --upgrade pip
!pip install pypdfium2




In [31]:
from langchain.document_loaders import PyPDFium2Loader
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

In [32]:
# List the working directory to confirm the PDF sits next to the notebook.
!ls

Exploratory.ipynb  aws-overview.pdf


In [33]:
# Turn the relative file name into an absolute path (resolved against the
# current working directory) and construct the PDF loader for it.
aws_overview_path = os.path.abspath("aws-overview.pdf")
pdf_loader = PyPDFium2Loader(aws_overview_path)

In [34]:
# Read the PDF into a list of document objects. Each one exposes
# `.metadata['page']` and `.page_content`, which the next cell tabulates
# (presumably one document per PDF page — verify against the loader docs).
documents = pdf_loader.load()



In [35]:
import pandas as pd

# One record per loaded document: its page number and its extracted text.
data = [
    {'page': doc.metadata['page'], 'content': doc.page_content}
    for doc in documents
]

# Tabulate the records so later cells can index pages by row position.
df = pd.DataFrame(data)

# Plain-text preview of the table.
print(df)

    page                                            content
0      0  Overview of Amazon\r\nWeb Services\r\nAWS Whit...
1      1  Overview of Amazon Web Services AWS Whitepaper...
2      2  Overview of Amazon Web Services AWS Whitepaper...
3      3  Overview of Amazon Web Services AWS Whitepaper...
4      4  Overview of Amazon Web Services AWS Whitepaper...
..   ...                                                ...
80    80  Overview of Amazon Web Services AWS Whitepaper...
81    81  Overview of Amazon Web Services AWS Whitepaper...
82    82  Overview of Amazon Web Services AWS Whitepaper...
83    83  Overview of Amazon Web Services AWS Whitepaper...
84    84  Overview of Amazon Web Services AWS Whitepaper...

[85 rows x 2 columns]


In [36]:
# Bag-of-words baseline: fit the vocabulary on all page texts and encode each
# page as one row of term counts.
count_vectorizer = CountVectorizer()
count_vect = count_vectorizer.fit_transform(df['content'])
# Dense copy, made only so the matrix is displayed as the cell output.
count_vect.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [37]:
# Display the page table (page number and extracted text) as the cell output.
df

Unnamed: 0,page,content
0,0,Overview of Amazon\r\nWeb Services\r\nAWS Whit...
1,1,Overview of Amazon Web Services AWS Whitepaper...
2,2,Overview of Amazon Web Services AWS Whitepaper...
3,3,Overview of Amazon Web Services AWS Whitepaper...
4,4,Overview of Amazon Web Services AWS Whitepaper...
...,...,...
80,80,Overview of Amazon Web Services AWS Whitepaper...
81,81,Overview of Amazon Web Services AWS Whitepaper...
82,82,Overview of Amazon Web Services AWS Whitepaper...
83,83,Overview of Amazon Web Services AWS Whitepaper...


In [38]:
# Free-text question that the pages are ranked against.
query = 'Tell me about Lambda Function'

In [39]:
# Encode the query with the vectorizer already fitted on the pages
# (`transform`, not `fit_transform`), so it shares the pages' vocabulary.
query_vect = count_vectorizer.transform([query])
# Dense copy for display only.
query_vect.toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [40]:
# Score every page against the query: the dot product of each page's count
# vector with the query's count vector, giving one score per page as a column.
# The product is taken on the sparse matrices and only the resulting score
# column is densified, instead of first expanding the whole
# page-by-vocabulary matrix with `.toarray()`.
query_array = (count_vect @ query_vect.T).toarray()
query_array

array([[0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [3],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [2],
       [1],
       [0],
       [0],
       [1],
       [2],
       [0],
       [2],
       [5],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [4],
       [2],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [4],
       [0],
       [0],
       [2],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [2],
       [1],
       [2],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
    

In [41]:

# Row position of the highest-scoring page, then print that page's text.
best_page_position = np.argmax(query_array)
print(df['content'].iloc[best_page_position])

Overview of Amazon Web Services AWS Whitepaper
AWS Lambda
define networking and IAM policies, and launch the application. EC2 launch type allows you to have
server-level, more granular control over the infrastructure that runs your container applications. With
EC2 launch type, you can use Amazon ECS to manage a cluster of servers and schedule placement
of containers on the servers. Amazon ECS keeps track of all the CPU, memory and other resources in
your cluster, and also finds the best server for a container to run on based on your specified resource
requirements. You are responsible for provisioning, patching, and scaling clusters of servers. You can
decide which type of server to use, which applications and how many containers to run in a cluster
to optimize utilization, and when you should add or remove servers from a cluster. EC2 launch type
gives you more control of your server clusters and provides a broader range of customization options,
which might be required to support some

In [42]:
# Rank pages by count score (ascending), keep the last five and reverse them
# so the best-scoring page comes first.
pages_by_score = np.argsort(query_array.ravel())
top_5_indices = np.flip(pages_by_score[-5:])
print(top_5_indices)

# Text of the five best-scoring pages, in ranking order.
top_5_contents = df['content'].iloc[top_5_indices]

# Print each page followed by a separator line.
for page_text in top_5_contents:
    print(
        page_text,
        '-----------------------------------------------------------------------------------',
        sep='\n',
    )

[29 41 55 10 42]
Overview of Amazon Web Services AWS Whitepaper
AWS Lambda
define networking and IAM policies, and launch the application. EC2 launch type allows you to have
server-level, more granular control over the infrastructure that runs your container applications. With
EC2 launch type, you can use Amazon ECS to manage a cluster of servers and schedule placement
of containers on the servers. Amazon ECS keeps track of all the CPU, memory and other resources in
your cluster, and also finds the best server for a container to run on based on your specified resource
requirements. You are responsible for provisioning, patching, and scaling clusters of servers. You can
decide which type of server to use, which applications and how many containers to run in a cluster
to optimize utilization, and when you should add or remove servers from a cluster. EC2 launch type
gives you more control of your server clusters and provides a broader range of customization options,
which might be require

In [43]:
# Print the extracted text of the row labelled 1.
print(df.loc[1, 'content'])

Overview of Amazon Web Services AWS Whitepaper
Overview of Amazon Web Services: AWS Whitepaper
Copyright © Amazon Web Services, Inc. and/or its affiliates. All rights reserved.
Amazon's trademarks and trade dress may not be used in connection with any product or service that is not
Amazon's, in any manner that is likely to cause confusion among customers, or in any manner that disparages or
discredits Amazon. All other trademarks not owned by Amazon are the property of their respective owners, who may
or may not be affiliated with, connected to, or sponsored by Amazon.



In [44]:
# Second approach: TF-IDF weighting instead of raw counts. Fit on all page
# texts and encode each page as one row of weights.
tfidf = TfidfVectorizer()
tfidf_vect = tfidf.fit_transform(df['content'])
# Dense copy, made only so the matrix is displayed as the cell output.
tfidf_vect.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.22208629, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [45]:
# Same question as in the count-based section, restated so this section can
# be run on its own.
query = 'Tell me about Lambda Function'
# Encode the query with the TF-IDF vectorizer fitted on the pages.
tfidf_query_vect = tfidf.transform([query])
# Dense copy for display only.
tfidf_query_vect.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [46]:
# Cosine similarity between every page's TF-IDF vector and the query's
# TF-IDF vector; displayed as one score per page.
cosine_similarity(tfidf_vect, tfidf_query_vect )


array([[0.        ],
       [0.        ],
       [0.        ],
       [0.01910293],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.04134141],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.0140011 ],
       [0.        ],
       [0.01148985],
       [0.        ],
       [0.03849663],
       [0.01482865],
       [0.        ],
       [0.        ],
       [0.01210639],
       [0.03008375],
       [0.        ],
       [0.02283762],
       [0.0678614 ],
       [0.01236697],
       [0.01145881],
       [0.        ],
       [0.        ],
       [0.01009645],
       [0.        ],
       [0.01245164],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.05214581],
       [0.01944737],
       [0.0103782 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.009

In [47]:
# Score every page against the query as the dot product of their TF-IDF
# vectors, giving one score per page as a column.
# The product is taken on the sparse matrices and only the resulting score
# column is densified, instead of first expanding the whole
# page-by-vocabulary matrix with `.toarray()`.
# NOTE(review): the displayed scores match the cosine_similarity output from
# the previous cell, which suggests the TF-IDF rows are already unit-length —
# confirm against the vectorizer's `norm` setting.
tfidf_query_array = (tfidf_vect @ tfidf_query_vect.T).toarray()
tfidf_query_array

array([[0.        ],
       [0.        ],
       [0.        ],
       [0.01910293],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.04134141],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.0140011 ],
       [0.        ],
       [0.01148985],
       [0.        ],
       [0.03849663],
       [0.01482865],
       [0.        ],
       [0.        ],
       [0.01210639],
       [0.03008375],
       [0.        ],
       [0.02283762],
       [0.0678614 ],
       [0.01236697],
       [0.01145881],
       [0.        ],
       [0.        ],
       [0.01009645],
       [0.        ],
       [0.01245164],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.05214581],
       [0.01944737],
       [0.0103782 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.009

In [48]:
# Rank pages by TF-IDF score (ascending), keep the last five and reverse them
# so the best-scoring page comes first.
tfidf_pages_by_score = np.argsort(tfidf_query_array.ravel())
top_5_indices = np.flip(tfidf_pages_by_score[-5:])
top_5_indices

array([29, 55, 41, 68, 10])

In [49]:

# Text of the five best TF-IDF matches, in ranking order.
top_5_contents = df['content'].iloc[top_5_indices]

# Print each page followed by a separator line.
for page_text in top_5_contents:
    print(
        page_text,
        '-----------------------------------------------------------------------------------',
        sep='\n',
    )

Overview of Amazon Web Services AWS Whitepaper
AWS Lambda
define networking and IAM policies, and launch the application. EC2 launch type allows you to have
server-level, more granular control over the infrastructure that runs your container applications. With
EC2 launch type, you can use Amazon ECS to manage a cluster of servers and schedule placement
of containers on the servers. Amazon ECS keeps track of all the CPU, memory and other resources in
your cluster, and also finds the best server for a container to run on based on your specified resource
requirements. You are responsible for provisioning, patching, and scaling clusters of servers. You can
decide which type of server to use, which applications and how many containers to run in a cluster
to optimize utilization, and when you should add or remove servers from a cluster. EC2 launch type
gives you more control of your server clusters and provides a broader range of customization options,
which might be required to support some

In [50]:
# Five largest TF-IDF weights in the query vector, largest first; shows how
# many query terms actually received a non-zero weight.
sorted_query_weights = np.sort(tfidf_query_vect.toarray().ravel())
np.flip(sorted_query_weights[-5:])

array([0.69665645, 0.53549293, 0.47740666, 0.        , 0.        ])

In [51]:
!pip install transformers



In [52]:
import torch
from transformers import BertTokenizer, BertModel

In [53]:

# Load the pretrained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



# Load the pretrained encoder that matches the tokenizer.
model = BertModel.from_pretrained('bert-base-uncased')
# One string per row of the page table.
input_text = df['content'].tolist() 
# Tokenize all pages as a single batch of PyTorch tensors, with padding and
# truncation enabled.
# NOTE(review): truncation presumably drops the tail of any page longer than
# the model's maximum input length — confirm that is acceptable for retrieval.
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)



In [54]:
# Run the encoder once over the whole tokenized batch with gradient tracking
# disabled, since this is inference only.
# NOTE(review): every page goes through in a single forward pass; this may be
# memory-heavy for larger documents — consider batching.
with torch.no_grad():
    outputs = model(**inputs)

# The last hidden states
# (presumably shaped (pages, tokens, hidden_size) — TODO confirm from the printed shape)
document_last_hidden_states = outputs.last_hidden_state

# Print the shape of the embeddings
print("Shape of the embeddings:", document_last_hidden_states.shape)