In [1]:
pip install PyPDF2 gensim nltk


Note: you may need to restart the kernel to use updated packages.


In [2]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

document_text = extract_text_from_pdf('ML-book.pdf')



In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Download the punkt tokenizer models
nltk.download('punkt')

# Split the text into sentences
sentences = sent_tokenize(document_text)

# Tokenize and preprocess each sentence
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence.lower())
    # Here you can add more preprocessing steps if required, like removing stopwords or non-alphabetic words
    return tokens

tagged_data = [TaggedDocument(words=preprocess_sentence(sentence), tags=[str(i)]) for i, sentence in enumerate(sentences)]

# Train a Doc2Vec model
model = Doc2Vec(vector_size=50, window=5, min_count=5, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Save the model
model.save("jesc102_model.d2v")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikitayeole/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Load the model
model = Doc2Vec.load("jesc102_model.d2v")

# Infer a vector for a new sentence
new_sentence = "The earth revolves around the sun."
vector = model.infer_vector(preprocess_sentence(new_sentence))
print(vector)


[-0.21775185  0.04252978 -0.05865375  0.61915874 -0.24215174 -0.20079388
  0.07549016  0.4854519  -0.3241158   0.13255525 -0.574695   -0.17105877
 -0.40647918 -0.12864302 -0.00905236  0.26526415 -0.56191874 -0.627322
 -0.99024874 -0.6290462   0.26334715  0.2387938  -0.0965075   0.13367522
 -0.0012881   0.11666347  0.29086095  0.21948627  0.04094669  0.37087473
 -0.05253538  0.15453085  0.01194209 -0.05884088 -0.0992909   0.36020923
  0.05413212  0.19138251 -0.42592922  0.14088155  0.2218979  -0.09360335
  0.0802055  -0.05129237 -0.19307162  0.42378357 -0.62824994 -0.11919903
  0.44335163 -0.14665332]


In [5]:
tagged_data[1]

TaggedDocument(words=['i', 'also', 'welcome', 'my', 'son', ',', 'j.', 'david', 'creswell', ',', 'a', 'noted', 'psychologist', 'and', 'researcher', 'at', 'carnegie', 'mellon', 'university', ',', 'as', 'my', 'coauthor', '.'], tags=['1'])

In [6]:
model.docvecs[1]

  model.docvecs[1]


array([ 0.28698882, -0.40878928, -0.10916258,  1.4225255 ,  0.13852368,
       -1.5084041 ,  0.3904426 ,  0.59439737,  0.00482559,  1.6624931 ,
        0.20189711,  0.5350444 ,  0.70809275,  0.21089932, -0.02727226,
       -0.23636472,  0.13528219, -1.023095  , -0.13400681, -0.7091109 ,
        0.7349847 ,  0.4534731 , -0.1985152 , -0.80666906,  0.32154286,
        0.05558288, -0.44731942,  0.02076628, -0.1287201 ,  0.57298046,
       -0.30395526,  1.0621588 ,  0.6587195 , -0.874395  , -1.470047  ,
        0.84920543,  0.3569406 , -0.41665828, -0.29100752, -0.62096095,
        0.06721474,  0.9669889 , -0.12607981, -0.27324927,  0.66567045,
       -0.00180536, -0.55153114,  1.2710205 ,  1.2007949 ,  0.13485466],
      dtype=float32)

In [7]:
for i in range(len(sentences)):
    vector = model.docvecs[i]
    print(sentences[i])
    print(vector)
print(len(sentences))

  vector = model.docvecs[i]


Research	Design
Fifth	EditionI	dedicate	this	book	to	all	of	my	mentees	and	former	students	over	the	years	who	have	engaged
in	this	fascinating	process	of	research	and	who	have	welcomed	my	suggestions	for	improving
their	scholarly	works.
[-0.41029438 -0.46655113 -0.75866413  1.1760103  -1.565879   -0.5603224
  1.1554966   1.314038   -0.92353314  0.5928732  -0.91965413 -0.82603216
 -1.1775365  -0.7450457  -1.1504111  -0.35120788  0.7934237  -0.62151396
 -0.6121477  -1.2266762   0.32389444  1.131825    0.98831886  0.24252467
 -0.6063424  -0.9391208  -0.83591783 -0.42043477 -0.51217496  0.72805285
  1.2506047   0.71281505 -0.44001484  0.25819612  0.35581514  1.0671871
 -0.04734048 -0.67963684  0.98860526 -0.6741486   0.20834921  0.25562838
  0.42065984 -1.0504658   1.6493349  -0.7130209  -0.29052597 -0.8622337
  0.9327328  -0.644649  ]
I	also	welcome	my	son,	J.	David	Creswell,	a	noted	psychologist	and
researcher	at	Carnegie	Mellon	University,	as	my	coauthor.
[ 0.28698882 -0.40878928 -0.109

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
print(len(sentences))

5732


In [9]:
!pip install supabase-py



In [10]:
!pip install httpx




In [None]:
import httpx
import json

# Supabase setup
SUPABASE_URL = "https://mufqacshyjgmzivznuwo.supabase.co/"
SUPABASE_SERVICE_API_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im11ZnFhY3NoeWpnbXppdnpudXdvIiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTg0MzM2MTQsImV4cCI6MjAxNDAwOTYxNH0.iYqEsEQJWcQUu7KjYWBzrRKdZn23Vp_xlhkHomc85Tc"

headers = {
    "apikey": SUPABASE_SERVICE_API_KEY,
    "Content-Type": "application/json"
}

def insert_vector_to_supabase(sentence, vector_data):
    endpoint = f"{SUPABASE_URL}/rest/v1/test_table"
    
    # Convert your vector to a JSON string.
    data_to_insert = {
        "sentence": sentence, 
        "vector_data": json.dumps(vector_data)  # assuming the vector_data is a numpy array
    }
    
    response = httpx.post(endpoint, headers=headers, json=data_to_insert)
    return response.json()

# Storing vectors
#for i in range(len(sentences)):
    vector = model.docvecs[1].tolist()  # Convert to list assuming it's a numpy array
    sentence = sentences[1]
    
    response = insert_vector_to_supabase(sentence, vector)
    print(f"Stored vector for '{sentence}': {response}")


In [None]:
def fetch_vectors_from_supabase():
    endpoint = f"https://mufqacshyjgmzivznuwo.supabase.co/rest/v1/test_table"
    response = httpx.get(endpoint, headers=headers)
    
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Fetching and printing stored data
stored_data = fetch_vectors_from_supabase()

if stored_data:
    for item in stored_data:
        print(f"Sentence: {item['sentence']}")
        print(f"Vector: {json.loads(item['vector_data'])}")
else:
    print("Failed to fetch data from Supabase.")
