In [3]:
from config import *
from dotenv import load_dotenv
import os
import openai

load_dotenv()

openai.api_key = os.environ["OPENAI_API_KEY"]

In [14]:
from pymongo import MongoClient

# initialize MongoDB python client
client = MongoClient(MONGODB_CLUSTER)
MONGODB_COLLECTION = client[DATABASE_NAME][COLLECTION_NAME]
ATLAS_VECTOR_SEARCH_INDEX_NAME = "EmbeddingText"

In [15]:
MONGODB_COLLECTION.delete_many({})

DeleteResult({'n': 7206, 'electionId': ObjectId('7fffffff000000000000004b'), 'opTime': {'ts': Timestamp(1701728314, 6303), 't': 75}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1701728314, 6303), 'signature': {'hash': b' \x98^\x1c\x81N\t\x19\xcf\xb9\x86\xa4\xbd\xd1\x9c6\xaffu\xcb', 'keyId': 7272924574814044162}}, 'operationTime': Timestamp(1701728314, 6303)}, acknowledged=True)

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from langchain.schema import Document


def get_docs(movie_name, if_split=True):
    df = pd.read_csv(f"movie_reviews_link\{movie_name}.csv")
    docs = []
    for i in df.iterrows():
        row = i[1]
        docs.append(
            Document(
                page_content=row["review_title"] + " " + row["review_comment"],
                metadata={
                    "date": row["review_date"],
                    "title": row["review_title"],
                    "rating": row["review_rating"],
                    "helpful": row["review_helpful"],
                    "total_votes": row["review_total_votes"],
                    "if_spoiler": row["reviews_if_spoiler"],
                    "link": row["review_link"],
                    "MovieName": movie_name,
                    "source": "imdb",
                },
            )
        )

    with open("wikipedia_data\Whiplash_2014.txt", "r") as f:
        data = f.read()
        docs.append(Document(page_content=data, metadata={"MovieName": movie_name}))
    if if_split:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=64
        )
        docs = text_splitter.split_documents(docs)
    return docs

In [4]:
review_docs = get_docs("Parasite_2019")

In [5]:
len(review_docs)

4875

In [5]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from config import *
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=review_docs,
    embedding=SentenceTransformerEmbeddings(model_kwargs={"device":"cuda"}),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

NameError: name 'review_docs' is not defined

In [6]:
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.embeddings import SentenceTransformerEmbeddings

vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGODB_CLUSTER,
    DATABASE_NAME + "." + VECTORDB_COLLECTION_NAME,
    SentenceTransformerEmbeddings(model_kwargs={"device": "cuda"}),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
query = "Explain the plot of green lantern to me like I am 5"
results = vector_search.similarity_search_with_score(
    query=query, k=2
)

# Display results
for result in results:
    print(result)

(Document(page_content="Why? How? I can't understand I hate this movie There is no story What is the point How this movie got an Oscar Am I that stupid that I can t understand the art behind this movie So many questions", metadata={'_id': ObjectId('656831f1cf59d6cffdc97637'), 'embedding': [0.03356818109750748, 0.0005147983902134001, 0.010374377481639385, 0.06198616698384285, -0.02816276252269745, 0.021207759156823158, 0.009218711405992508, -0.011886104941368103, -0.039038192480802536, 0.04388611018657684, -0.010551177896559238, 0.04032014310359955, 0.0634767934679985, 0.041400693356990814, -0.0291011743247509, -0.0071119568310678005, -0.025541935116052628, 0.08161246031522751, 0.007244126871228218, 0.01567898504436016, -0.0009605860686860979, -0.027796626091003418, 0.03232026845216751, 0.014966514892876148, -0.015562349930405617, 0.008519561029970646, 0.04080604761838913, 0.013178068213164806, -0.03203069418668747, -0.04295555502176285, -0.038296449929475784, -0.02644212916493416, 0.00

In [16]:
{
    "mappings": {
        "dynamic": true,
        "fields": {
            "MovieName": {"type": "token"},
            "source": {"type": "token"},
            "embedding": {
                "type": "knnVector",
                "dimensions": 768,
                "similarity": "euclidean",
            },
        },
    }
}

In [9]:
qa_retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 100, "post_filter_pipeline": [{"$limit": 5}]},
)

from langchain.prompts import PromptTemplate

prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [21]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

docs = qa({"query": "Explain the plot of Green Lantern"})

print(docs["result"])
print(docs["source_documents"])


I don't know.
[Document(page_content='going to say more about the plot story than that because i don t want to give anything away and because i think you should go into this film without knowing anything This film is simply a must watch No wonder it won the Palme d Or', metadata={'_id': ObjectId('656831c9cf59d6cffdc970e1'), 'embedding': [0.03120255470275879, -0.06406696140766144, -0.0037638491485267878, 0.05419664457440376, -0.0483304038643837, 0.03384523466229439, -0.05345743149518967, 0.0019191739847883582, -0.045476872473955154, 0.026543132960796356, -0.0023370161652565002, 0.016702936962246895, 0.04209548607468605, 0.012120951898396015, 0.013552393764257431, -0.02452046610414982, -0.01540196780115366, 0.027245771139860153, -0.04616304114460945, -0.011028368957340717, 0.00851220078766346, -0.03535556420683861, 0.0037011587992310524, 0.028386034071445465, 0.07711846381425858, -0.008561440743505955, 0.0001306592603214085, 0.078856460750103, -0.04066048935055733, -0.04525498300790787,

In [11]:
docs

{'query': 'What are the superpowers of Green lantern',
 'result': "\nAnswer: I don't know.",
 'source_documents': [Document(page_content="This film didn't impress me. This film didn t impress me But Joker did and i weep at theater and felt catharsis", metadata={'_id': ObjectId('65683223cf59d6cffdc97de0'), 'embedding': [-0.02155100367963314, 0.07928542047739029, -0.008688509464263916, -0.016829518601298332, -0.016131287440657616, 0.002949473215267062, -0.0003036327543668449, 0.0033741346560418606, 0.025467978790402412, 0.017534220591187477, -0.0954195037484169, 0.05364636331796646, 0.07240507751703262, -0.023108292371034622, 0.039595115929841995, -0.015178600326180458, 0.018114330247044563, 0.051633574068546295, -0.004604949615895748, 0.013116642832756042, 0.024175385013222694, -0.02122589386999607, 0.006270867772400379, 0.005858850199729204, 0.010644274763762951, -0.03395020589232445, 0.06927206367254257, -0.004046567250043154, -0.03029928170144558, -0.03154745697975159, 0.006780347786

In [20]:
for doc in docs['source_documents']:
    print(doc.metadata['link'])

https://www.imdb.com/review/rw5448894/?ref_=tt_urv
https://www.imdb.com/review/rw6577541/?ref_=tt_urv
https://www.imdb.com/review/rw5469533/?ref_=tt_urv
https://www.imdb.com/review/rw5182505/?ref_=tt_urv
https://www.imdb.com/review/rw5491796/?ref_=tt_urv


In [None]:
links = ['https://www.imdb.com/review/rw8350979/?ref_=tt_urv', 'https://www.imdb.com/review/rw5808551/?ref_=tt_urv', 'https://www.imdb.com/review/rw5513190/?ref_=tt_urv', 'https://www.imdb.com/review/rw5476063/?ref_=tt_urv', 'https://www.imdb.com/review/rw8674413/?ref_=tt_urv']

