In [1]:
import os
import getpass

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

embeddings = OpenAIEmbeddings()



In [2]:
import os
from langchain.document_loaders import TextLoader

root_dir = "./the-algorithm"
docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        try:
            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e:
            pass

In [3]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts=text_splitter.split_documents(docs)

Created a chunk of size 2549, which is longer than the specified 1000
Created a chunk of size 2095, which is longer than the specified 1000
Created a chunk of size 1983, which is longer than the specified 1000
Created a chunk of size 1020, which is longer than the specified 1000
Created a chunk of size 1540, which is longer than the specified 1000
Created a chunk of size 1245, which is longer than the specified 1000
Created a chunk of size 1257, which is longer than the specified 1000
Created a chunk of size 2273, which is longer than the specified 1000
Created a chunk of size 1411, which is longer than the specified 1000
Created a chunk of size 1263, which is longer than the specified 1000
Created a chunk of size 1672, which is longer than the specified 1000
Created a chunk of size 1794, which is longer than the specified 1000
Created a chunk of size 1034, which is longer than the specified 1000
Created a chunk of size 1201, which is longer than the specified 1000
Created a chunk of s

In [4]:
username = "veaceslavcalestru"
db = DeepLake(dataset_path=f"hub://{username}/twitter-algorithm", embedding_function=embeddings)
db.add_documents(texts)

Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


 

Batch upload: 31314 samples are being uploaded in 32 batches of batch size 1000


Evaluating ingest: 100%|██████████| 32/32 [08:44<00:00
 

Dataset(path='hub://veaceslavcalestru/twitter-algorithm', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype        shape       dtype  compression
  -------    -------      -------     -------  ------- 
 embedding  embedding  (31314, 1536)  float32   None   
    id        text      (31314, 1)      str     None   
 metadata     json      (31314, 1)      str     None   
   text       text      (31314, 1)      str     None   


['617071fa-6ff3-11ee-b60b-cc4740c98b6b',
 '617071fb-6ff3-11ee-98a7-cc4740c98b6b',
 '617071fc-6ff3-11ee-b082-cc4740c98b6b',
 '617071fd-6ff3-11ee-b8f9-cc4740c98b6b',
 '617071fe-6ff3-11ee-9aea-cc4740c98b6b',
 '617071ff-6ff3-11ee-8297-cc4740c98b6b',
 '61707200-6ff3-11ee-9092-cc4740c98b6b',
 '61707201-6ff3-11ee-bdca-cc4740c98b6b',
 '61707202-6ff3-11ee-a0b8-cc4740c98b6b',
 '61707203-6ff3-11ee-9f19-cc4740c98b6b',
 '61707204-6ff3-11ee-99ea-cc4740c98b6b',
 '61707205-6ff3-11ee-bac4-cc4740c98b6b',
 '61707206-6ff3-11ee-b229-cc4740c98b6b',
 '61707207-6ff3-11ee-b51c-cc4740c98b6b',
 '61707208-6ff3-11ee-9369-cc4740c98b6b',
 '61707209-6ff3-11ee-bdf0-cc4740c98b6b',
 '6170720a-6ff3-11ee-9e80-cc4740c98b6b',
 '6170720b-6ff3-11ee-96ea-cc4740c98b6b',
 '6170720c-6ff3-11ee-a556-cc4740c98b6b',
 '6170720d-6ff3-11ee-b910-cc4740c98b6b',
 '6170720e-6ff3-11ee-ba1e-cc4740c98b6b',
 '6170720f-6ff3-11ee-8dfa-cc4740c98b6b',
 '61707210-6ff3-11ee-a04c-cc4740c98b6b',
 '61707211-6ff3-11ee-88e9-cc4740c98b6b',
 '61707212-6ff3-

In [5]:
# db = DeepLake(dataset_path=f"hub://veaceslavcalestru/twitter-algorithm", read_only=True, embedding_function=embeddings)

retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'
retriever.search_kwargs['fetch_k'] = 100
retriever.search_kwargs['maximal_marginal_relevance'] = True
retriever.search_kwargs['k'] = 10

def filter(x):
    if 'com.google' in x['text'].data()['value']:
        return False
    metadata = x['metadata'].data()['value']
    return 'scala' in metadata['source'] or 'py' in metadata['source']


from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

model = ChatOpenAI(model='gpt-3.5-turbo')
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

In [6]:
questions = [
    "What does favCountParams do?",
    "is it Likes + Bookmarks, or not clear from the code?",
    "What are the major negative modifiers that lower your linear ranking parameters?",   
    "How do you get assigned to SimClusters?",
    "What is needed to migrate from one SimClusters to another SimClusters?",
    "How much do I get boosted within my cluster?",   
    "How does Heavy ranker work. what are it’s main inputs?",
    "How can one influence Heavy ranker?",
    "why threads and long tweets do so well on the platform?",
    "Are thread and long tweet creators building a following that reacts to only threads?",
    "Do you need to follow different strategies to get most followers vs to get most likes and bookmarks per tweet?",
    "Content meta data and how it impacts virality (e.g. ALT in images).",
    "What are some unexpected fingerprints for spam factors?",
    "Is there any difference between company verified checkmarks and blue verified individual checkmarks?",
]

chat_history = []

for question in questions:
    result = qa({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))
    print(f"-> **Question**:{question}\n")
    print(f"**Answer: {result['answer']}\n")

-> **Question**:What does favCountParams do?

**Answer: The `favCountParams` is an instance of `ThriftLinearFeatureRankingParams` and represents a parameter related to the ranking of tweets based on the number of favorites (likes) they have received. It is used in the context of ranking tweets based on their popularity or engagement level, with higher values indicating a higher weight or importance given to the number of favorites a tweet has.

-> **Question**:is it Likes + Bookmarks, or not clear from the code?

**Answer: No, it is not clear from the given code whether `favCountParams` includes both Likes and Bookmarks.

-> **Question**:What are the major negative modifiers that lower your linear ranking parameters?

**Answer: Based on the given code snippets, it is not possible to determine which modifiers have a major negative impact on the linear ranking parameters. The code snippets only provide information about the parameters and their weights, but do not indicate whether they h