This notebook was created to quickly test the prompts and predictions of the model without fetching data multiple times.

In [205]:
import os
from huggingface_hub import InferenceClient

from dotenv import load_dotenv
from constants import prompt_template
import concurrent.futures
from supabase import create_client, Client
from datetime import datetime, timedelta
from huggingface_hub import InferenceClient
import instaloader
from itertools import dropwhile, takewhile
import re
import json

In [186]:
# Load variables from a .env file into the process environment; later cells read
# their credentials and connection settings through os.getenv().
load_dotenv()

True

In [187]:
# Table that the list of account names ("club_name" column) is read from.
ACCOUNTS_TABLE="club_accounts"
# NOTE(review): not referenced by any later cell in this notebook; presumably the
# table that parsed predictions are meant to be inserted into -- confirm.
DATA_TABLE="test"

# Model identifier handed to the Inference wrapper when it is constructed.
MODEL="meta-llama/Meta-Llama-3-8B-Instruct"

In [188]:
class Database:
    """Small convenience wrapper around a Supabase client.

    The connection is configured from the SUPABASE_URL and SUPABASE_KEY
    environment variables when the object is created.
    """

    def __init__(self):
        self.db: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))

    def getData(self, table_name, column):
        """Return the values of ``column`` for every row of ``table_name``.

        TODO: this is a thin convenience over what the Supabase Python client
        already offers and could be replaced by a direct query.

        Args:
            table_name: Name of the table to read.
            column: Key to extract from each returned row.

        Returns:
            A list with one entry per row, in the order the rows were returned.
        """
        response = self.db.table(table_name).select("*").execute()
        return [row[column] for row in response.data]

    def insertData(self, table_name, data):
        """Insert ``data`` into ``table_name``.

        Args:
            table_name: Name of the table to insert into.
            data: Rows to insert; expected to be JSON-like records enclosed in a list.

        Returns:
            The response of the insert request, or the exception object if the
            request failed (the exception is returned, not raised).
        """
        try:
            return self.db.table(table_name).insert(data).execute()
        except Exception as exception:
            return exception

    def purgeData(self, table_name, date):
        """Delete every row of ``table_name`` whose ``date`` column is before ``date``.

        Args:
            table_name: Name of the table to delete from.
            date: Cut-off value; rows with ``date`` strictly less than it are removed.

        Returns:
            The response of the delete request, or the exception object if the
            request failed (the exception is returned, not raised).
        """
        # The request itself must sit inside the try block; otherwise a failing
        # execute() escapes and the except branch can never be reached.
        try:
            return self.db.table(table_name).delete().lt("date", date).execute()
        except Exception as exception:
            return exception

In [189]:
class Inference:
    """Wraps a Hugging Face ``InferenceClient`` for classifying Instagram posts."""

    def __init__(self, model, token):
        """Create the inference client.

        Args:
            model: Model identifier passed to ``InferenceClient``.
            token: Access token passed to ``InferenceClient``.
        """
        # Despite the attribute name, this holds the client object used to
        # reach the model, not the model itself.
        self.model = InferenceClient(model, token=token)

    def predict_post(self, post):
        """Fill the prompt template with one post and request a text generation.

        Args:
            post: Mapping with the keys ``account``, ``date`` and ``caption``.

        Returns:
            Whatever ``text_generation`` returns for the formatted prompt.
        """
        prompt = prompt_template.format(
            account=post['account'],
            date=post['date'],
            caption=post['caption'],
        )
        return self.model.text_generation(prompt=prompt)

In [192]:
def fetchData(accounts, StartDate, EndDate, L):
    """Collect the posts of each account that fall inside a date window.

    For every account the post iterator is consumed in order: leading posts
    dated after ``EndDate`` are skipped, then posts are taken for as long as
    their date is after ``StartDate``.
    NOTE(review): this assumes the iterator yields posts newest first -- confirm.

    Args:
        accounts: Iterable of Instagram usernames.
        StartDate: Lower bound (exclusive) of the window.
        EndDate: Upper bound; posts dated after it are skipped.
        L: Instaloader instance whose ``context`` is used to look up profiles.

    Returns:
        A list of dicts with the keys ``account``, ``date`` (formatted as
        ``YYYY/MM/DD``) and ``caption``.
    """
    def newer_than_window(candidate):
        return candidate.date > EndDate

    def still_in_window(candidate):
        return candidate.date > StartDate

    collected = []

    for username in accounts:
        profile = instaloader.Profile.from_username(L.context, username)
        from_window_start = dropwhile(newer_than_window, profile.get_posts())
        window = takewhile(still_in_window, from_window_start)

        collected.extend(
            {
                "account": username,
                "date": item.date.strftime('%Y/%m/%d'),
                "caption": item.caption,
            }
            for item in window
        )

    return collected

In [193]:
# NOTE(review): fetchData, Database and Inference are also defined in earlier cells
# of this notebook. When the cells run top to bottom, these imports rebind the names
# to the versions in the project modules, so edits to the in-notebook definitions
# are not what the later cells use. TODO confirm which set of definitions is intended.
from scraper import fetchData
from database import Database
from inference import Inference

In [None]:
# Create the Instaloader instance that fetchData uses (via L.context) and restore
# a previously saved session. Both arguments come from environment variables;
# presumably the Instagram username and the session file path -- confirm.
L = instaloader.Instaloader()
L.load_session_from_file(os.getenv("INSTAGRAM_USER"), os.getenv("INSTAGRAM_SESSION"))

In [194]:
# Database handle and model wrapper shared by the cells below.
db = Database()
LLAMA = Inference(model=MODEL, token=os.getenv("HUGGING_FACE_TOKEN"))

# Fetch window: the last 7 days. Both bounds are derived from a single clock
# reading so the window is exactly 7 days long.
end_date = datetime.today()
start_date = end_date - timedelta(days=7)

In [195]:
# Read the account names to scrape from the accounts table.
accounts = db.getData(ACCOUNTS_TABLE, "club_name")
# Remove rows whose "date" is more than 90 days in the past.
# NOTE(review): this deletes from ACCOUNTS_TABLE, the same table the account list
# was just read from. Purging old rows looks like it was meant for the post data
# table (DATA_TABLE) -- confirm before running against real data.
purge_date = datetime.today() - timedelta(days=90)
db.purgeData(ACCOUNTS_TABLE, purge_date)

APIResponse[TypeVar](data=[], count=None)

In [196]:
# Fetch the posts of every account inside the start_date/end_date window. Each entry
# is a dict with "account", "date" and "caption", as consumed by predict_post below.
unfiltered_data = fetchData(accounts, start_date, end_date, L)

In [197]:
# Prompt used to classify one post. It is filled in with str.format(), so {account},
# {date} and {caption} are placeholders, and the doubled braces {{ }} in the
# "Response format" line are escapes that render as literal { }.
# This rebinds the prompt_template name imported from constants in the first cell.
# NOTE(review): whether predict_post picks up this version depends on which Inference
# class is in use (in-notebook vs. imported from the inference module) -- confirm.
# NOTE(review): the text asks for a DICTIONARY but the final line says "OUTPUT THE LIST";
# the parsing cell below looks for a {...} object, so "LIST" may be a leftover.
prompt_template = """
Given the following Instagram post data:

Account: {account}
Date: {date}
Caption: {caption}

Extract the following information and return it in this format with NO EXPLANATIONS:
- Account (as 'account')
- Posting Date (as 'posting_date')
- Type of post ('Competition', 'Networking', 'Workshop', 'Hiring', or 'Misc.')
- Relevant Dates (dates mentioned in the caption, EXCLUDE THE DATE POSTED, MUST BE IN YYYY/MM/DD FORMAT) If there are no relevant dates, put '' 

Only return the data in the response AS A DICTIONARY. DO NOT INCLUDE ANY EXPLANATIONS, CODE, OR RAW DATA.
PLEASE EXCLUDE ANY STATEMENTS SUCH AS "import re, import json, def extract_info, ''', #, Note: ", etc.

Response format: {{"account": "{account}", "posting_date": "{date}", "type": "<Type>", "relevant_dates": "<Relevant Dates>"}}

ONLY OUTPUT THE LIST, NO EXPLANATIONS, # COMMENTS, DUPLICATE DATA, NOTES, OR EXTRA TEXT.
"""

In [198]:

# Ask the model for one prediction per fetched post, keeping the posts' order.
predictions = []
predictions.extend(LLAMA.predict_post(post=entry) for entry in unfiltered_data)

In [None]:
#print(predictions[0])

In [201]:
def _first_json_object(text):
    """Return the first JSON object embedded in ``text``, or None if there is none.

    The model may wrap its answer in extra text or spread it over several lines,
    so every ``{`` is tried as a possible start of a JSON document. Decoding stops
    at the end of the object, which means trailing text after it is ignored.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


# Keep only the predictions that contain a parseable JSON object. A malformed
# prediction is skipped instead of aborting the whole cell.
to_upload = []
for prediction in predictions:
    result = _first_json_object(prediction)
    if result is not None:
        to_upload.append(result)


Load the Llama model locally and set its tuning parameters (commented out)

In [204]:
#from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#from huggingface_hub import login
#login(os.getenv("HUGGING_FACE_TOKEN_2"))
#tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
#model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
#generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
#response = generator(prompt_template, max_length=250, temperature=0.01, top_p=0.85, top_k=1, num_return_sequences=1)