In [108]:
import json
import os
import re
import time
from pathlib import Path

import pandas as pd
from huggingface_hub import InferenceClient


# Hugging Face API token. It is read from the HF_TOKEN environment variable so
# that the secret never lives in the notebook or in version control. When the
# variable is unset this is None and the client is created without an explicit
# token.
# SECURITY: a real token used to be hard-coded on this line; it has been
# exposed and must be revoked in the Hugging Face account settings.
my_token = os.environ.get("HF_TOKEN")


def load_test_data():
    """Load the IMDB review CSV into a DataFrame.

    The file is expected at ``data/inputs/IMDB-movie-reviews.csv`` relative to
    this file's directory, or relative to the current working directory when
    ``__file__`` is not defined (notebooks, interactive shells).

    Returns:
        pandas.DataFrame: the CSV content with the ``sentiment`` column
        renamed to ``Target`` and an extra ``review_index`` column holding
        the row index.
    """
    try:
        root = Path(__file__).resolve().parent
    except NameError:
        # No __file__ in a notebook kernel / REPL: fall back to the cwd.
        root = Path().resolve()

    csv_file = root.joinpath("data", 'inputs', "IMDB-movie-reviews.csv")

    # The file is ';'-separated and latin-1 encoded.
    frame = pd.read_csv(csv_file, sep=';', encoding='latin-1').rename(
        columns={'sentiment': 'Target'}
    )

    # Keep the row position as an explicit column so it can serve as a join key.
    frame['review_index'] = frame.index
    return frame


def _parse_binary_label(reply):
    """Return the first standalone ``0`` or ``1`` found in a model reply.

    Chat models do not always answer with a bare digit (e.g. ``"1."`` or
    ``"Answer: 0"``), so a plain ``int(reply)`` is too strict. Digits that are
    part of a longer number or word (``"10/10"``, ``"0.5"``) are ignored.

    Raises:
        ValueError: if no standalone 0/1 is present in ``reply``.
    """
    found = re.search(r"(?<![\w.])[01](?!\w|\.\d)", reply)
    if found is None:
        raise ValueError(f"could not parse a 0/1 label from model reply: {reply!r}")
    return int(found.group())


def get_sentiment(reviews, model='meta-llama/Meta-Llama-3-8B-Instruct', max_chars=4010, pause_seconds=5):
    """Classify movie reviews as positive/negative with a chat model.

    One chat-completion request is sent per review through the Hugging Face
    ``InferenceClient``, authenticated with the module-level ``my_token``.

    Args:
        reviews: a single review string, or any iterable of review strings.
        model: model id passed to ``InferenceClient``.
        max_chars: each review is truncated to this many characters before
            being inserted into the prompt.
        pause_seconds: seconds to sleep after every request (simple rate
            limiting); a value <= 0 disables the pause.

    Returns:
        pandas.DataFrame with columns ``review_index`` (position in
        ``reviews``), ``review`` (the truncated text), ``positive_score``
        (1 or 0) and ``Prediction`` (``'positive'`` / ``'negative'``). Empty
        input yields an empty frame with the same columns.

    Raises:
        ValueError: if a model reply contains no parseable 0/1 label.
    """
    # A bare string is one review; any other iterable (list, tuple, Series,
    # ...) is materialised as a list of reviews.
    if isinstance(reviews, str):
        reviews = [reviews]
    else:
        reviews = list(reviews)

    columns = ["review_index", "review", "positive_score", "Prediction"]
    if not reviews:
        # Nothing to score: skip creating a client and keep the output schema.
        return pd.DataFrame(columns=columns)

    client = InferenceClient(model=model, token=my_token)

    prompt="""Predict whether the following document is a positive or negative movie review:
    [REVIEW]

    If it is positive, return 1, and if it is negative return 0. Do not give any other answers.
    """

    all_rows = []
    for i, review in enumerate(reviews):
        review = review[:max_chars]  # truncate overly long reviews

        messages = [
            {"role": "system", "content": "You are an expert in movie reviews"},
            {"role": "user", "content": prompt.replace("[REVIEW]", review)},
        ]

        output = client.chat_completion(messages, max_tokens=20)
        print(i)  # progress indicator
        if pause_seconds > 0:
            time.sleep(pause_seconds)

        try:
            score = _parse_binary_label(output.choices[0].message.content)
        except ValueError as exc:
            # Re-raise with the review position so the failing input can be found.
            raise ValueError(f"review {i}: {exc}") from exc

        all_rows.append({
            "review_index": i,
            "review": review,
            "positive_score": score,
            "Prediction": 'positive' if score == 1 else 'negative',
        })

    return pd.DataFrame(all_rows, columns=columns)


def save_outputs(data, predictions, model_name, adaptations, inference_time, other_comments):
    """Persist one run's predictions and metadata.

    Files are written to ``data/outputs/runs/<model_name>`` relative to this
    file's directory (or the current working directory when ``__file__`` is
    not defined); the directory is created if needed.

    Args:
        data: DataFrame containing a ``review_index`` column.
        predictions: DataFrame with ``review_index`` and ``review`` columns;
            everything except ``review`` is left-joined onto ``data``.
        model_name: stored in the metadata and used as the run directory name.
        adaptations, inference_time, other_comments: stored in the metadata.

    Writes:
        ``predictions.csv`` (``;``-separated, without ``review_index``) and
        ``metadata.json``.
    """
    try:
        root = Path(__file__).resolve().parent
    except NameError:
        # No __file__ in a notebook kernel / REPL: fall back to the cwd.
        root = Path().resolve()

    run_dir = root.joinpath("data", 'outputs', 'runs', model_name)
    run_dir.mkdir(parents=True, exist_ok=True)

    # Join the predictions onto the input rows; the join key is dropped again
    # before saving.
    merged = (
        data
        .merge(predictions.drop(columns=['review']), how='left', on='review_index')
        .drop(columns=['review_index'])
    )
    merged.to_csv(run_dir / 'predictions.csv', index=False, sep=';')

    run_info = {
        'model': model_name,
        'adaptations': adaptations,
        'inference_time': inference_time,
        'other_comments': other_comments,
    }
    with open(run_dir / "metadata.json", "w") as handle:
        handle.write(json.dumps(run_info, indent=4))

In [106]:
# Run configuration passed to `save_outputs`.
model_name = "generative-llama"  # also used as the name of the run's output directory
adaptations = other_comments = ""  # free-text notes stored in metadata.json (none for this run)



In [102]:
# Smoke test on the first three reviews only. `data` is loaded in this cell so
# that it does not depend on a later cell having been run first (the notebook
# must work on a fresh kernel, executed top to bottom).
data = load_test_data()
predictions = get_sentiment(list(data.review)[:3])

0
1
2


In [93]:
# Display the DataFrame returned by `get_sentiment` for a quick visual check.
predictions

Unnamed: 0,review_index,review,positive_score,Prediction
0,0,One of the other reviewers has mentioned that ...,1,positive
1,1,A wonderful little production. <br /><br />The...,1,positive
2,2,I thought this was a wonderful way to spend ti...,1,positive


In [109]:
# Full run: score every review, time the inference and persist the results.
data = load_test_data()

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during a long run (time.time() can be).
start = time.perf_counter()
predictions = get_sentiment(list(data.review))
end = time.perf_counter()
inference_time = end - start  # seconds

save_outputs(data, predictions, model_name, adaptations, inference_time, other_comments)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-6859a521-1138b3532cde58a43deb5e9a;b757eb28-5e9f-4c87-843b-42f81cd33dcf)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.

In [None]:

if __name__ == "__main__":
    # Script entry point: score every review, time the inference and persist
    # the results. NOTE: inside a notebook kernel `__name__` is "__main__" as
    # well, so executing this cell performs the full (slow, billed) run.
    data = load_test_data()

    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments during a long run (time.time() can be).
    start = time.perf_counter()
    predictions = get_sentiment(list(data.review))
    end = time.perf_counter()
    inference_time = end - start  # seconds

    save_outputs(data, predictions, model_name, adaptations, inference_time, other_comments)
