In [2]:
import os

import openai
import polars as pl
from tqdm import tqdm

## Get data with Reddit jokes

In [None]:
# Read the Reddit jokes CSV into a polars DataFrame
cons_red_df = pl.read_csv(source="consice_reddit.csv")
# Preview the first five rows
print(cons_red_df.head(n=5))


In [13]:
# Create a new column for the whole joke (title, newline, body).
# Nulls are replaced with empty strings first: string concatenation in polars
# yields null as soon as one operand is null, and a null "whole_joke" cannot be
# appended to a prompt string later on.
cons_red_df = cons_red_df.with_columns([
    (
        pl.col("title").fill_null("")
        + "\n"
        + pl.col("selftext").fill_null("")
    ).alias("whole_joke")
])

In [None]:
# Show the combined text of the joke at row index 1
print(cons_red_df.get_column("whole_joke")[1])

## Let GPT-3.5 rate funniness

In [15]:
# Credentials are read from the environment so that no API key is ever saved in
# the notebook. Set OPENAI_API_KEY (and OPENAI_BASE_URL when a gateway/proxy
# should be used instead of the default endpoint) before starting the kernel.
# An unset or empty variable becomes None, which leaves the choice of default
# to the OpenAI client.
openai_base_url = os.environ.get("OPENAI_BASE_URL") or None
openai_api_key = os.environ.get("OPENAI_API_KEY") or None

# Initialize a client
client = openai.OpenAI(api_key=openai_api_key, base_url=openai_base_url)

In [None]:

# Instruction prefix for the rating request; the joke text is appended after it.
prompt_for_rating_funniness = (
    "As a professional joke rater, you rate jokes from 0 (not funny at all), "
    "to 5 (died laughing). Only output the rating number.\n"
    "Please rate this joke:\n"
)

def open_ai_gateway(prompt, model="gpt-3.5-turbo"):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo",
            so existing one-argument calls keep their behavior.

    Returns:
        The content of the first choice's message in the response.
    """
    # Uses the module-level `client` created in the configuration cell.
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return chat_completion.choices[0].message.content

# Ask the model for a rating of every joke, with a progress bar.
# Iterating the "whole_joke" column directly avoids looking up the column's
# position in the row tuple on every iteration.
ratings = [
    open_ai_gateway(prompt_for_rating_funniness + joke)
    for joke in tqdm(
        cons_red_df["whole_joke"],
        total=cons_red_df.height,
        desc="Processing jokes",
    )
]

# Attach the ratings (stored exactly as returned by open_ai_gateway) as a
# column. No placeholder column is created beforehand: with_columns adds the
# column by name, or replaces it if the cell is re-run.
cons_red_df = cons_red_df.with_columns([pl.Series("funniness_rating", ratings)])

print(cons_red_df)


## Let GPT explain the joke

In [None]:
# Instruction prefix for the explanation request; the joke text is appended after it.
prompt_for_explanation = (
    "As a professional joke explainer, "
    "please provide an explanation for this joke:\n"
)

# Ask the model for an explanation of every joke, with a progress bar.
# Iterating the "whole_joke" column directly avoids looking up the column's
# position in the row tuple on every iteration.
explanations = [
    open_ai_gateway(prompt_for_explanation + joke)
    for joke in tqdm(
        cons_red_df["whole_joke"],
        total=cons_red_df.height,
        desc="Processing jokes",
    )
]

# Attach the explanations as a column. No placeholder column is created
# beforehand: with_columns adds the column by name, or replaces it if the cell
# is re-run.
cons_red_df = cons_red_df.with_columns([pl.Series("joke_explanation", explanations)])


In [None]:
# Print the DataFrame to inspect its current columns
print(cons_red_df)

In [None]:
# Display a specific cell: row index 2, third column counted from the right
print(cons_red_df.row(2)[-3])

# Your turn

Turn the above code into a Kedro pipeline! You can find the relevant code under `src/reddit_summarizer/pipelines/data_processing`. Start by initializing the data, by running `init_data.ipynb`.

You can visually inspect the pipeline with `kedro viz run --autoreload`. When ready to test pipelines, simply run `kedro run`.

During your development process, it can be helpful to inspect what is saved in the different steps of the pipelines – this can be done in the notebook under `notebooks/kedro_catalog.ipynb`.

## Bonus points I

Refactor the Summarize node into two distinct nodes: one for the explanation and one for the joke rating.

## Bonus points II

As a final step, plot the joke ratings in a Plotly chart, and have it displayed in Kedro Viz!

## Bonus points III

Come up with and implement more uses of LLMs to augment the rows in the given dataset