In [14]:
import asyncio
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from sqlalchemy.future import select
from statsmodels.api import OLS
import pandas as pd
from sqlalchemy import and_

from fleecekmbackend.db.models import Author, Answer, Rating
from fleecekmbackend.core.config import DATABASE_URL

# Async SQLAlchemy engine (statement echo disabled) plus the session factory
# bound to it; both are module-level names used by the query helpers.
engine = create_async_engine(DATABASE_URL, echo=False)
async_session = sessionmaker(
    bind=engine,
    class_=AsyncSession,
    expire_on_commit=False,
)


async def fetch_data(model_name="meta-llama/Meta-Llama-3-70B-Instruct"):
    """Fetch every (Answer, Rating) pair for answers written by one author model.

    Args:
        model_name: Value compared against ``Author.model``. Defaults to
            'meta-llama/Meta-Llama-3-70B-Instruct', so calling ``fetch_data()``
            with no arguments behaves exactly as before.

    Returns:
        The list returned by ``Result.all()``. Each row unpacks to
        ``(Answer, Rating)``; because ratings are inner-joined to answers,
        an answer appears once per rating it has.
    """
    # Build the statement up front so the session is only held open for the
    # actual round-trip to the database.
    stmt = (
        select(Answer, Rating)
        .join(Rating, Answer.id == Rating.answer_id)
        .join(Author, Author.id == Answer.author_id)
        .where(Author.model == model_name)
    )
    async with async_session() as session:
        results = await session.execute(stmt)
        return results.all()
    
def prepare_data(data):
    """Turn (answer, rating) pairs into a two-column DataFrame.

    Args:
        data: Iterable of ``(answer, rating)`` pairs. Only ``answer.setting``
            and ``rating.value`` are read.

    Returns:
        A DataFrame with columns ``setting`` and ``rating_value``.
        ``setting`` holds the integer category codes of the original values,
        not the labels themselves, so the code-to-label mapping is not kept
        on the returned frame. An empty ``data`` yields an empty frame that
        still has both columns.
    """
    records = [
        {'setting': answer.setting, 'rating_value': rating.value}
        for answer, rating in data
    ]

    # Naming the columns explicitly keeps them present when `records` is
    # empty; without this, the 'setting' lookup below raises KeyError.
    df = pd.DataFrame(records, columns=['setting', 'rating_value'])

    # Convert setting to a categorical variable and keep only its integer codes
    df['setting'] = df['setting'].astype('category').cat.codes
    return df
    
# Run the query with a top-level `await` (works inside a notebook cell), then
# build the DataFrame that the following cells read as `prepared_data`.
data = await fetch_data()
prepared_data = prepare_data(data)
# Bare last expression: the notebook renders the frame as this cell's output.
prepared_data

Unnamed: 0,setting,rating_value
0,0,5
1,0,5
2,0,5
3,0,5
4,0,5
...,...,...
765777,1,2
765778,0,5
765779,0,5
765780,0,5


In [15]:
# Tally how often each distinct value occurs in the two columns.
setting_freq = prepared_data['setting'].value_counts()
rating_value_freq = prepared_data['rating_value'].value_counts()

# Each heading is followed by its table on the next line (sep="\n").
print("Frequency of values in 'setting' column:", setting_freq, sep="\n")
print("\nFrequency of values in 'rating_value' column:", rating_value_freq, sep="\n")

Frequency of values in 'setting' column:
setting
0    764057
1      1725
Name: count, dtype: int64

Frequency of values in 'rating_value' column:
rating_value
5    727476
4     33385
2      3311
1       937
3       616
0        57
Name: count, dtype: int64


In [16]:
from sqlalchemy import func

async def get_setting_counts():
    """Count answers per ``Answer.setting`` value.

    Returns:
        The list returned by ``Result.all()``, one ``(setting, count)`` row
        per distinct setting. No author filter is applied in this query.
    """
    # One grouped row per distinct setting, paired with its count.
    per_setting_stmt = select(
        Answer.setting, func.count(Answer.setting)
    ).group_by(Answer.setting)

    async with async_session() as session:
        grouped = await session.execute(per_setting_stmt)
        return grouped.all()

# Run the coroutine directly: a Jupyter cell accepts a top-level `await`,
# so no explicit event-loop setup is needed here.
counts = await get_setting_counts()
# Prints the raw list of (setting, count) rows returned by the grouped query.
print(counts)

[('ic', 859725), ('human', 36), ('zs', 1736)]


In [None]:

async def perform_regression(df):
    """Fit an OLS regression of ``rating_value`` on ``setting`` with an intercept.

    Args:
        df: DataFrame with a ``setting`` column (the regressor) and a
            ``rating_value`` column (the response).

    Returns:
        The summary object of the fitted model (``model.summary()``).

    Note:
        Nothing is awaited inside; the function stays ``async`` only so that
        existing ``await perform_regression(df)`` calls keep working.
    """
    # statsmodels' OLS does not add an intercept by itself. Without a constant
    # column the fit is forced through the origin, i.e. rows whose setting
    # code is 0 would be predicted a rating of 0.
    X = df[['setting']].copy()  # Independent variable
    X.insert(0, 'const', 1.0)   # Intercept term
    y = df['rating_value']  # Dependent variable
    model = OLS(y, X).fit()
    return model.summary()