In [2]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn import metrics
import math
from transformers import pipeline
import torch
import json
import ollama
import csv
import time

In [3]:
df = pd.read_csv('filtered_data_.csv')
df.drop(columns=["user_id", "review_id", "book_id"], inplace=True)
df = df.dropna(subset=['review_text'])
df = df[(df["rating"] != 0)] # remove reviews with no rating
df["review_text"] = df["review_text"].astype(str)
df

Unnamed: 0,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments,type
0,4,Fun sequel to the original.,735919,735919,735079.8,735663.8,7,0,children
1,5,One of my favorite books to read to my 5 year ...,735545,736354,736353.0,735663.8,4,0,children
2,5,One of the best and most imaginative childrens...,735529,735529,735079.8,735663.8,6,1,children
3,5,My daughter is loving this. Published in the 6...,735204,736410,735079.8,735663.8,5,4,children
4,5,A friend sent me this. Hilarious!,734268,734531,734268.0,734268.0,5,0,children
...,...,...,...,...,...,...,...,...,...
39995,3,*3.5 stars*,735888,736092,735889.0,735888.0,26,2,young_adult
39996,3,Written in the Stars is actually a book I don'...,735792,736092,735793.0,735792.0,55,3,young_adult
39997,3,*2.5 stars* \r\n I originally rated this a 3.5...,735790,735827,735827.0,735819.0,25,7,young_adult
39998,3,*3.75 stars* \r\n I've never given that kind o...,735778,735781,735781.0,735778.0,11,0,young_adult


In [4]:
df.dtypes

rating            int64
review_text      object
date_added        int64
date_updated      int64
read_at         float64
started_at      float64
n_votes           int64
n_comments        int64
type             object
dtype: object

In [5]:
num_rows = df.shape[0]
num_cols = df.shape[1]
print(f'The number of rows in the DataFrame is: {num_rows}')
print(f'The number of columns in the DataFrame is: {num_cols}')

The number of rows in the DataFrame is: 38888
The number of columns in the DataFrame is: 9


In [5]:
# df5 = df[df.rating == 5]
# text5 = ''
# for line in df5.review_text:
#     text5 += line + " "
# wordcloud5 = WordCloud(background_color="white").generate(text5)
# plt.imshow(wordcloud5)
# plt.axis("off")

In [6]:

# df1 = df[df.rating == 1]
# text1 = ''
# for line in df1.review_text:
#     text1 += line + " "
# wordcloud1 = WordCloud(background_color="white").generate(text1)
# plt.imshow(wordcloud1)
# plt.axis("off")

# Predicting rating based on review_text

In [7]:
x = df['review_text']
y = df['rating']


In [8]:

# # Split the data into training and testing sets with random_state = 2024 to ensure reproducibility
# x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.2, random_state = 2024) 

# # To vectorize with min of 1 word and max of 2 words and a minimum document frequency of 10
# vectorizer = CountVectorizer(ngram_range = (1, 2), min_df=10)

# # Fitting (learn) the vectorizer with the vocab in the training set
# vectorizer.fit(x_train)

# # Transform the original training data and test data into document-term matrices
# # with the fitted vectorizer
# X_train = vectorizer.transform(x_train)
# X_test = vectorizer.transform(x_test)

# X_train.toarray()
# X_test.toarray()
# Y_train = np.array(y_train)
# Y_test = np.array(y_test)

# regr = linear_model.LinearRegression()
# regr.fit(X_train, Y_train)


In [9]:
# Y_pred = regr.predict(X_test)
# ssr = np.sum(Y_pred - np.mean(Y_test)) ** 2
# mse = metrics.mean_squared_error(Y_test, Y_pred)
# rmse = math.sqrt(mse)
# print('R^2 score: %.2f' % regr.score(X_test, Y_test))
# print('RMSE: %.2f' % rmse)

# Trying classification of the rating using ollama with LLM Llama 3.2

Inspired by [Text Classification with Large Language Models (LLMs)](https://hussainpoonawala.medium.com/text-classification-with-large-language-models-llms-a23c731a687e) by Hussain Poonawala.

In [1]:
i = 371

# Open a CSV file to write the JSON output
with open('predicted_ratings.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Real Rating', 'Predicted Rating', 'Assertion', 'Audit Reasons', 'Reason', 'Iteration', 'StartTime', 'EndTime', 'Duration'])

    while i <= 2500:
        # Record the start time
        start_time = time.time()
        prompt = f"""
            You are tasked with classifying a book review based on a human's rating on a scale of 1 to 5, reflecting their overall satisfaction with the book. The review is the following: "{df.iloc[i]['review_text']}".
            The review was posted on an online book reviewing platform. Use the following criteria to guide your classification:
            **Rating Scale**:
            1. 1 - Very Bad (Extremely dissatisfied)
            2. 2 - Bad (Mostly dissatisfied)
            3. 3 - Average (Mixed or neutral feelings)
            4. 4 - Good (Generally satisfied)
            5. 5 - Very Good (Highly satisfied)

            **Guidelines**:
            1. Read the review carefully, paying attention to both the explicit content and implied sentiments.
            2. Think like a human reviewer and infer the overall impression and satisfaction level of the user from the text.
            3. Avoid simple keyword matching or sentiment analysis. Instead, consider the deeper meaning and tone of the review.
            
            **Key Aspects to Consider**:
            - **Sentiment**: Is the overall tone positive, negative, or neutral?
            - **Engagement**: Does the reviewer seem deeply involved with the book or detached and indifferent?
            - **Plot and Character Satisfaction**: Are there complaints about the plot or characters that indicate dissatisfaction, or praise that suggests satisfaction?
            - **Recommendation Willingness**: Does the reviewer express a desire to recommend the book to others, or show reluctance?

            **Audit Reason Categories**:
            You must select one or more reasons that justify your classification, using the following categories:
            1. Positive Sentiment
            2. Negative Sentiment
            3. Neutral Sentiment
            4. Plot Dissatisfaction
            5. Character Dissatisfaction
            6. Lack of Engagement
            7. Recommendation Hesitation
            8. Pacing Issues
            9. Repetition (Feeling of Deja Vu)
            10. General Enjoyment

            **DOs**:
            1. Be specific and clear in your audit reasoning.
            2. Think like a human reviewer, understanding the overall impression, tone, and the context of the book being reviewed.
            3. Consider nuanced factors like pacing, emotional engagement, and depth of plot/character development.
            4. Use the Output Format provided below to structure your response.
            
            **DON'Ts**:
            1. Do not rely purely on word choices for classification.
            2. Avoid guesses or assumptions. Base your classification solely on the review content provided.
            3. Do not repeat the same audit reasons unless strongly justified by the review.
            4. Do not add any additional information before or after the Output Format, i really just want to see the json output.
            5. Do not include any extra characters in the output.

            **Output Format**:
            {{
                "predictedRating": "",
                "auditReasons": [],
                "reason": ""
            }}

            Examples:

            1. Review: 'One of my favorite books to read to my 5-year-old. Beautifully written and keeps them engaged the whole time.'
                Output: {{ predictedRating: "5", auditReasons: ['Positive Sentiment', 'General Enjoyment'], "reason": "The reviewer expresses clear satisfaction and high engagement, indicating a very positive experience." }}

            2. Review: 'Fun sequel to the original, but not quite as captivating.'
                Output: {{ predictedRating: "4", auditReasons: ['Positive Sentiment', 'Engagement Mismatch'], "reason": "While the review is generally positive, there is mention of a slight drop in engagement compared to the original." }}

            3. Review: 'I'm getting bored... it's the same book, over and over again. But now I'm invested in finding out what happens to these children.'
                Output: {{ predictedRating: "2", auditReasons: ['Negative Sentiment', 'Repetition'], "reason": "The review indicates dissatisfaction with the repetitive nature of the book, though a slight engagement remains." }}

            4. Review: 'The characters were well-written, but the pacing was just too slow. Took me a long time to finish.'
                Output: {{ predictedRating: "3", auditReasons: ['Neutral Sentiment', 'Pacing Issues'], "reason": "The reviewer had mixed feelings, appreciating character development but struggling with the pacing." }}

            Please follow the guidelines strictly and classify based on the full context of the review.
            """
        response = ollama.generate(model='llama3.2:latest', prompt=prompt)["response"]
        # Parse the response into JSON string
        try:
            parsed_json_response = json.loads(response)
        except json.JSONDecodeError as e:
            continue
        # Accessing the parsed JSON data
        predicted_rating = parsed_json_response["predictedRating"]
        audit_reasons = parsed_json_response["auditReasons"]
        reason = parsed_json_response["reason"]
        real_rating = df.iloc[i]['rating']
        try:
            assertion = real_rating == int(predicted_rating)
        except ValueError as e:
            print(f"Error converting predicted rating to int: {e}")
            continue
        print(f"Real Rating: {real_rating}, Predicted Rating: {predicted_rating}, Assertion: {assertion}")
        
        # Record the end time
        end_time = time.time()
        duration = end_time - start_time
        
        # Write the data to the CSV file
        writer.writerow([real_rating, predicted_rating, assertion, audit_reasons, reason, i, start_time, end_time, duration])
        file.flush()  # Ensure data is written to the file at each iteration
        
        i += 1

NameError: name 'csv' is not defined

In [6]:
df_predicted = pd.read_csv('predicted_ratings.csv')
df_predicted.head(10)

Unnamed: 0,Real Rating,Predicted Rating,Assertion,Audit Reasons,Reason,Iteration,StartTime,EndTime,Duration
0,2,3,False,['Neutral Sentiment'],"The review is neutral, neither overwhelmingly ...",79,1729377000.0,1729377000.0,3.654673
1,2,3,False,"['Neutral Sentiment', 'Lack of Engagement']",The reviewer seems neutral about their experie...,80,1729377000.0,1729377000.0,6.777081
2,4,5,False,"['Positive Sentiment', 'General Enjoyment']",The reviewer uses the phrase 'perfekt bok' (pe...,81,1729377000.0,1729377000.0,7.102875
3,5,5,True,"['Positive Sentiment', 'General Enjoyment']","The reviewer expresses clear satisfaction, men...",82,1729377000.0,1729377000.0,6.315136
4,4,4,True,"['Positive Sentiment', 'General Enjoyment']",The reviewer expresses general satisfaction wi...,83,1729377000.0,1729377000.0,7.613893
5,5,5,True,"['Positive Sentiment', 'Engagement', 'Plot Sat...",The reviewer expresses strong satisfaction wit...,84,1729377000.0,1729377000.0,12.490463
6,5,4,False,"['Positive Sentiment', 'Engagement Mismatch', ...",The reviewer expresses a generally positive se...,85,1729377000.0,1729377000.0,11.461255
7,4,5,False,"['Positive Sentiment', 'General Enjoyment', 'E...",The reviewer expresses clear satisfaction with...,86,1729377000.0,1729377000.0,7.734398
8,5,5,True,"['Positive Sentiment', 'General Enjoyment']",The reviewer uses positive language throughout...,87,1729377000.0,1729377000.0,8.052784
9,3,4,False,"['Positive Sentiment', 'General Enjoyment', 'P...",The reviewer expresses a generally positive se...,88,1729377000.0,1729377000.0,11.09968
