In [9]:
# Clone the GitHub repository
!git clone https://github.com/Erum-hub/LHL_LLM.git

# Change directory to the cloned repository
%cd LHL_LLM

fatal: destination path 'LHL_LLM' already exists and is not an empty directory.
/content/LHL_LLM


In [10]:
import pandas as pd
import re

In [11]:
import csv

# Open and read the CSV file
with open('/content/LHL_LLM/Customer_Reviews.csv', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)

    # Read the header
    headers = next(reader)
    print("Headers:", headers)

Headers: ['', 'book name', 'review title', 'reviewer', 'reviewer rating', 'review description', 'is_verified', 'date', 'timestamp', 'ASIN', 'Author']


In [12]:
# Load dataset
df = pd.read_csv('/content/LHL_LLM/Customer_Reviews.csv')

In [13]:
# Inspect
print(df.head())

   Unnamed: 0                                          book name  \
0          17  Friends, Lovers, and the Big Terrible Thing: A...   
1         131                      Lessons in Chemistry: A Novel   
2         464                           Flash Cards: Sight Words   
3         644  A Court of Mist and Fury (A Court of Thorns an...   
4          78  The Ballad of Songbirds and Snakes (A Hunger G...   

                                        review title          reviewer  \
0                                    A very sad read  Veronica R Ewing   
1                    I LOVE THIS BOOK!! üòç ‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è             Sonia   
2                     Amazing for struggling readers     Ryan Williams   
3  The ending was stunning, as always, but I had ...          Brittany   
4                                         So Good!!!   Kindle Customer   

   reviewer rating                                 review description  \
0                4  What a shock to lose such a ta

In [14]:
# Count unique values in the 'review title' column
unique_count = df['review title'].nunique()
print(f"Number of unique review titles: {unique_count}")

Number of unique review titles: 819


In [15]:
top_titles = df['review title'].value_counts().head(10)
print(top_titles)


review title
Great book     14
Cute book      10
Book            6
Cute            5
Love it         4
Great book!     4
Great           4
Perfect         3
Loved it!       3
Love            3
Name: count, dtype: int64


In [16]:
top_title_list = top_titles.index.tolist()
df['title_group'] = df['review title'].apply(lambda x: x if x in top_title_list else 'Other')


In [17]:
# Combine review title and description
df['text'] = df['review title'].fillna('') + '. ' + df['review description'].fillna('')

# Clean text
def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", str(text))
    return text.lower().strip()

df['clean_text'] = df['text'].apply(clean_text)


In [21]:
# prompt: how to check header of above df

df.head()

Unnamed: 0.1,Unnamed: 0,book name,review title,reviewer,reviewer rating,review description,is_verified,date,timestamp,ASIN,Author,title_group,text,clean_text,clean_text_nostop
0,17,"Friends, Lovers, and the Big Terrible Thing: A...",A very sad read,Veronica R Ewing,4,What a shock to lose such a talented and funny...,True,30-10-2023,"Reviewed in the United States October 30, 2023",1250866448,Matthew Perry,Other,A very sad read. What a shock to lose such a t...,a very sad read what a shock to lose such a ta...,sad read shock lose talented funny actor young...
1,131,Lessons in Chemistry: A Novel,I LOVE THIS BOOK!! üòç ‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è,Sonia,5,"Oh, my God!! I LOVE THIS BOOK SO, SO, SO MUCH!...",True,24-10-2023,"Reviewed in the United States October 24, 2023",038554734X,Bonnie Garmus,Other,"I LOVE THIS BOOK!! üòç ‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è‚≠êÔ∏è. Oh, my God!! ...",i love this book oh my god i love this book ...,love book oh god love book much stars definite...
2,464,Flash Cards: Sight Words,Amazing for struggling readers,Ryan Williams,5,I bought these for my son who was struggling r...,True,29-09-2023,"Reviewed in the United States September 29, 2023",1338233580,Scholastic,Other,Amazing for struggling readers. I bought these...,amazing for struggling readers i bought these ...,amazing struggling readers bought son struggli...
3,644,A Court of Mist and Fury (A Court of Thorns an...,"The ending was stunning, as always, but I had ...",Brittany,4,** Warning: This is NOT a spoiler-free review ...,True,29-06-2016,"Reviewed in the United States June 29, 2016",1635575583,Sarah J. Maas,Other,"The ending was stunning, as always, but I had ...",the ending was stunning as always but i had ch...,ending stunning always character issues warnin...
4,78,The Ballad of Songbirds and Snakes (A Hunger G...,So Good!!!,Kindle Customer,5,"If you loved the Hunger Games, you have to rea...",True,29-10-2023,"Reviewed in the United States October 29, 2023",1339016575,Suzanne Collins,Other,"So Good!!!. If you loved the Hunger Games, you...",so good if you loved the hunger games you have...,good loved hunger games read couldnt put loved...


In [None]:
print(df.columns.tolist())


In [19]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already present
nltk.download('stopwords')

# Define stopword set
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    words = text.split()
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)

# Apply to cleaned text
df['clean_text_nostop'] = df['clean_text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text_nostop'])
y = df['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


KeyError: 'label'

In [18]:
# Save cleaned data
df[['clean_text', 'label']].dropna().to_csv('cleaned_reviews.csv', index=False)

KeyError: "['label'] not in index"

In [None]:
# View the first few rows and header of the cleaned CSV
df_cleaned = pd.read_csv('cleaned_reviews.csv')
print("\nCleaned Reviews Header and first few rows:")
print(df_cleaned.head())

In [None]:
import matplotlib.pyplot as plt

df_cleaned['label'].value_counts().plot(kind='bar', color=['red', 'green'])
plt.xticks([0, 1], ['Negative', 'Positive'], rotation=0)
plt.title('Sentiment Distribution')
plt.ylabel('Count')
plt.show()


Representation - Tokenization

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch

# Load cleaned data
df = pd.read_csv('cleaned_reviews.csv')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, return_tensors="pt")

train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Save for next step
torch.save((train_encodings, train_labels), 'train_data.pt')
torch.save((test_encodings, test_labels), 'test_data.pt')


3- Pretrained_Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load BERT for binary classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


# 4. **Optimization**

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification


In [None]:

# Set random seed for reproducibility
torch.manual_seed(42)


In [None]:

# Load data (ensure weights_only=False for PyTorch 2.6+)
train_encodings, train_labels = torch.load('train_data.pt', weights_only=False)
test_encodings, test_labels = torch.load('test_data.pt', weights_only=False)


In [None]:
# Dataset class
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
    def __len__(self):
        return len(self.labels)



In [None]:
# Create datasets and loaders
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)



In [None]:
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

# Load tokenized data (make sure you saved it with weights_only=False)
train_encodings, train_labels = torch.load('train_data.pt', weights_only=False)
test_encodings, test_labels = torch.load('test_data.pt', weights_only=False)

# Define custom dataset
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

# ‚úÖ Reduce batch size to avoid memory issues
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)


In [None]:
# Evaluation
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        preds.extend(predictions.cpu().numpy())
        true_labels.extend

## 5. ***deployment***

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Load trained weights if saved
# model.load_state_dict(torch.load("model.pt"))  # Optional

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
