In [8]:
from google.colab import drive
drive.mount('/content/drive')
# import os
# os.makedirs('/content/drive/MyDrive/sentiment_project', exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

In [10]:
# Load the dataset  | we will start with 1000000
def load_json_lines_to_df(filepath: str, max_lines=1000000):
    data = []
    with open(filepath, 'r') as file:
        for i, line in enumerate(file):
            if max_lines is not None and i >= max_lines:
                break
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error decoding line {i}: {e}")
    return pd.DataFrame(data)

In [11]:
# Get the sentiment from the rating ( rating-based sentiment analysis)
def get_sentiment(score):
    if score >= 4:
        return 'positive'
    elif score == 3:
        return 'neutral'
    else:
        return 'negative'

In [12]:
# Clean : lower case only, no punctuation, no numbers, no multiple space
def clean_text(text):
    if not isinstance(text, str) or text.strip() == '':
        return None
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [13]:
df=load_json_lines_to_df("/content/drive/MyDrive/sentiment_project/Software.jsonl")

In [14]:
df.info()
df.describe()
df.shape
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   rating             1000000 non-null  float64
 1   title              1000000 non-null  object 
 2   text               1000000 non-null  object 
 3   images             1000000 non-null  object 
 4   asin               1000000 non-null  object 
 5   parent_asin        1000000 non-null  object 
 6   user_id            1000000 non-null  object 
 7   timestamp          1000000 non-null  int64  
 8   helpful_vote       1000000 non-null  int64  
 9   verified_purchase  1000000 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 69.6+ MB


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,0,False
1,5.0,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,0,True
2,5.0,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1362399267000,0,True
3,4.0,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,1561061428662,0,True
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1418257196000,0,True


In [15]:
df.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


In [None]:
df.describe()

Unnamed: 0,rating,timestamp,helpful_vote
count,2000000.0,2000000.0,2000000.0
mean,3.907688,1483010000000.0,4.599215
std,1.461368,89451620000.0,30.3279
min,1.0,943330900000.0,0.0
25%,3.0,1419959000000.0,0.0
50%,5.0,1473689000000.0,0.0
75%,5.0,1552263000000.0,2.0
max,5.0,1681509000000.0,6178.0


In [16]:
df['label'] = df['rating'].apply(get_sentiment)

In [17]:
df['verified_purchase'] = df['verified_purchase'].astype(int)

In [18]:
df['full_text'] = (df['title'].fillna('') + ' ' + df['text'].fillna('')).str.strip()

In [19]:
df['full_text'] = df['full_text'].apply(clean_text)

In [20]:
df = df[df['full_text'].notna()]
df = df[df['full_text'].str.strip() != '']

In [21]:
df = df.drop(columns=['asin', 'parent_asin', 'user_id', 'images'])

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(df['label'])

In [24]:
le.classes_  # negative : 0, neutral : 1, positive : 2

array(['negative', 'neutral', 'positive'], dtype=object)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 999795 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   rating             999795 non-null  float64
 1   title              999795 non-null  object 
 2   text               999795 non-null  object 
 3   timestamp          999795 non-null  int64  
 4   helpful_vote       999795 non-null  int64  
 5   verified_purchase  999795 non-null  int64  
 6   label              999795 non-null  object 
 7   full_text          999795 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 68.7+ MB


In [26]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import logging
from tqdm import tqdm
import torch

# Suppress warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

# Check for GPU
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

# Load model and tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create classifier pipeline
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device, truncation=True)

# Prepare text
texts = df['full_text'].astype(str).tolist()
batch_size = 64  # adjust based on GPU RAM
predictions = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i+batch_size]
    try:
        results = classifier(batch, truncation=True)
        labels = [r['label'] for r in results]
        predictions.extend(labels)
    except Exception as e:
        print(f"Error at batch {i}: {e}")
        predictions.extend([None] * len(batch))

Using device: GPU


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 15622/15622 [1:23:35<00:00,  3.11it/s]


In [28]:
df['sentiment_bert'] = predictions

In [29]:
df.head(1000000)

Unnamed: 0,rating,title,text,timestamp,helpful_vote,verified_purchase,label,full_text,sentiment_bert
0,1.0,malware,mcaffee IS malware,1562182632076,0,0,negative,malware mcaffee is malware,NEGATIVE
1,5.0,Lots of Fun,I love playing tapped out because it is fun to...,1424120336000,0,1,positive,lots of fun i love playing tapped out because ...,POSITIVE
2,5.0,Light Up The Dark,I love this flashlight app! It really illumin...,1362399267000,0,1,positive,light up the dark i love this flashlight app i...,POSITIVE
3,4.0,Fun game,One of my favorite games,1561061428662,0,1,positive,fun game one of my favorite games,POSITIVE
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,1418257196000,0,1,positive,i am not that good at it but my kids are cute ...,POSITIVE
...,...,...,...,...,...,...,...,...,...
999995,5.0,Amazing,"This game is so fun, I love it.",1434671221000,0,1,positive,amazing this game is so fun i love it,POSITIVE
999996,1.0,terrible in so many ways! :(,"My little sister got this game, so I decided t...",1420216104000,17,1,negative,terrible in so many ways my little sister got ...,NEGATIVE
999997,1.0,glitches your worlds,"This app is pretty great, but sometimes it wil...",1420127748000,5,1,negative,glitches your worlds this app is pretty great ...,NEGATIVE
999998,2.0,great! unless you don't have Facebook,"So, I had a preteen and a toddler. I was on l...",1418903383000,0,1,negative,great unless you dont have facebook so i had a...,NEGATIVE


In [32]:
# Normalize your original labels to lowercase for consistency
df['label'] = df['label'].str.lower()

# Normalize BERT labels to match your original format
df['sentiment_bert'] = df['sentiment_bert'].str.lower()

In [34]:
df['label_match'] = df['label'] == df['sentiment_bert']

In [38]:
# View mismatches
mismatches = df[df['label_match'] == False]
print(f"Number of mismatches: {len(mismatches)}")

pd.set_option('display.max_colwidth', None)  # So full text is shown
mismatches[['full_text', 'label', 'sentiment_bert']]

Number of mismatches: 228164


Unnamed: 0,full_text,label,sentiment_bert
7,great antivirus product not sure what else can be said about norton products i have used them for years on all my desk and laptop computers easy to install and protects my harddrives well not sure i see what one of the other reviewers stated about upsells from norton havent seen them yet even after almost weeks of daily use anyway if you are looking for antivirus software that can be easily installed on three of your devices this is for you,positive,negative
11,great product i had a trial version of microsoft office which came with my laptop so i was in the market for the full blown version that would work with my vista system this product was easy to use and install and walked me throught the entire process i would highly recommend to anyone,positive,negative
14,mobileme upgrade again cheaper to buy the amazon upgrade versus paining apples high pricebr to be honest i wonder at times why i do this every year the features really arent any better each year i dont have a lot backed up but do have somebr i havent hung a website up so whybr keeps an email available to me my wife and daughter to communicate with yes i have gmail and hotmail but both have been spammed and taken over even w password changesbr so i pay the fee and upgrade thanks to amazon,positive,negative
19,silly penguin lot of fun but can be very frustrating only because im a moron times while playing the gamemade me laugh myself and try harderjust a fun simple game,positive,negative
24,reef rescue i like this game ok but wish there were less that you must purchase eh ill play until i get tired of the pay for this pay for that eh wish it could be the good old days and you could play a game for fun,positive,negative
...,...,...,...
999981,fun but too many ads i really enjoy this game but find the frequent ads extremely annoying i now get an ad after about every third game i also own this on another tablet and i dont get any ads on there a game i purchased should not interrupt play so often with unwanted and intrusive ads i may have to stop playing this game on this device,neutral,negative
999982,not a real youtube app i don’t think this is an actual youtube app it feels like the youtube website cramped into a app application it docent have a lot of the youtube app features like brining down the current video to brows for more videos you have to literally tap the back button it feels like a desktop version of the website honestly,neutral,negative
999985,u s a has some things that i would read,neutral,positive
999986,could be better this game is slow to load on my kindle it constantly freezes and reloads when you buy items through amazon wild card for example it reloads causing you to loss a game also the wild card doesnt always appear,neutral,negative




In [None]:
df.to_csv('/content/drive/MyDrive/sentiment_project/processed_reviews_1m.csv', index=False)
