In [19]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# One-time setup, intentionally left disabled: creates the project folder on Drive.
# import os
# os.makedirs('/content/drive/MyDrive/sentiment_project', exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

In [21]:
# Load the dataset | by default only the first 2,000,000 lines are read
def load_json_lines_to_df(filepath: str, max_lines=2000000):
    """Read a JSON Lines file into a DataFrame, one row per decoded line.

    Args:
        filepath: Path to a text file holding one JSON document per line.
        max_lines: Maximum number of physical lines to consume from the top
            of the file. Blank and malformed lines count towards this limit.
            Pass ``None`` to read the whole file.

    Returns:
        A ``pandas.DataFrame`` built from the lines that decoded
        successfully (empty if none did).

    Malformed lines are reported on stdout and skipped; blank lines are
    skipped silently.
    """
    records = []
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # default text encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if max_lines is not None and i >= max_lines:
                break
            payload = line.strip()
            if not payload:
                # An empty line is not a record; do not report it as an error.
                continue
            try:
                records.append(json.loads(payload))
            except json.JSONDecodeError as e:
                print(f"Error decoding line {i}: {e}")
    return pd.DataFrame(records)

In [22]:
# Get the sentiment from the rating (rating-based sentiment analysis)
def get_sentiment(score):
    """Map a numeric rating to a sentiment label.

    Args:
        score: Numeric rating.

    Returns:
        'positive' when ``score >= 4``, 'neutral' when ``3 <= score < 4``,
        and 'negative' otherwise. A NaN score compares false against both
        thresholds and therefore also yields 'negative'.
    """
    if score >= 4:
        return 'positive'
    elif score >= 3:
        # Range check rather than equality, so a fractional rating such as
        # 3.5 lands in the neutral band instead of falling through to
        # 'negative'.
        return 'neutral'
    else:
        return 'negative'

In [23]:
# Built once when the cell runs, instead of once per cleaned row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


# Clean : lower case only, no punctuation, no numbers, no multiple space
def clean_text(text):
    """Normalise a piece of text for modelling.

    Steps, in order: lower-case, delete every character listed in
    ``string.punctuation``, delete runs of digits, collapse whitespace runs
    to a single space and trim both ends.

    Args:
        text: The string to clean. ``None`` and float NaN are treated as
            missing text.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Missing values (None, or NaN which is the only float unequal to
    # itself) have no text to clean; return an empty string rather than
    # failing on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()
    return text

In [25]:
# Load the reviews from Drive. max_lines is not passed, so the loader's
# default line cap applies.
df=load_json_lines_to_df("/content/drive/MyDrive/sentiment_project/Software.jsonl")

In [26]:
# Structural overview (row count, columns, dtypes, memory use) followed by a
# sample of rows. A notebook cell only renders its last expression, so bare
# `df.describe()` / `df.shape` lines in the middle of the cell displayed
# nothing and only cost time; the shape is already part of the info() output
# and summary statistics are shown in their own cell.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   rating             float64
 1   title              object 
 2   text               object 
 3   images             object 
 4   asin               object 
 5   parent_asin        object 
 6   user_id            object 
 7   timestamp          int64  
 8   helpful_vote       int64  
 9   verified_purchase  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 139.2+ MB


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,0,False
1,5.0,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,0,True
2,5.0,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1362399267000,0,True
3,4.0,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,1561061428662,0,True
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1418257196000,0,True


In [27]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


In [28]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,rating,timestamp,helpful_vote
count,2000000.0,2000000.0,2000000.0
mean,3.907688,1483010000000.0,4.599215
std,1.461368,89451620000.0,30.3279
min,1.0,943330900000.0,0.0
25%,3.0,1419959000000.0,0.0
50%,5.0,1473689000000.0,0.0
75%,5.0,1552263000000.0,2.0
max,5.0,1681509000000.0,6178.0


In [29]:
# Derive the sentiment label of every review from its rating.
df['label'] = df['rating'].map(get_sentiment)

In [30]:
# Store the verified-purchase flag as an integer column instead of a boolean one.
df['verified_purchase'] = df['verified_purchase'].astype(int)

In [31]:
# Join title and review body into a single text field. Missing values are
# replaced by '' first so the concatenation never produces NaN, and the final
# strip removes the separator space left over when one side is empty.
df['full_text'] = (df['title'].fillna('') + ' ' + df['text'].fillna('')).str.strip()

In [32]:
# Normalise the combined text of every row with clean_text.
df['full_text'] = df['full_text'].map(clean_text)

In [33]:
# Keep only the rows whose text is present and not blank.
df = df[df['full_text'].notna() & (df['full_text'].str.strip() != '')]

In [34]:
# Remove the product/user identifier columns and the image list from the frame.
df = df.drop(labels=['asin', 'parent_asin', 'user_id', 'images'], axis='columns')

In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# LabelEncoder numbers classes in sorted order (inspect le.classes_ to confirm).
# NOTE(review): y_encoded is not referenced by any other cell visible here.
y_encoded = le.fit_transform(df['label'])  # 'negative', 'neutral', 'positive' → 0, 1, 2

In [37]:
# Re-check the frame after cleaning: remaining rows, columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1999670 entries, 0 to 1999999
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   rating             float64
 1   title              object 
 2   text               object 
 3   timestamp          int64  
 4   helpful_vote       int64  
 5   verified_purchase  int64  
 6   label              object 
 7   full_text          object 
dtypes: float64(1), int64(3), object(4)
memory usage: 137.3+ MB


In [38]:
# Write the processed frame to Drive as CSV; index=False leaves the row index out of the file.
df.to_csv('/content/drive/MyDrive/sentiment_project/processed_reviews.csv', index=False)
