In [None]:
import pandas as pd
import gzip
import json

def get_data(path):
    """Load a dataset from ``path``, choosing the parser from the file suffix.

    * ``.csv``  -> parsed with ``pandas.read_csv``; a DataFrame is returned.
    * ``.gz``   -> read as gzip-compressed JSON Lines.
    * anything else -> read as plain-text JSON Lines.

    For both JSON Lines cases the return value is a list holding one decoded
    object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        OSError: if the file cannot be opened.
    """
    if path.endswith('.csv'):
        return pd.read_csv(path)

    # Both JSON Lines variants share one reader; only the opener differs.
    opener = gzip.open if path.endswith('.gz') else open

    # Decode explicitly as UTF-8 (tolerating a BOM) instead of relying on the
    # platform default encoding, and skip blank lines (e.g. a trailing newline)
    # which would otherwise make json.loads raise.
    with opener(path, 'rt', encoding='utf-8-sig') as f:
        return [json.loads(line) for line in f if line.strip()]

In [None]:
def classify_sentiment(text):
    """Classify ``text`` with the notebook-level ``pipe`` object.

    The pipeline is invoked with ``truncation=True`` and ``max_length=512``.
    Returns a ``(label, score)`` tuple taken from the first entry of the
    pipeline's result.
    """
    top = pipe(text, truncation=True, max_length=512)[0]
    return top['label'], top['score']

In [None]:
def check_sentiment_label(data_):
    """Reconcile the predicted sentiment label with the user's star rating.

    Args:
        data_: DataFrame with at least the columns ``reviewerID``, ``asin``,
            ``overall`` (numeric rating) and ``sentiment_label`` (one of
            ``'1 star'``, ``'2 stars'``, ..., ``'5 stars'``).

    Returns:
        A new DataFrame with the columns ``reviewerID``, ``asin``, ``overall``
        and ``sentiment_label_int``. ``sentiment_label_int`` is the numeric
        form of ``sentiment_label``; where it differs from ``overall`` by more
        than 2 it is replaced with 3. Labels missing from the mapping become
        NaN and are left as NaN.

    The input frame is not modified.
    """
    # Convert the textual label to its numeric star value.
    sentiment_map = {
        '5 stars': 5.0,
        '4 stars': 4.0,
        '3 stars': 3.0,
        '2 stars': 2.0,
        '1 star': 1.0
    }
    labels = data_['sentiment_label'].map(sentiment_map)

    # Work on an explicit copy so neither the caller's frame nor a column-subset
    # slice is written to.
    result = data_[['reviewerID', 'asin', 'overall']].copy()

    # Vectorised replacement for the per-row loop. Comparisons involving NaN are
    # False, so rows with a missing rating or label keep their value.
    disagrees = (result['overall'] - labels).abs() > 2
    result['sentiment_label_int'] = labels.mask(disagrees, 3.0)

    return result

In [None]:
# Load the raw review dumps. Neither path ends in '.csv' or '.gz', so get_data
# parses them as JSON Lines and returns plain Python lists (not DataFrames).
data_music = get_data('Digital_Music_5.json/Digital_Music_5.json')
data_movie = get_data('Movies_and_TV_5.json/Movies_and_TV_5.json')

In [None]:
# Turn the list of decoded music records into a DataFrame.
data_music = pd.DataFrame(data_music)

In [None]:
# Read the music ranking CSV; get_data returns a DataFrame for '.csv' paths.
# NOTE(review): a later cell writes './review_ranking.csv' — presumably this reloads that output; confirm.
data_extract_music = get_data('review_ranking.csv')

In [None]:
# Read movie sentiment batches 1-5 from CSV, in order, one DataFrame per file.
(
    data_movie_batch1,
    data_movie_batch2,
    data_movie_batch3,
    data_movie_batch4,
    data_movie_batch5,
) = map(get_data, [
    'sentiment_data_batch_1_200k.csv',
    'sentiment_data_batch_2_200k.csv',
    'sentiment_data_batch_3_200k.csv',
    'sentiment_data_batch_4_200k.csv',
    'sentiment_data_batch_5_200k.csv',
])

In [None]:
# Read movie sentiment batches 6-10 from CSV, in order, one DataFrame per file.
(
    data_movie_batch6,
    data_movie_batch7,
    data_movie_batch8,
    data_movie_batch9,
    data_movie_batch10,
) = map(get_data, [
    'sentiment_data_batch_6_200k.csv',
    'sentiment_data_batch_7_200k.csv',
    'sentiment_data_batch_8_200k.csv',
    'sentiment_data_batch_9_200k.csv',
    'sentiment_data_batch_10_200k.csv',
])

In [None]:
# Load the third movie sentiment file as a DataFrame.
# NOTE(review): absolute Kaggle input path — only resolves inside that environment.
data_movie_1m4_3rd = get_data('/kaggle/input/data-sentiment/complete_sentiment_data_1M_3rd.csv')

In [None]:
# get_data returns a plain list for the JSON Lines movie dump, and a list cannot
# be indexed with a list of column names, so build a DataFrame first
# (pd.DataFrame also accepts an existing DataFrame). Then keep only the columns
# used downstream and drop reviews that have no text.
data_movie = (
    pd.DataFrame(data_movie)[['reviewerID', 'asin', 'reviewText', 'summary', 'overall']]
    .dropna(subset=['reviewText'])
)

In [None]:
# Number of non-null review texts.
data_movie['reviewText'].count()

In [None]:
import torch
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
# 0 when CUDA is available, otherwise -1; this value is passed as `device`
# to the transformers pipeline created below.
device = 0 if torch.cuda.is_available() else -1

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline
# Text-classification pipeline used by classify_sentiment.
# NOTE(review): check_sentiment_label expects labels '1 star' .. '5 stars' —
# presumably what this model emits; confirm against the model card.
pipe = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment", device=device)

-----------------------------------------

In [None]:
# Review texts to score for the music dataset.
# NOTE(review): unlike data_movie, no dropna is applied to data_music here — confirm reviewText has no missing values.
texts_music = data_music['reviewText']

In [None]:
# Split the movie reviews positionally into three parts: the first `index` rows,
# the next `index` rows, and everything after that. Each part keeps both the
# text Series and the matching DataFrame slice.
index = 1000000
texts = data_movie['reviewText']

texts_1, df_1 = texts[: index], data_movie[: index]
texts_2, df_2 = texts[index: index*2], data_movie[index: index*2]
texts_3, df_3 = texts[index*2: ], data_movie[index*2: ]

-----------------------------------------

Data Music

In [None]:
# Score every music review in order, collecting the label and score of each.
sentiment_labels, sentiment_scores = [], []

for i, text in enumerate(texts_music):
    label, score = classify_sentiment(text)
    sentiment_labels.append(label)
    sentiment_scores.append(score)

    # Only report progress after every 50,000th review.
    if (i + 1) % 50000 != 0:
        continue
    print(f"Đã phân tích xong {i + 1} dữ liệu")

# Attach the predictions to the music frame as two new columns.
data_music['sentiment_label'] = sentiment_labels
data_music['sentiment_score'] = sentiment_scores

Data Movie

In [None]:
sentiment_labels = []
sentiment_scores = []

# Rows per CSV batch file written to disk.
chunk_size = 200000

for i, text in enumerate(texts_3):
    label, score = classify_sentiment(text)
    sentiment_labels.append(label)
    sentiment_scores.append(score)

    # Flush after every full chunk, and also after the very last row so that a
    # trailing partial chunk is written to disk instead of being dropped.
    if (i + 1) % chunk_size == 0 or (i + 1) == len(texts_3):
        print(f"Đã phân tích xong {i + 1} dữ liệu")

        # First row (positional) of the batch that ends at row i.
        start = (i // chunk_size) * chunk_size

        # Create a temporary DataFrame to hold the current batch
        temp_df = df_3.iloc[start:i + 1].copy()
        temp_df['sentiment_label'] = sentiment_labels[start:i + 1]
        temp_df['sentiment_score'] = sentiment_scores[start:i + 1]

        # Save the current batch to a CSV file; numbering for this part starts at 11.
        temp_df.to_csv(f'./sentiment_data_batch_{i // chunk_size + 11}.csv', index=False)

        print(f"Đã lưu {i + 1} dữ liệu hoàn tất")
        print("--------------------")
        print(" ")


# Attach the predictions to the complete frame for this part. `assign` builds a
# new DataFrame, so nothing is written into a slice of data_movie.
df_3 = df_3.assign(sentiment_label=sentiment_labels, sentiment_score=sentiment_scores)


print(f"Đã lưu toàn bộ dữ liệu hoàn tất")

-----------------------------------------

**Data Music**

In [None]:
# Reconcile predicted labels with the star ratings, then write the reduced
# frame (rebinding data_music to it) to CSV without the index column.
data_music = check_sentiment_label(data_music)
data_music.to_csv('./review_ranking.csv', index=False)

**Data Movie**

Data 1st

In [None]:
# Reconcile labels with ratings for batches 1-5, rebinding each name to its result.
(
    data_movie_batch1,
    data_movie_batch2,
    data_movie_batch3,
    data_movie_batch4,
    data_movie_batch5,
) = map(check_sentiment_label, [
    data_movie_batch1,
    data_movie_batch2,
    data_movie_batch3,
    data_movie_batch4,
    data_movie_batch5,
])

In [None]:
# Stack batches 1-5 into one frame with a fresh 0..n-1 index.
data_movie_1m = [data_movie_batch1, data_movie_batch2, data_movie_batch3, data_movie_batch4, data_movie_batch5]
data_movie_1m_1st = pd.concat(data_movie_1m, ignore_index=True)

In [None]:
# Write the first combined movie frame to CSV without the index column.
data_movie_1m_1st.to_csv('./data_movie_1m_1st.csv', index=False)

Data 2nd

In [None]:
# Reconcile labels with ratings for batches 6-10, rebinding each name to its result.
(
    data_movie_batch6,
    data_movie_batch7,
    data_movie_batch8,
    data_movie_batch9,
    data_movie_batch10,
) = map(check_sentiment_label, [
    data_movie_batch6,
    data_movie_batch7,
    data_movie_batch8,
    data_movie_batch9,
    data_movie_batch10,
])

In [None]:
# Stack batches 6-10 into one frame with a fresh 0..n-1 index.
data_movie_1m_2 = [data_movie_batch6, data_movie_batch7, data_movie_batch8, data_movie_batch9, data_movie_batch10]
data_movie_1m_2nd = pd.concat(data_movie_1m_2, ignore_index=True)

In [None]:
# Write the second combined movie frame to CSV without the index column.
data_movie_1m_2nd.to_csv('./data_movie_1m_2nd.csv', index=False)

Data 3rd

In [None]:
# Reconcile labels with ratings for the third movie file, then write the
# result to CSV without the index column.
data_movie_1m4_3rd = check_sentiment_label(data_movie_1m4_3rd)

data_movie_1m4_3rd.to_csv('./data_movie_1m4_3rd.csv', index=False)