In [None]:
!pip install langdetect

from pyspark.sql import SparkSession
from transformers import pipeline
import json
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from langdetect import detect, DetectorFactory
import numpy as np
import torch
import pandas as pd

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=794e28e051414e678c36c5c8c5bdb8f6ebf27a280e948c36017dff044f6978bd
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
# Fix the langdetect seed so language detection is reproducible across runs.
DetectorFactory.seed = 0

# Read the raw review records from disk.
with open('bus_reviews.json', mode='r', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Keep only the two fields used downstream and give them snake_case names.
column_map = {'Bus Name': 'bus_name', 'Comment': 'comment'}
df = pd.DataFrame(data)
df = df[list(column_map)].rename(columns=column_map)

# Function to detect language
def detect_language(comment):
    """Return the language code detected for ``comment``, or ``None``.

    Args:
        comment: The text to classify. Non-string values (for example ``None``
            or ``NaN``) and blank strings are not passed to the detector.

    Returns:
        Whatever ``detect`` returns for the text, or ``None`` when the input
        is unusable or detection raises an error.
    """
    # Missing or blank comments carry no detectable language; skip the detector.
    if not isinstance(comment, str) or not comment.strip():
        return None
    try:
        return detect(comment)
    except Exception:
        # Best-effort detection: a failure on one comment must not abort the
        # whole pass. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate so the run can be stopped.
        return None

# Split data into English and Vietnamese comments.
# Detect each comment's language once and reuse the result for both filters,
# instead of running the detector over the whole frame twice.
comment_lang = df['comment'].apply(detect_language)
df_vi = df[comment_lang == 'vi'].reset_index(drop=True)
df_en = df[comment_lang == 'en'].reset_index(drop=True)

Number of Vietnamese comments: 6713
Number of English comments: 611


In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Bus names and comments of the English-language reviews. Both Series are taken
# from df_en, so position i in one corresponds to position i in the other.
bus_names = df_en['bus_name']
sentences = df_en['comment']

# Load the tokenizer and the sequence-classification model
model_path = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move the model to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: keep the model in evaluation mode

# Number of comments scored per forward pass
batch_size = 16  # adjust to the available GPU memory

# Output-column index of each class; loop-invariant, so look them up once.
pos_idx = model.config.label2id['POSITIVE']
neg_idx = model.config.label2id['NEGATIVE']
# 'NEUTRAL' is not looked up: the original had that output commented out.

# One record per scored comment
results = []

# Score the comments batch by batch
for start_idx in range(0, len(sentences), batch_size):
    # .iloc slices strictly by position, independent of the index labels.
    batch_bus_names = bus_names.iloc[start_idx:start_idx + batch_size].tolist()
    batch_sentences = sentences.iloc[start_idx:start_idx + batch_size].tolist()

    # Tokenize the batch and move the tensors to the model's device
    inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Predict: softmax over the logits gives per-class probabilities
    with torch.no_grad():
        outputs = model(**inputs)
        scores = outputs.logits.softmax(dim=-1).cpu().numpy()

    # Append one record per comment in this batch
    for bus_name, sentence, probs in zip(batch_bus_names, batch_sentences, scores):
        results.append({
            'Bus_Name': bus_name,
            'Comment': sentence,
            # Convert to a built-in float before rounding so the stored value is
            # a clean 4-decimal number rather than a rounded NumPy float32, whose
            # representation can regain extra digits if widened later.
            'POS': round(float(probs[pos_idx]), 4),
            'NEG': round(float(probs[neg_idx]), 4)
        })

    # Release cached GPU memory; only meaningful when running on CUDA
    if device.type == "cuda":
        torch.cuda.empty_cache()

# Build the result frame; explicit columns keep the CSV header even when there
# are no English comments at all.
result_df = pd.DataFrame(results, columns=['Bus_Name', 'Comment', 'POS', 'NEG'])

# Save to CSV
result_df.to_csv("sentiment_en_results.csv", index=False, encoding="utf-8-sig")

print("Đã lưu kết quả vào sentiment_en_results.csv")

Đã lưu kết quả vào sentiment_en_results.csv
