In [2]:
from pyspark.sql import SparkSession
from transformers import pipeline
import json
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import torch
import pandas as pd

In [None]:
import json
import pandas as pd
from langdetect import detect, DetectorFactory

# Fix the detector's seed so repeated runs classify the same text the same way
DetectorFactory.seed = 0

# Read the raw review records from disk
with open('bus_reviews.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Keep only the two columns of interest and give them snake_case names
column_map = {'Bus Name': 'bus_name', 'Comment': 'comment'}
df = pd.DataFrame(data)[list(column_map)].rename(columns=column_map)

def detect_language(comment):
    """Return the language detected for ``comment``, or ``None`` if undetectable.

    Args:
        comment: The text to classify. Values that are not non-blank strings
            (``None``, NaN, numbers, empty or whitespace-only text) are
            treated as undetectable instead of being passed to the detector.

    Returns:
        The value returned by ``detect(comment)``, or ``None`` when the input
        is not usable text or the detector raises an error.
    """
    # Missing values in a DataFrame column arrive as None/NaN, not as strings.
    if not isinstance(comment, str) or not comment.strip():
        return None
    try:
        return detect(comment)
    except Exception:
        # Best-effort: a comment the detector cannot handle is simply skipped.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply can be interrupted.
        return None

# Detect the language of every comment exactly once and reuse the result for
# both masks; detection is the expensive step, so running it per mask would
# double the cost without changing the outcome.
comment_lang = df['comment'].apply(detect_language)

# Split data into Vietnamese and English comments
# (rows whose language is neither 'vi' nor 'en', or undetectable, are dropped)
df_vi = df[comment_lang == 'vi'].reset_index(drop=True)
df_en = df[comment_lang == 'en'].reset_index(drop=True)

# Extract bus names and comments as parallel lists
bus_names_vi = df_vi['bus_name'].tolist()
comments_vi = df_vi['comment'].tolist()

bus_names_en = df_en['bus_name'].tolist()
comments_en = df_en['comment'].tolist()

# Print results
print("Vietnamese Bus Names and Comments:")
for name, comment in zip(bus_names_vi, comments_vi):
    print(f"Bus Name: {name}, Comment: {comment}")

print("\nEnglish Bus Names and Comments:")
for name, comment in zip(bus_names_en, comments_en):
    print(f"Bus Name: {name}, Comment: {comment}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Score the Vietnamese comments; 'comments_vi' is expected to be a list of strings
sentences = comments_vi

# Load tokenizer and model
model_path = '5CD-AI/Vietnamese-Sentiment-visobert'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # make inference mode explicit (disables dropout)

# Sentences per forward pass. Processing the corpus in chunks keeps memory
# bounded instead of padding and scoring every comment in one giant batch.
batch_size = 32

# Perform inference batch by batch, collecting per-sentence class probabilities
batch_scores = []
with torch.no_grad():
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # Calling the tokenizer directly pads each batch to its own longest sequence
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)
        batch_scores.append(outputs.logits.softmax(dim=-1).cpu())

# 'scores' has one row per sentence and one column per label
if batch_scores:
    scores = torch.cat(batch_scores).numpy()
else:
    # No Vietnamese comments: keep 'scores' a valid, empty 2-D array
    scores = torch.empty((0, model.config.num_labels)).numpy()

# Process and print results, labels ordered from most to least probable
for i, sentence in enumerate(sentences):
    print(f"Sentence: {sentence}")
    ranking = scores[i].argsort()[::-1]
    for rank in ranking:
        label = model.config.id2label[rank]
        score = scores[i][rank]
        print(f"  {label}: {score:.4f}")