In [None]:
import pandas as pd

# Read the complete labeled review export, decoding the file as ISO-8859-1.
df_new = pd.read_csv(
    'Philippine_Business_TrustPilot_Reviews_Labeled.csv',
    encoding='ISO-8859-1',
)

# Show the column names of the loaded frame.
available_columns = df_new.columns.tolist()
print("Available columns in new dataset:", available_columns)

Available columns in new dataset: ['Business Name', 'Business Average Rating', 'Business Review Grade', 'User Review Title', 'User Review Body', 'User Review Rating', 'User Review Date', 'User Review Count', 'User  Country', 'User Review Status', 'User Review Month', 'User Review Year', 'Ground Label']


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load only the two columns this cell uses: the review text and its label.
# The path is relative, so the CSV is looked up in the working directory.
df = pd.read_csv(
    'Philippine_Business_TrustPilot_Reviews_Labeled.csv',
    usecols=['User Review Body', 'Ground Label'],
    encoding='ISO-8859-1',
)
print("Initial shape:", df.shape)

# Build 'content' from the review body alone (missing bodies become ''),
# and rename 'Ground Label' to 'label' for the rest of the code.
review_text = df['User Review Body'].fillna('').str.strip()
df = df.assign(content=review_text).rename(columns={'Ground Label': 'label'})

# Show the raw label distribution before anything is mapped or dropped.
print("Value counts of 'label' before mapping adjustment:")
print(df['label'].value_counts())

# 'Positive' -> 1, 'Negative' -> 0; any other label value maps to NaN.
label_codes = {'Positive': 1, 'Negative': 0}
df = df.assign(label=df['label'].map(label_codes))

# Keep rows that have a mapped label (0 or 1) and non-empty content.
has_valid_label = df['label'].isin([0, 1])
has_content = df['content'] != ''
df = df.loc[has_valid_label & has_content]

# The unmapped (NaN) rows are gone, so the label column can be cast to int.
df = df.assign(label=df['label'].astype(int))

print("Shape after cleaning:", df.shape)
print(df['label'].value_counts())

# Model input is the cleaned review text; the target is the 0/1 label.
X, y = df['content'], df['label']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF text features feeding a logistic-regression classifier.
pipeline_steps = [
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('clf', LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training portion only.
pipeline.fit(X_train, y_train)

# Score the held-out portion.
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Persist the fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(pipeline, 'Philippine_Business_TrustPilot_Reviews_Labeled.pkl')
print("Pipeline saved as 'Philippine_Business_TrustPilot_Reviews_Labeled.pkl'")

Initial shape: (10916, 2)
Value counts of 'label' before mapping adjustment:
label
Positive    7181
Negative    3517
Neutral      218
Name: count, dtype: int64
Shape after cleaning: (10698, 3)
label
1    7181
0    3517
Name: count, dtype: int64
Accuracy: 0.9490654205607477
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       704
           1       0.94      0.98      0.96      1436

    accuracy                           0.95      2140
   macro avg       0.95      0.93      0.94      2140
weighted avg       0.95      0.95      0.95      2140

Pipeline saved as 'Philippine_Business_TrustPilot_Reviews_Labeled.pkl'
