In [None]:
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the preprocessed sentiment dataset, relative to the kernel's
# working directory. Forward slashes keep the path valid on Linux/macOS as
# well as Windows; a backslash-separated path is read as one literal file
# name outside Windows.
sentiment_datasets_processed_path = '../data/processed/sentiment_processed.csv'

# Read and clean in a single expression so re-running the cell always starts
# from the file on disk. Rows with a missing value in any column are dropped.
sentiment_datasets_processed_df = pd.read_csv(sentiment_datasets_processed_path).dropna()

## Splitting data

In [None]:
# Model input is the `text` column; the target is the `label` column.
X, Y = (sentiment_datasets_processed_df[column] for column in ('text', 'label'))

# Hold out 20% of the rows for evaluation. Stratifying on the labels keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)

## Encode labels

In [None]:
# Encode the raw labels as integer class ids.
#
# The raw labels are looked up again from `Y` through the index of each split
# rather than re-encoding `y_train` / `y_test` in place. This keeps the cell
# idempotent: a second run would otherwise fit the encoder on the
# already-encoded integers and silently replace `label_encoder.classes_`
# (used for the report's class names and saved alongside the model).
# Relies on the DataFrame index being unique, which holds for the default
# index produced by `pd.read_csv`.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(Y.loc[x_train.index])  # fit on training labels only
y_test = label_encoder.transform(Y.loc[x_test.index])

## Vectorization

In [None]:
# TF-IDF text features with the vocabulary capped at 10,000 terms.
tfidf = TfidfVectorizer(max_features=10_000)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer for the test texts so nothing is learned from the test split.
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

## Model selection / Training

In [None]:
# L2-regularised logistic regression trained with the lbfgs solver.
# `class_weight='balanced'` reweights classes inversely to their frequency,
# and `max_iter` is raised to 1000 to give the solver more iterations.
model_LR = LogisticRegression(
    penalty='l2',
    C=0.5,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
)

# Train on the TF-IDF features; the fitted estimator is the cell's output.
model_LR.fit(x_train_tfidf, y_train)

## Evaluation

In [None]:
# Predict encoded class ids for the held-out texts.
y_pred = model_LR.predict(x_test_tfidf)

# Overall accuracy on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy_score: {test_accuracy}")

# Per-class precision / recall / F1, with rows named after the encoder's classes.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(f"classification_report:\n {report_text}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Map encoded class id -> label name, read from the fitted encoder so the axis
# labels always match the encoding actually used (and the class names shown in
# the classification report) instead of a hard-coded three-class assumption.
label_mapping = {class_id: str(name) for class_id, name in enumerate(label_encoder.classes_)}
class_ids = list(label_mapping)
labels = list(label_mapping.values())

# Build the confusion matrix: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Draw the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


## Save the trained model

In [None]:
import joblib

In [None]:
# Persist the fitted vectorizer, label encoder and classifier together as one
# tuple so inference code can restore all three from a single file.
# The path is built with `Path` so it works on every OS (the previous
# backslash-separated string was only a valid path on Windows), and the target
# directory is created first so the dump cannot fail after training just
# because `../models` is missing.
# NOTE: the artifact is a pickle; only load it from trusted sources.
model_path = Path("..") / "models" / "sentiment_pipeline.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump((tfidf, label_encoder, model_LR), model_path)