Thêm vào trong file của thầy chạy nhé

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

def extract_features_scaled(tweet, freqs, scaler=None, fit_scaler=False):
    """
    Lấy đặc trưng cho 1 tweet và scale (chỉ scale cột 1 & 2).
    """
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1  # bias term

    for word in word_l:
        if (word, 1) in freqs:
            x[0, 1] += freqs[(word, 1)]
        if (word, 0) in freqs:
            x[0, 2] += freqs[(word, 0)]

    if scaler is not None:
        features = x[:, 1:3]  # chỉ scale 2 cột pos, neg
        if fit_scaler:
            scaled = scaler.fit_transform(features)
        else:
            scaled = scaler.transform(features)
        x[:, 1:3] = scaled

    return x


In [None]:
# --- STANDARD SCALER ---
print("=== Training with StandardScaler ===")

scaler_std = StandardScaler()

# Build X_train_scaled
X_train_std = np.zeros((len(train_x), 3))
for i, tweet in enumerate(train_x):
    X_train_std[i, :] = extract_features_scaled(tweet, freqs, scaler_std, fit_scaler=(i == 0))

# Train logistic regression
J_std, w_std = gradient_descent_logistic(X_train_std, train_y, np.zeros((3, 1)), 1e-7, 10000)

# Build X_test_scaled
X_test_std = np.zeros((len(test_x), 3))
for i, tweet in enumerate(test_x):
    X_test_std[i, :] = extract_features_scaled(tweet, freqs, scaler_std, fit_scaler=False)

# Predict & evaluate
y_pred_std = (sigmoid(np.dot(X_test_std, w_std)) > 0.5).astype(int)
accuracy_std = np.mean(y_pred_std == np.squeeze(test_y))

print(f"Cost (StandardScaler): {J_std:.6f}")
print(f"Accuracy (StandardScaler): {accuracy_std:.4f}")


In [None]:
# --- MIN-MAX SCALER ---
print("\n=== Training with MinMaxScaler ===")

scaler_mm = MinMaxScaler(feature_range=(0, 1))

# Build X_train_scaled
X_train_mm = np.zeros((len(train_x), 3))
for i, tweet in enumerate(train_x):
    X_train_mm[i, :] = extract_features_scaled(tweet, freqs, scaler_mm, fit_scaler=(i == 0))

# Train logistic regression
J_mm, w_mm = gradient_descent_logistic(X_train_mm, train_y, np.zeros((3, 1)), 1e-7, 10000)

# Build X_test_scaled
X_test_mm = np.zeros((len(test_x), 3))
for i, tweet in enumerate(test_x):
    X_test_mm[i, :] = extract_features_scaled(tweet, freqs, scaler_mm, fit_scaler=False)

# Predict & evaluate
y_pred_mm = (sigmoid(np.dot(X_test_mm, w_mm)) > 0.5).astype(int)
accuracy_mm = np.mean(y_pred_mm == np.squeeze(test_y))

print(f"Cost (MinMaxScaler): {J_mm:.6f}")
print(f"Accuracy (MinMaxScaler): {accuracy_mm:.4f}")


Scaling helps features have the same scale, avoiding gradient descent moving too fast or too slow on each dimension → helping the model converge more stably.

StandardScaler is often suitable when the data has a near-normal distribution (mean ≈ center, not many outliers).

MinMaxScaler is good when we want to keep the data distribution in [0, 1], but is easily affected by outliers (pull max/min far away).

For this sentiment analysis problem, StandardScaler gives the best results, because the frequency of occurrence from the word is almost normally distributed and does not have too large outliers.