In [29]:
#Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import json

### Loading Dataset

In [12]:
#Loading the final cleaned dataset before embeddings and PCA
df = pd.read_pickle("../data/processed/customer_data_final.pkl")

# Check dataset
print("Shape:", df.shape)
print(df[['clean_text', 'final_sentiment']].head())

Shape: (1345727, 6)
                                          clean_text final_sentiment
0                  and how do you propose we do that         neutral
1  i have sent several private messages and no on...        negative
2                      is the worst customer service        negative
3  you gonna magically change your connectivity f...         neutral
4          since i signed up with you....since day 1         neutral


### TF - IDF Vectorizer

In [16]:
#Defining TF-IDF Vectorizer
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', max_features=10000, ngram_range=(1, 2))

#Fiting and transforming the text
X_tfidf = tfidf.fit_transform(df["clean_text"])

#Getting the labels
y = df["final_sentiment"]

print("TF-IDF matrix shape:", X_tfidf.shape)
print("Example label:", y.iloc[0])

TF-IDF matrix shape: (1345727, 10000)
Example label: neutral


### Logistic Regression

In [32]:
#Logistic Regretion 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, stratify=y, random_state=42
)

#Training Logistic Regression
logreg_raw = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
logreg_raw.fit(X_train, y_train)

#Predicting
y_pred_logreg_raw = logreg_raw.predict(X_test)

#Evaluating
report_logreg_raw = classification_report(y_test, y_pred_logreg_raw, output_dict=True)
print(classification_report(y_test, y_pred_logreg_raw))

              precision    recall  f1-score   support

    negative       0.90      0.74      0.81    153371
     neutral       0.56      0.77      0.65     65563
    positive       0.71      0.76      0.73     50212

    accuracy                           0.75    269146
   macro avg       0.72      0.76      0.73    269146
weighted avg       0.78      0.75      0.76    269146



In [24]:
#Saving classification report
with open("../results/logreg/raw_text/logreg_tfidf_results_80_20.json", "w") as f:
    json.dump(report_logreg_raw, f, indent=4)

#Confusion matrix
labels = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test, y_pred_logreg_raw, labels=labels)

#Saving image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Logistic Regression TF-IDF Confusion Matrix (80/20 Split)")
plt.savefig("../results/logreg/raw_text/logreg_tfidf_confusion_matrix_80_20.png")
plt.close()

#Saving CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/logreg/raw_text/logreg_tfidf_confusion_matrix_80_20.csv")
print("TF-IDF Logistic Regression results saved.")

TF-IDF Logistic Regression results saved.


####  Logistic Regression with Raw Text Features
The fact that the Logistic Regression technique was also effectively leveraged with raw text instead of PCA reduced embedding also proved to be extremely helpful. As demonstrated, the model had better and more reliable accuracy levels across the folds, indicating that simpler delivery indicators such as bag-of-words or TF-IDF can be very competent when pared with interpretable models. This further proves that feature structure and preprocessing amount to more concern than model complexity in a few cases.

### Support Verctor Machines (SVM)

In [27]:
#SVM using SGD 80/20 split
svm_tfidf = SGDClassifier(loss='hinge', class_weight='balanced', max_iter=1000, tol=1e-3, random_state=42)

#Training
svm_tfidf.fit(X_train, y_train)

#Predicting
y_pred_svm_tfidf = svm_tfidf.predict(X_test)

#Evaluating
from sklearn.metrics import classification_report
report_svm_tfidf = classification_report(y_test, y_pred_svm_tfidf, output_dict=True)
print(classification_report(y_test, y_pred_svm_tfidf))

              precision    recall  f1-score   support

    negative       0.81      0.85      0.83    153371
     neutral       0.62      0.56      0.59     65563
    positive       0.72      0.71      0.71     50212

    accuracy                           0.75    269146
   macro avg       0.72      0.70      0.71    269146
weighted avg       0.75      0.75      0.75    269146



In [28]:
#Saving classification report
with open("../results/svm/raw_text/svm_tfidf_results_80_20.json", "w") as f:
    json.dump(report_svm_tfidf, f, indent=4)

#Confusion matrix
cm = confusion_matrix(y_test, y_pred_svm_tfidf, labels=labels)

#Saving image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("SVM TF-IDF Confusion Matrix (80/20 Split)")
plt.savefig("../results/svm/raw_text/svm_tfidf_confusion_matrix_80_20.png")
plt.close()

#Saving CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/svm/raw_text/svm_tfidf_confusion_matrix_80_20.csv")
print("TF-IDF SVM results saved.")

TF-IDF SVM results saved.


### SVM with Raw Text Features
The raw texts on the vectors features gave significantly better outcomes than reduced BERT embeddings on PCA. This demonstrates the significance of having feature format on the linear models such as SVM, the refinement indicates that although embeddings hold a lot of deep semantic information, they should not be organised in such a manner that does not coincide with the capabilities of the model. The findings suggest that the classical models are capable of a competitive performance in text classification tasks as long as they are vectorised properly as an exemple  using TF-IDF vectoriser.

### Random Forest

In [30]:
#Random Forest 80/20 split
rf_tfidf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42, n_jobs=-1)

#Training
rf_tfidf.fit(X_train, y_train)

#Predicting
y_pred_rf_tfidf = rf_tfidf.predict(X_test)

#Evaluating
report_rf_tfidf = classification_report(y_test, y_pred_rf_tfidf, output_dict=True)
print(classification_report(y_test, y_pred_rf_tfidf))

              precision    recall  f1-score   support

    negative       0.80      0.86      0.83    153371
     neutral       0.63      0.55      0.59     65563
    positive       0.75      0.68      0.71     50212

    accuracy                           0.75    269146
   macro avg       0.73      0.70      0.71    269146
weighted avg       0.75      0.75      0.75    269146



In [31]:
#Saving classification report
with open("../results/rf/raw_text/rf_tfidf_results_80_20.json", "w") as f:
    json.dump(report_rf_tfidf, f, indent=4)

#Confusion matrix
cm = confusion_matrix(y_test, y_pred_rf_tfidf, labels=labels)

#Saving image
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Random Forest TF-IDF Confusion Matrix (80/20 Split)")
plt.savefig("../results/rf/raw_text/rf_tfidf_confusion_matrix_80_20.png")
plt.close()

#Saving CSV
pd.DataFrame(cm, index=labels, columns=labels).to_csv("../results/rf/raw_text/rf_tfidf_confusion_matrix_80_20.csv")
print("TF-IDF Random Forest results saved.")

TF-IDF Random Forest results saved.


#### Random Forest with Raw Text Features
Random Forest gave the best result in terms of raw text features amongst the traditional models. The non-linear aspect of the model was useful in grasping the pattern and interaction among the frequencies of words better than it would do with the PCA compressed features. This further supports the notion that good vectorisation is capable of providing simpler models with much needed signal that could translate to good performances when dealing with raw texts.

### Conclusion
In conclusion this notebook thus illustrated a sharp increase in the performance of all the traditional Machine Learning models when the model was trained on the vectorised raw text features rather than the PCA reduced embeddings. This arrangement worked much more effectively with SVM, Logistic Regression and, more importantly, Random Forest. The outcome confirms the decision to use direct text vectorisation of the models which are not based on sequence data. This was also a crucial reference point, demonstrating that conventional ML models will not die out and indeed any model can be effective with structured input features in place.