# Import Libraries

In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import  TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report


# Load the dataset

In [2]:
# Read the preprocessed restaurant-review dataset (path is relative to this
# notebook's working directory) and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index included -- confirm whether that is intended.
dataset_path = "../datasets/training_dataset/Preprocessed_Restuarant_Dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Review_ID,Sentence_ID,Text,Token_Text,Opinion_Target,Opinion_Category,Opinion_Polarity
0,1004293,1004293:0,judging from previous posts this used to be a ...,judging previous post used good place longer,place,RESTAURANT#GENERAL,negative
1,1004293,1004293:1,we there were four of us arrived at noon the p...,four arrived noon place empty staff acted like...,staff,SERVICE#GENERAL,negative
2,1004293,1004293:2,they never brought us complimentary noodles ig...,never brought complimentary noodle ignored rep...,service,SERVICE#GENERAL,negative
3,1004293,1004293:3,the food was lousy too sweet or too salty and ...,food lousy sweet salty portion tiny,food,FOOD#QUALITY,negative
4,1004293,1004293:3,the food was lousy too sweet or too salty and ...,food lousy sweet salty portion tiny,portions,FOOD#STYLE_OPTIONS,negative


# Split the data into training and testing sets

In [3]:
# Model input is the tokenised review text; the target is the polarity label.
X, y_polarity = df['Token_Text'], df['Opinion_Polarity']

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_polarity_train, y_polarity_test = train_test_split(
    X,
    y_polarity,
    test_size=0.2,
    random_state=42,
)

# Text Vectorization techniques

In [4]:
# Text representations to compare, as (display label, unfitted vectorizer)
# pairs. Both drop English stop words and cap the vocabulary at 5000 terms.
vectorization_techniques = [
    (
        "TF-IDF",
        TfidfVectorizer(stop_words='english', max_features=5000, sublinear_tf=True),
    ),
    (
        "Count Vectorization (BoW)",
        CountVectorizer(stop_words='english', max_features=5000),
    ),
]

# ML Classifiers

In [5]:
# Candidate classifiers, keyed by the display name used in the results table.
#
# Every estimator whose training involves randomness is given a fixed
# random_state so that Restart & Run All reproduces the same accuracies.
# Random Forest and Gradient Boosting were previously unseeded, so their
# scores could change from run to run; they now use random_state=0 like the
# other seeded estimators here. Their accuracies may therefore differ slightly
# from results recorded with the unseeded versions.
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "Logistic Regression": LogisticRegression(random_state=0),
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=0),
    "Bagging": BaggingClassifier(n_estimators=50, random_state=0),
    "Extra Trees": ExtraTreesClassifier(n_estimators=50, criterion="entropy", random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0
    ),
}

# Comparing different classifiers with different vectorization techniques to predict Opinion_Polarity

In [6]:
# Benchmark every (vectorization technique, classifier) pairing on the
# Opinion_Polarity task and tabulate the accuracy on the held-out test split.

results = []
for technique_name, vectorizer in vectorization_techniques:
    # Fit the vectorizer on the training split only, then apply that fitted
    # vectorizer to the test split.
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    for classifier_name, classifier in classifiers.items():
        # The same estimator objects are refit for each vectorization technique.
        classifier.fit(X_train_vectorized, y_polarity_train)
        y_polarity_pred = classifier.predict(X_test_vectorized)
        polarity_accuracy = accuracy_score(y_polarity_test, y_polarity_pred)

        # Keeps a handle on the SVM estimator. After the loop it refers to the
        # SVM as fitted on the last technique in the list.
        # NOTE(review): `vectorc` is not referenced again in the cells shown here.
        if classifier_name == "Support Vector Machine":
            vectorc = classifier

        results.append({
            "Technique": technique_name,
            "Classifier": classifier_name,
            "Accuracy": polarity_accuracy,
        })

# Long-format results: one row per (technique, classifier) combination.
results_df = pd.DataFrame(results)

# Wide table: one row per classifier, one column per vectorization technique.
pivot_table = results_df.pivot_table(
    values='Accuracy',
    index='Classifier',
    columns='Technique',
)

# Any combination without a score is shown as "N/A".
pivot_table = pivot_table.fillna("N/A")

# Display the result table
print("Results Table:\n")
print(pivot_table)


Results Table:

Technique               Count Vectorization (BoW)  TF-IDF
Classifier                                               
AdaBoost                                   0.7775  0.7725
Bagging                                    0.8400  0.8325
Decision Tree                              0.8625  0.8500
Extra Trees                                0.8725  0.8875
Gradient Boosting                          0.8025  0.8050
K-Nearest Neighbors                        0.7125  0.5525
Logistic Regression                        0.8800  0.8675
Naive Bayes                                0.8600  0.8675
Random Forest                              0.8450  0.8375
Support Vector Machine                     0.8875  0.8950


# Training an SVM classifier with TF-IDF vectorization

In [7]:
# Build a dedicated TF-IDF vectorizer for the final sentiment model: English
# stop words removed, vocabulary capped at 5000 terms, sublinear term-frequency
# scaling enabled.
tfidf_vectorizer_SA = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    sublinear_tf=True,
)

# Fit on the training split only; the test split is transformed with that
# already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer_SA.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer_SA.transform(X_test)

In [8]:
# Train an SVM classifier (SVC constructed with no arguments) on the TF-IDF
# training features to predict the opinion polarity labels.
svm_sentiment_classifier = SVC()
svm_sentiment_classifier.fit(X_train_tfidf, y_polarity_train)

# Save the trained TF-IDF vectorizer and SVM model

In [9]:
# Persist the fitted TF-IDF vectorizer and the trained SVM model as pickle
# files. Both are saved because the model was trained on features produced by
# this specific fitted vectorizer.
# `pickle` and `Path` come from the notebook's top import cell.
models_dir = Path("../models")

# Create the output directory first. Without it, open(..., "wb") raises
# FileNotFoundError when ../models does not exist yet.
models_dir.mkdir(parents=True, exist_ok=True)

# Save the trained TF-IDF vectorizer
with open(models_dir / "tfidf_vectorizer_SA.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer_SA, f)

# Save the trained SVM model
with open(models_dir / "svm_sentiment_classifier.pkl", "wb") as f:
    pickle.dump(svm_sentiment_classifier, f)

# Predicting Opinion Polarity

In [10]:
# Predict polarity labels for the held-out test split using the SVM trained on
# TF-IDF features (the highest-accuracy combination in the results table).
y_polarity_pred = svm_sentiment_classifier.predict(X_test_tfidf)

# Opinion Polarity Classification Report:

In [11]:

# Overall accuracy of the SVM predictions against the true test labels.
polarity_accuracy = accuracy_score(y_polarity_test, y_polarity_pred)
print(f"Opinion Polarity Accuracy : {polarity_accuracy}")

# Per-class precision, recall, F1-score and support.
polarity_report = classification_report(y_polarity_test, y_polarity_pred)
print("\nOpinion Polarity Classification Report:")
print(polarity_report)


Opinion Polarity Accuracy : 0.895

Opinion Polarity Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.87      0.86       150
    positive       0.92      0.91      0.92       250

    accuracy                           0.90       400
   macro avg       0.89      0.89      0.89       400
weighted avg       0.90      0.90      0.90       400

