# Import Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import  TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report


# Load the dataset

In [3]:

# Read the preprocessed restaurant-review CSV and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="datasets/training_dataset/Preprocessed_Restuarant_Dataset.csv"
)
df.head(5)

Unnamed: 0,Review_ID,Sentence_ID,Text,Token_Text,Opinion_Target,Opinion_Category,Opinion_Polarity
0,1004293,1004293:0,judging from previous posts this used to be a ...,judging previous post used good place longer,place,RESTAURANT#GENERAL,negative
1,1004293,1004293:1,we there were four of us arrived at noon the p...,four arrived noon place empty staff acted like...,staff,SERVICE#GENERAL,negative
2,1004293,1004293:2,they never brought us complimentary noodles ig...,never brought complimentary noodle ignored rep...,service,SERVICE#GENERAL,negative
3,1004293,1004293:3,the food was lousy too sweet or too salty and ...,food lousy sweet salty portion tiny,food,FOOD#QUALITY,negative
4,1004293,1004293:3,the food was lousy too sweet or too salty and ...,food lousy sweet salty portion tiny,portions,FOOD#STYLE_OPTIONS,negative


# Split the data into training and testing sets

In [4]:
# Input text is the opinion target; the label to predict is its opinion category.
X1, y_category = df['Opinion_Target'], df['Opinion_Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
(X1_train, X1_test,
 y_category_train, y_category_test) = train_test_split(
    X1, y_category, random_state=42, test_size=0.2
)


# Vectorization techniques

In [5]:
# (display name, vectorizer) pairs to benchmark, in evaluation order.
# Both vectorizers cap the vocabulary at 5000 terms and drop English stop words.
vectorization_techniques1 = list({
    "TF-IDF": TfidfVectorizer(
        stop_words='english', sublinear_tf=True, max_features=5000
    ),
    "Count Vectorization (BoW)": CountVectorizer(
        stop_words='english', max_features=5000
    ),
}.items())


# ML Classifiers

In [6]:
# Candidate classifiers to benchmark, keyed by the display name used in the
# results table.
#
# Logistic Regression and every tree-based / boosting ensemble below is seeded
# with random_state=0 so that a fresh "Restart & Run All" reproduces the same
# accuracies. Random Forest and Gradient Boosting were previously left unseeded,
# which made their rows in the results table vary from run to run.
classifiers1 = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "Logistic Regression": LogisticRegression(random_state=0),
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=0),
    "Bagging": BaggingClassifier(n_estimators=50, random_state=0),
    "Extra Trees": ExtraTreesClassifier(n_estimators=50, criterion="entropy", random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0
    ),
}


# Comparing different classifiers with different vectorization techniques to predict Opinion_Category

In [7]:

# Benchmark every (vectorization technique, classifier) pairing on the held-out split.
results = []
for technique_name, vectorizer in vectorization_techniques1:
    # Fit the vectorizer on the training text only, then project the test text
    # into the same feature space.
    X1_train_vectorized = vectorizer.fit_transform(X1_train)
    X1_test_vectorized = vectorizer.transform(X1_test)

    for classifier_name, classifier in classifiers1.items():
        # fit() returns the fitted estimator, so training and prediction chain.
        y_category_pred = classifier.fit(
            X1_train_vectorized, y_category_train
        ).predict(X1_test_vectorized)
        category_accuracy = accuracy_score(y_category_test, y_category_pred)

        # One record per pairing; the keys become the result columns.
        results.append({
            "Technique": technique_name,
            "Classifier": classifier_name,
            "Accuracy": category_accuracy
        })

# Long-format results: one row per (technique, classifier) pairing.
results_df = pd.DataFrame(results)

# Reshape to one row per classifier and one column per technique; any missing
# cell is shown as the "N/A" placeholder.
pivot_table = results_df.pivot_table(
    index='Classifier', columns='Technique', values='Accuracy'
).fillna("N/A")

# Print the heading, a blank line, then the table.
print("Results Table:\n", pivot_table, sep="\n")


Results Table:

Technique               Count Vectorization (BoW)  TF-IDF
Classifier                                               
AdaBoost                                   0.6000  0.6000
Bagging                                    0.7450  0.7525
Decision Tree                              0.7400  0.7450
Extra Trees                                0.7525  0.7575
Gradient Boosting                          0.7500  0.7575
K-Nearest Neighbors                        0.6475  0.7350
Logistic Regression                        0.7300  0.7300
Naive Bayes                                0.7300  0.7175
Random Forest                              0.7550  0.7525
Support Vector Machine                     0.7575  0.7600


# Training the SVM classifier with TF-IDF vectorization

In [8]:
# Suppress UserWarning-category warnings from this point on in the session.
import warnings
warnings.filterwarnings(action="ignore", category=UserWarning)

# TF-IDF features for the opinion-target text: the vectorizer is fitted on the
# training split only and then applied unchanged to the test split.
tfidf_vectorizer_AE = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_features=5000,
)
X1_train_tfidf = tfidf_vectorizer_AE.fit_transform(X1_train)
X1_test_tfidf = tfidf_vectorizer_AE.transform(X1_test)

In [9]:
# Fit a default-configured SVC on the TF-IDF training features; fit() returns
# the estimator itself, so construction and training are chained.
SVM_opinion_category_classifier = SVC().fit(X1_train_tfidf, y_category_train)
# Bare expression so the cell still displays the fitted estimator.
SVM_opinion_category_classifier


# Save the trained TF-IDF vectorizer and SVM model

In [11]:
import os
import pickle

# Create the output directory if it is missing; without this, open(..., "wb")
# raises FileNotFoundError on a fresh checkout where "pickle_files" does not exist.
os.makedirs("pickle_files", exist_ok=True)

# Persist the fitted TF-IDF vectorizer so the same vocabulary and weighting can
# be reused when the model is loaded later.
with open("pickle_files/tfidf_vectorizer_AE.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer_AE, f)

# Persist the trained SVM opinion-category classifier.
# NOTE: pickle files should only ever be loaded from trusted sources, since
# unpickling can execute arbitrary code.
with open("pickle_files/SVM_opinion_category_classifier.pkl", "wb") as f:
    pickle.dump(SVM_opinion_category_classifier, f)

# Predicting Opinion Category

In [12]:
# Predict an opinion category for each held-out sample from its TF-IDF features.
y_category_pred = SVM_opinion_category_classifier.predict(
    X1_test_tfidf
)

# Opinion Category Classification Report:

In [None]:
# Overall accuracy of the SVM predictions on the held-out split.
category_accuracy = accuracy_score(y_category_test, y_category_pred)

# Per-class precision / recall / F1. Some categories are never predicted by the
# model, which makes their precision undefined (0/0). zero_division=0 reports
# those scores as 0.0 explicitly instead of relying on the default behaviour of
# emitting an UndefinedMetricWarning and substituting 0.
category_report = classification_report(
    y_category_test, y_category_pred, zero_division=0
)

print(f"Opinion Category Accuracy : {category_accuracy}")
print("\nOpinion Category Classification Report:")
print(category_report)
   

Opinion Category Accuracy : 0.76

Opinion Category Classification Report:
                          precision    recall  f1-score   support

        AMBIENCE#GENERAL       0.97      0.64      0.77        44
           DRINKS#PRICES       0.00      0.00      0.00         4
          DRINKS#QUALITY       0.55      0.60      0.57        10
    DRINKS#STYLE_OPTIONS       1.00      0.38      0.55         8
             FOOD#PRICES       0.00      0.00      0.00        14
            FOOD#QUALITY       0.72      0.99      0.83       136
      FOOD#STYLE_OPTIONS       0.56      0.25      0.34        20
        LOCATION#GENERAL       1.00      0.80      0.89         5
      RESTAURANT#GENERAL       0.65      0.85      0.74        62
RESTAURANT#MISCELLANEOUS       0.50      0.07      0.12        14
       RESTAURANT#PRICES       0.00      0.00      0.00        10
         SERVICE#GENERAL       0.97      0.96      0.97        73

                accuracy                           0.76       400
