In [8]:
import pandas as pd

# Load your dataset
df = pd.read_csv('Reviews Data.csv')

# Assuming you have a 'PRODUCT_CATEGORY' column in your dataset
product_categories = df['PRODUCT_CATEGORY'].unique()

for category in product_categories:
    # Filter data for the current product category
    category_df = df[df['PRODUCT_CATEGORY'] == category]

    train_data = train_data.dropna()
    test_data = test_data.dropna()
    train_data = train_data.fillna("No review available")
    test_data = test_data.fillna("No review available")


    # Now, you can perform sentiment analysis on 'category_df'
    # Use the same steps as before for sentiment analysis, such as tokenization, vectorization, and classification.


In [12]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier
from textblob import TextBlob

# Load your dataset
# Replace 'your_dataset.csv' with the actual file path or URL of your dataset
df = pd.read_csv('Reviews Data.csv')

# Drop rows with missing values in 'REVIEW_CONTENT'
df = df.dropna(subset=['REVIEW_CONTENT'])

# Function to perform sentiment analysis for each product category
def perform_sentiment_analysis(category_df):
    # Assign labels based on sentiment polarity
    category_df['LABEL'] = pd.cut(category_df['REVIEW_CONTENT'].apply(lambda x: TextBlob(str(x)).sentiment.polarity),
                                  bins=[-float('inf'), -0.3, 0.1, 0.4, 0.7, float('inf')],
                                  labels=['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive'],
                                  include_lowest=True)

    # Split the dataset into training and testing sets
    train_data, test_data, train_labels, test_labels = train_test_split(
        category_df['REVIEW_CONTENT'], category_df['LABEL'], test_size=0.2, random_state=42
    )

    # Vectorize the text data using TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    train_vectors_tfidf = tfidf_vectorizer.fit_transform(train_data)
    test_vectors_tfidf = tfidf_vectorizer.transform(test_data)

    # Define classifiers
    naive_bayes_classifier = MultinomialNB(alpha=1.0)  # You can adjust the alpha value
    random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Create an ensemble of classifiers
    ensemble_classifier = VotingClassifier(estimators=[
        ('naive_bayes', naive_bayes_classifier),
        ('random_forest', random_forest_classifier)
    ], voting='hard')

    # Train the ensemble classifier
    ensemble_classifier.fit(train_vectors_tfidf, train_labels)

    # Make predictions on the test set
    predictions_ensemble = ensemble_classifier.predict(test_vectors_tfidf)

    # Evaluate the ensemble model
    accuracy_ensemble = accuracy_score(test_labels, predictions_ensemble)
    conf_matrix_ensemble = confusion_matrix(test_labels, predictions_ensemble)
    classification_rep_ensemble = classification_report(test_labels, predictions_ensemble)

    # Display the results
    print(f'Ensemble Model Accuracy: {accuracy_ensemble:.2f}')
    print(f'Confusion Matrix (Ensemble):\n{conf_matrix_ensemble}')
    print(f'Classification Report (Ensemble):\n{classification_rep_ensemble}')

    # Create a histogram for sentiment distribution with product category label
    fig = px.histogram(category_df, x='LABEL', title=f'Sentiment Distribution - {category_df["PRODUCT_CATEGORY"].iloc[0]}',
                       labels={'LABEL': 'Sentiment'})
    fig.show()

# Apply sentiment analysis for each product category
df.groupby('PRODUCT_CATEGORY').apply(perform_sentiment_analysis)


Ensemble Model Accuracy: 0.38
Confusion Matrix (Ensemble):
[[1 0 1 0]
 [0 0 1 1]
 [1 0 2 0]
 [0 0 1 0]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.50      0.50      0.50         2
      Neutral       0.00      0.00      0.00         2
     Positive       0.40      0.67      0.50         3
Very Positive       0.00      0.00      0.00         1

     accuracy                           0.38         8
    macro avg       0.23      0.29      0.25         8
 weighted avg       0.28      0.38      0.31         8




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Ensemble Model Accuracy: 0.68
Confusion Matrix (Ensemble):
[[ 95  26  15   0]
 [  7 160  53   0]
 [  5  47 311   2]
 [  3   5 129  43]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.86      0.70      0.77       136
      Neutral       0.67      0.73      0.70       220
     Positive       0.61      0.85      0.71       365
Very Positive       0.96      0.24      0.38       180

     accuracy                           0.68       901
    macro avg       0.78      0.63      0.64       901
 weighted avg       0.73      0.68      0.65       901



Ensemble Model Accuracy: 0.71
Confusion Matrix (Ensemble):
[[ 38   5   3   0   1]
 [  3  41  22   0   1]
 [  4  14 131   0   1]
 [  0   1   0   0   0]
 [  3   2  40   0  39]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.79      0.81      0.80        47
      Neutral       0.65      0.61      0.63        67
     Positive       0.67      0.87      0.76       150
Very Negative       0.00      0.00      0.00         1
Very Positive       0.93      0.46      0.62        84

     accuracy                           0.71       349
    macro avg       0.61      0.55      0.56       349
 weighted avg       0.74      0.71      0.70       349




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Ensemble Model Accuracy: 0.74
Confusion Matrix (Ensemble):
[[261  73  31   0   1]
 [ 14 433 128   0   0]
 [  7 121 920   0   0]
 [  0   1   3   0   0]
 [  4  17 241   0 224]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.91      0.71      0.80       366
      Neutral       0.67      0.75      0.71       575
     Positive       0.70      0.88      0.78      1048
Very Negative       0.00      0.00      0.00         4
Very Positive       1.00      0.46      0.63       486

     accuracy                           0.74      2479
    macro avg       0.65      0.56      0.58      2479
 weighted avg       0.78      0.74      0.73      2479




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Ensemble Model Accuracy: 0.72
Confusion Matrix (Ensemble):
[[126  46  15   0   0]
 [  7 207  62   0   1]
 [  5  74 428   0   1]
 [  0   2   2   0   0]
 [  3  10 112   0 106]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.89      0.67      0.77       187
      Neutral       0.61      0.75      0.67       277
     Positive       0.69      0.84      0.76       508
Very Negative       0.00      0.00      0.00         4
Very Positive       0.98      0.46      0.63       231

     accuracy                           0.72      1207
    macro avg       0.64      0.54      0.57      1207
 weighted avg       0.76      0.72      0.71      1207




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Ensemble Model Accuracy: 0.70
Confusion Matrix (Ensemble):
[[ 38   4  10   0]
 [  3  23  19   1]
 [  0   3 104   1]
 [  1   0  42  30]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.90      0.73      0.81        52
      Neutral       0.77      0.50      0.61        46
     Positive       0.59      0.96      0.73       108
Very Positive       0.94      0.41      0.57        73

     accuracy                           0.70       279
    macro avg       0.80      0.65      0.68       279
 weighted avg       0.77      0.70      0.68       279



Ensemble Model Accuracy: 0.36
Confusion Matrix (Ensemble):
[[0 1 0 0]
 [0 3 0 0]
 [1 3 0 0]
 [0 1 1 1]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.00      0.00      0.00         1
      Neutral       0.38      1.00      0.55         3
     Positive       0.00      0.00      0.00         4
Very Positive       1.00      0.33      0.50         3

     accuracy                           0.36        11
    macro avg       0.34      0.33      0.26        11
 weighted avg       0.38      0.36      0.29        11



Ensemble Model Accuracy: 0.71
Confusion Matrix (Ensemble):
[[ 77  41  12   0   0]
 [  8 233  32   0   0]
 [ 10  88 379   0   0]
 [  1   0   0   0   0]
 [  1  17 115   0 109]]
Classification Report (Ensemble):
               precision    recall  f1-score   support

     Negative       0.79      0.59      0.68       130
      Neutral       0.61      0.85      0.71       273
     Positive       0.70      0.79      0.75       477
Very Negative       0.00      0.00      0.00         1
Very Positive       1.00      0.45      0.62       242

     accuracy                           0.71      1123
    macro avg       0.62      0.54      0.55      1123
 weighted avg       0.76      0.71      0.70      1123




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

