In [1]:
import pandas as pd

# Assuming the files are uploaded directly to the main directory in Colab
file_names = ['PetBed1_Updated.csv', 'PetBed2_Updated.csv', 'Updated_52in1.csv', 'Updated_ToolSet1.csv']
dataframes = {}

for file_name in file_names:
    # Load each file and store it in a dictionary with its name as the key
    dataframes[file_name] = pd.read_csv('/content/' + file_name)

# Example: Display the first few rows of the 'PetBed1.csv' dataframe
print(dataframes['PetBed1_Updated.csv'].head())

    #         Reivew ID   Author Name  Rating  \
0   1  1704840000000000        tr**ey       0   
1   4  1705050000000000       pa***32       0   
2   9  1703180000000000       22***77       0   
3  11  1703350000000000  Teresa Shaub       0   
4  12  1703840000000000       mi***13       0   

                                        Content           Date  
0                                           NaN   3/30/24 5:54  
1                                         nice    3/30/24 0:16  
2            fits great on my bad glad i got it  3/29/24 12:33  
3  awesome thanks easy to put on happy purchase   3/29/24 1:19  
4                                   works great  3/28/24 23:05  


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')

file_paths = ['PetBed1_Updated.csv', 'PetBed2_Updated.csv', 'Updated_52in1.csv', 'Updated_ToolSet1.csv']

# Function to preprocess text
def preprocess(text):
    if isinstance(text, str):
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        return ' '.join([w.lower() for w in tokens if w.isalpha() and w.lower() not in stop_words])
    else:
        return ""  # Return an empty string for non-string input

# Process each file
for file_path in file_paths:
    # Load data
    df = pd.read_csv(file_path)

    # Preprocessing
    df['processed_content'] = df['Content'].apply(preprocess)

    # Feature Extraction
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['processed_content'])
    y = df['Rating']  # Use the 'Rating' column directly as it's already classified

    # Splitting the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    # Building and training the classifier
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Get probabilities for calculating ROC AUC
    y_probs = clf.predict_proba(X_test)[:, 1]  # probabilities for the positive class (flipped in labels)

    # Evaluating the classifier
    y_pred = clf.predict(X_test)
    print(f"Results for {file_path}:")
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_probs))  # Calculate and print the ROC AUC

    # Identify top features for negative reviews
    feature_names = np.array(vectorizer.get_feature_names_out())
    log_probs = clf.feature_log_prob_
    top_negative_words = feature_names[np.argsort(log_probs[1])[::-1][:10]]
    print("Top 10 keywords predicting Negative rating for " + file_path + ":", top_negative_words)
    print("\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Results for PetBed1_Updated.csv:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94        37
           1       0.70      0.88      0.78         8

    accuracy                           0.91        45
   macro avg       0.84      0.90      0.86        45
weighted avg       0.92      0.91      0.91        45

ROC AUC: 0.8682432432432432
Top 10 keywords predicting Negative rating for PetBed1_Updated.csv: ['waterproof' 'bed' 'water' 'noise' 'like' 'proof' 'work' 'back' 'good'
 'thin']


Results for PetBed2_Updated.csv:
              precision    recall  f1-score   support

           0       0.84      0.92      0.87        72
           1       0.45      0.28      0.34        18

    accuracy                           0.79        90
   macro avg       0.64      0.60      0.61        90
weighted avg       0.76      0.79      0.77        90

ROC AUC: 0.7885802469135803
Top 10 keywords predicting Negative rating for PetBed2_Updated.csv: ['sma