## Reviews preprocessing: remove stop words and apply stemming using NLTK.

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Make sure the NLTK resources used below are available. They are not bundled
# with the nltk package, and looking them up raises LookupError when they are
# missing, so download only the ones that cannot be found locally.
for resource_path, package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# Initialize the stemmer and stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Load the dataset
data = pandas.read_csv('data/amazon_reviews.csv')

# Function to preprocess the reviews
def preprocess_review(review):
    """Normalize one review into a space-separated string of stemmed words.

    The text is split with ``nltk.word_tokenize``. Only purely alphabetic
    tokens are kept; each is lower-cased, dropped if it appears in
    ``stop_words``, and otherwise reduced to its stem with ``stemmer``.

    Args:
        review: The review text to clean.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives).
    """
    # Check isalpha() on the raw token, then work with the lower-cased form only.
    lowered_words = (word.lower() for word in nltk.word_tokenize(review) if word.isalpha())
    return " ".join(stemmer.stem(word) for word in lowered_words if word not in stop_words)

# Apply the preprocessing function to the reviews.
# Missing reviews are loaded as NaN; replace them with an empty string first,
# otherwise astype(str) turns them into the literal word "nan", which passes
# the alphabetic/stop-word filters and ends up as a spurious feature.
data['cleaned_review'] = data['cleaned_review'].fillna('').astype(str).apply(preprocess_review)

## Map the sentiment labels (positive, negative, neutral) to numerical labels.

In [7]:
# Keep the original labels in their own column so that re-running this cell
# always fits the encoder on the label names. Without this copy, a second run
# would fit on the already-encoded integers and encoder.classes_ would no
# longer hold the sentiment names.
if 'sentiments_original' not in data.columns:
    data['sentiments_original'] = data['sentiments']

encoder = LabelEncoder()
# 'sentiments' still ends up holding the numeric codes.
data['sentiments'] = encoder.fit_transform(data['sentiments_original'])

## Data splitting into 80% training and 20% testing.

In [8]:
# Positional 80/20 split: the first 80% of the rows (in their current order)
# form the training set and the remaining rows form the test set.
size_of_training = int(len(data) * 0.8)

features = data['cleaned_review'].values
targets = data['sentiments'].values

training_rows = slice(None, size_of_training)
testing_rows = slice(size_of_training, None)

# Copy the slices so the training/testing arrays do not share memory with the full arrays.
features_training, features_testing = features[training_rows].copy(), features[testing_rows].copy()
targets_training, targets_testing = targets[training_rows].copy(), targets[testing_rows].copy()

## Convert the reviews into TF-IDF feature vectors using sklearn's TfidfVectorizer.

In [9]:
# Fit the vectorizer on the training reviews only, then reuse it to transform
# the test reviews. The tuple is evaluated left to right, so the fit happens
# before the test reviews are transformed.
vectorizer = TfidfVectorizer()
features_training_vectorized, features_testing_vectorized = (
    vectorizer.fit_transform(features_training),
    vectorizer.transform(features_testing),
)

## Apply three models of sklearn (SVM, logistic regression and Naïve Bayes).

### SVM

In [10]:
# Train a support vector classifier with default settings on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
svm_model = svm.SVC().fit(features_training_vectorized, targets_training)
svm_model

### Logistic Regression

In [11]:
# Train a logistic regression classifier with default settings on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
logistic_model = LogisticRegression().fit(features_training_vectorized, targets_training)
logistic_model

### Naïve Bayes

In [12]:
# Train a multinomial naive Bayes classifier with default settings on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
naive_bayes_model = MultinomialNB().fit(features_training_vectorized, targets_training)
naive_bayes_model

## Output the classification report for each model.

In [13]:
# Encoded label ids, in the same order as encoder.classes_. Passing them
# explicitly keeps each report aligned with target_names even when a class is
# absent from both the test targets and the predictions; without `labels`,
# classification_report raises a ValueError in that situation.
report_labels = list(range(len(encoder.classes_)))

# SVM
predictions_svm = svm_model.predict(features_testing_vectorized)
# Logistic Regression
predictions_logistic = logistic_model.predict(features_testing_vectorized)
# Naïve Bayes
predictions_naive_bayes = naive_bayes_model.predict(features_testing_vectorized)

# Print one report per model, in the order SVM, Logistic Regression, Naïve Bayes.
for model_title, model_predictions in (
    ("SVM", predictions_svm),
    ("Logistic Regression", predictions_logistic),
    ("Naïve Bayes", predictions_naive_bayes),
):
    print(f"{model_title} Classification Report:")
    print(classification_report(
        targets_testing,
        model_predictions,
        labels=report_labels,
        target_names=encoder.classes_,
    ))

SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.58      0.15      0.24       143
     neutral       0.70      0.82      0.76       908
    positive       0.93      0.91      0.92      2417

    accuracy                           0.86      3468
   macro avg       0.74      0.63      0.64      3468
weighted avg       0.86      0.86      0.85      3468

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.67      0.22      0.34       143
     neutral       0.68      0.83      0.75       908
    positive       0.93      0.89      0.91      2417

    accuracy                           0.85      3468
   macro avg       0.76      0.65      0.66      3468
weighted avg       0.85      0.85      0.84      3468

Naïve Bayes Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.01      0.01       143
     neutral       0.57 

## Allow the user to enter a new review, apply vectorization then predict the label of the new review.

In [16]:
def predict_review(review, model=None):
    """Predict the sentiment label of a single raw review.

    The review goes through the same steps as the training data: it is
    cleaned with ``preprocess_review``, vectorized with the already fitted
    ``vectorizer``, classified, and the numeric prediction is mapped back to
    its original label with ``encoder``.

    Args:
        review: The raw review text.
        model: Fitted classifier exposing ``predict`` (for example
            ``logistic_model`` or ``naive_bayes_model``). Defaults to
            ``svm_model`` when omitted.

    Returns:
        The predicted label, as stored in ``encoder.classes_``.
    """
    # Resolve the default at call time so the function always uses the
    # currently trained svm_model.
    if model is None:
        model = svm_model

    cleaned_review = preprocess_review(review)
    # transform() expects a collection of documents, hence the one-element list.
    vectorized_review = vectorizer.transform([cleaned_review])
    prediction = model.predict(vectorized_review)
    sentiment = encoder.inverse_transform(prediction)

    return sentiment[0]

# Example usage: read one review from the user and report its predicted sentiment.
new_review = input("Enter a new review: ")
predicted_sentiment = predict_review(new_review)
result_message = f"The predicted sentiment for the review '{new_review}' is: {predicted_sentiment}"
print(result_message)

The predicted sentiment for the review 'i hate this product but i like the packaging' is: neutral
