# TP2

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import sentiwordnet as swn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os
import random



In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kiouloueleonor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kiouloueleonor/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/kiouloueleonor/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kiouloueleonor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Path to the directory where you have extracted the dataset
dataset_directory = "/Users/kiouloueleonor/Documents/Ecole/A5/Machine_Learning_for_NLP/TD/TP2/review_polarity/txt_sentoken"
# Subdirectories for positive and negative reviews
pos_dir = os.path.join(dataset_directory, 'pos')
neg_dir = os.path.join(dataset_directory, 'neg')

In [4]:
def load_reviews(directory, label):
    #Load reviews from a given directory and assign a label
    reviews = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            text = file.read().strip()
            reviews.append((text, label))
    return reviews

In [5]:
# Load positive and negative reviews
positive_reviews = load_reviews(pos_dir, 'positive')
negative_reviews = load_reviews(neg_dir, 'negative')

In [6]:
# Combine reviews
all_reviews = positive_reviews + negative_reviews

In [7]:
# Shuffle the dataset
random.shuffle(all_reviews)

In [8]:
# Tokenize and part-of-speech tag each review
tagged_reviews = [(word_tokenize(review), label) for review, label in all_reviews]
tagged_reviews = [(nltk.pos_tag(tokens), label) for tokens, label in tagged_reviews]

In [9]:
def extract_adverbs(tagged_tokens):
    #Extract adverbs from a list of tagged tokens
    return [word for word, pos in tagged_tokens if pos.startswith('RB')]

In [10]:
# Extract adverbs for each review
adverbs_in_reviews = [(extract_adverbs(tagged_tokens), label) for tagged_tokens, label in tagged_reviews]

In [11]:
def get_sentiment(adverb):
    #Get sentiment score for an adverb using SentiWordNet
    synsets = list(swn.senti_synsets(adverb, 'r'))  # 'r' for adverbs
    if not synsets:
        return 0  # No score if adverb is not found in SentiWordNet
    
    # Use the first synset by default (could be improved using disambiguation methods)
    return synsets[0].pos_score() - synsets[0].neg_score()

In [12]:
# Calculate sentiment score for each adverb in the reviews
sentiments_in_reviews = [(sum(get_sentiment(adverb) for adverb in adverbs), label) for adverbs, label in adverbs_in_reviews]

In [13]:
# Classify reviews based on sentiment scores
def classify_review(sum_score):
    return "pos" if sum_score > 0 else "neg"

predicted_labels = [classify_review(score) for score, _ in sentiments_in_reviews]

In [14]:
# Calculate classification accuracy
actual_labels = [label for _, label in sentiments_in_reviews]
correctly_classified = sum(1 for predicted, actual in zip(predicted_labels, actual_labels) if predicted == actual)

accuracy = correctly_classified / len(predicted_labels)

print(f"Classification Accuracy: {accuracy * 100:.2f}%")

Classification Accuracy: 0.00%


In [15]:
# Use TF-IDF to represent reviews numerically
all_reviews_text = [review[0] for review in all_reviews]
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(all_reviews_text)

In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, actual_labels, test_size=0.2, random_state=42)

In [17]:
# Train RandomForestClassifier model
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)

In [18]:
# Predict on the test set
y_pred = classifier.predict(X_test)

In [19]:
# Calculate accuracy and display the classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Accuracy: 80.00%

Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.86      0.81       198
    positive       0.85      0.74      0.79       202

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.81      0.80      0.80       400

