<a href="https://colab.research.google.com/github/Ananyapanyala/Ananyapanyala/blob/main/SENTIMENTAL_ANALYSIS(MICROPROJECT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download the movie_reviews corpus if not already present
nltk.download('movie_reviews')
nltk.download('punkt')

# Prepare the dataset
documents = [(list(word_tokenize(movie_reviews.raw(fileid))), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents randomly to ensure proper training and testing split
random.shuffle(documents)

# Feature extraction and preparation
all_words = [word.lower() for review, _ in documents for word in review]
all_words = nltk.FreqDist(all_words)

# Choose a certain number of top words as features
top_words = 3000
word_features = list(all_words.keys())[:top_words]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features

# Convert text data into numerical features
featuresets = [(document_features(review), category) for (review, category) in documents]

# Split the data into training and testing sets
train_set, test_set = train_test_split(featuresets, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
classifier = MultinomialNB()

X_train, y_train = zip(*train_set)
X_train = [dict(review) for review in X_train]  # Convert to list of dictionaries

# Convert the features to a numeric representation
vectorizer = DictVectorizer(sparse=False)
X_train = vectorizer.fit_transform(X_train)

classifier.fit(X_train, y_train)

# Test the classifier
X_test, y_test = zip(*test_set)
X_test = [dict(review) for review in X_test]  # Convert to list of dictionaries

# Convert the test features to a numeric representation
X_test = vectorizer.transform(X_test)

predictions = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.80

Classification Report:
              precision    recall  f1-score   support

         neg       0.78      0.83      0.80       195
         pos       0.83      0.77      0.80       205

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400


Confusion Matrix:
[[162  33]
 [ 47 158]]
