In [58]:
# Title banner for the notebook; the printed text is not used by any later cell.
print("Cloud Computing")

Cloud Computing


In [59]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [61]:
# Download the IMDb movie review dataset
# (NLTK's 'movie_reviews' corpus, which load_imdb_dataset reads via
# nltk.corpus.movie_reviews). Must run before that function is called.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [62]:
# Load the IMDb movie review dataset
def load_imdb_dataset():
    """Collect every review of NLTK's ``movie_reviews`` corpus with its label.

    Returns:
        list[tuple[str, str]]: ``(raw_text, sentiment)`` pairs. All reviews
        from the ``'pos'`` category come first (labelled ``'positive'``),
        followed by all reviews from ``'neg'`` (labelled ``'negative'``).
    """
    corpus = nltk.corpus.movie_reviews
    labelled_reviews = []

    # Corpus category id -> label emitted in the dataset; order matters because
    # positives are expected ahead of negatives in the returned list.
    for category, sentiment in (('pos', 'positive'), ('neg', 'negative')):
        for fileid in corpus.fileids(category):
            labelled_reviews.append((corpus.raw(fileid), sentiment))

    return labelled_reviews

In [63]:
# Preprocess the text data
def _english_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Only cached after a successful load, so a missing corpus still
        # raises on every call until it has been downloaded.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Turn a raw review into a list of lower-cased content words.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic (punctuation, numbers, mixed tokens) or that are
    English stop words are dropped. Token order and duplicates are preserved.

    Args:
        text (str): Raw review text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuations. The stop-word set is cached because
    # this function is called once per review and the set never changes.
    stop_words = _english_stop_words()
    return [token for token in tokens if token.isalpha() and token not in stop_words]

In [64]:
# Extract features from the dataset
def extract_features(dataset):
    """Split a labelled dataset into token lists and their labels.

    Args:
        dataset: Iterable of ``(review_text, sentiment)`` pairs.

    Returns:
        tuple[list, list]: ``(features, labels)`` where ``features[i]`` is the
        output of ``preprocess_text`` for the i-th review and ``labels[i]`` is
        that review's sentiment, in the same order as ``dataset``.
    """
    # Preprocess once per review, keeping each token list next to its label.
    processed = [(preprocess_text(review), sentiment) for review, sentiment in dataset]

    features = [tokens for tokens, _ in processed]
    labels = [sentiment for _, sentiment in processed]
    return features, labels

In [65]:
# Train the Naive Bayes classifier
def train_classifier(features, labels):
    """Vectorize tokenized reviews, split them 80/20 and fit a MultinomialNB.

    Each review becomes a binary presence vector over the vocabulary (1 if the
    word occurs in the review, else 0).

    Args:
        features: Sequence of token lists, one per review.
        labels: Sequence of sentiment labels parallel to ``features``.

    Returns:
        tuple: ``(classifier, X_test, y_test)`` where ``classifier`` is the
        fitted ``MultinomialNB`` and ``X_test`` / ``y_test`` are the held-out
        20% of the vectorized data. The vocabulary used to build the columns
        is attached to the classifier as ``classifier.vocabulary_`` (a sorted
        list of words) so new text can be vectorized with the same columns.
    """
    # Create the vocabulary
    vocabulary = set()
    for tokens in features:
        vocabulary.update(tokens)

    # Sort instead of relying on set iteration order, which changes between
    # interpreter runs; this keeps the column layout reproducible.
    vocabulary = sorted(vocabulary)

    # Create the feature vectors
    feature_vectors = []
    for tokens in features:
        # Set membership keeps each lookup O(1); testing against the token
        # list made every vector cost len(vocabulary) * len(tokens).
        present = set(tokens)
        feature_vectors.append([1 if word in present else 0 for word in vocabulary])

    # Convert the feature vectors and labels to NumPy arrays
    feature_vectors = np.array(feature_vectors)
    labels = np.array(labels)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(feature_vectors, labels, test_size=0.2, random_state=42)

    # Train the classifier
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    # Expose the column order; without it callers have no way to build
    # prediction vectors that line up with the training columns.
    classifier.vocabulary_ = vocabulary

    return classifier, X_test, y_test

In [66]:
def predict_sentiment(classifier, vocabulary, new_reviews):
    """Predict the sentiment of raw review texts with a fitted classifier.

    Args:
        classifier: Fitted model exposing ``predict(feature_matrix)``.
        vocabulary: Sequence of words defining the feature columns. It must be
            the same words, in the same order, as the columns the classifier
            was trained on.
        new_reviews: Iterable of raw review strings. A single string is
            treated as one review.

    Returns:
        The array returned by ``classifier.predict``, one entry per review, or
        an empty NumPy array when ``new_reviews`` is empty.
    """
    # A bare string would otherwise be iterated character by character and
    # every character classified as its own review.
    if isinstance(new_reviews, str):
        new_reviews = [new_reviews]

    tokenized_reviews = [preprocess_text(review) for review in new_reviews]

    if not tokenized_reviews:
        # np.array([]) is 1-D with no feature columns, i.e. not a valid sample
        # matrix, so there is nothing meaningful to hand to the classifier.
        return np.array([])

    feature_vectors = []
    for review_tokens in tokenized_reviews:
        # Set membership avoids rescanning the token list per vocabulary word.
        present = set(review_tokens)
        feature_vectors.append([1 if token in present else 0 for token in vocabulary])

    feature_vectors = np.array(feature_vectors)

    predictions = classifier.predict(feature_vectors)

    return predictions

In [67]:
# Load the IMDb dataset
# `dataset` is a list of (raw_review_text, label) tuples: every 'positive'
# review first, then every 'negative' one.
dataset = load_imdb_dataset()

In [68]:
# Shuffle the dataset
# A dedicated, seeded generator gives the same ordering on every fresh
# Restart & Run All and leaves the global `random` state untouched.
# The shuffle is in place: run this cell once per freshly loaded `dataset`,
# since shuffling an already-shuffled list yields a different order.
random.Random(42).shuffle(dataset)

In [69]:
# Punkt tokenizer data, needed by word_tokenize (called in preprocess_text).
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [70]:
# Stop-word lists read by stopwords.words('english') in preprocess_text.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
# Extract features and labels
# `features` is a list of token lists (one per review, after preprocess_text);
# `labels` is the parallel list of sentiment strings.
features, labels = extract_features(dataset)

In [72]:
# Train the classifier
# train_classifier builds the vocabulary, vectorizes the reviews, holds out
# 20% of them (random_state=42) and fits MultinomialNB on the rest.
# X_test / y_test are that held-out split.
classifier, X_test, y_test = train_classifier(features, labels)

In [73]:
# Evaluate the classifier
# Predict on the held-out split returned by train_classifier, which the model
# was not fitted on.
y_pred = classifier.predict(X_test)
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.825
