In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import MaxentClassifier
from nltk.classify import accuracy

# Download NLTK resources if not already present
nltk.download('punkt')
nltk.download('stopwords')

# Sample dataset - you should replace this with your corpus dataset
corpus = [
    ("I love this movie", "positive"),
    ("This film is great", "positive"),
    ("That movie was terrible", "negative"),
    ("I didn't like the acting", "negative"),
    ("The plot was boring", "negative"),
    ("The story was engaging", "positive")
]

# Feature extraction function
def document_features(document):
    words = set(document)
    features = {}
    for word in words:
        features['contains({})'.format(word)] = True
    return features

# Tokenization and stop word removal
stop_words = set(stopwords.words('english'))
processed_corpus = []
for (text, label) in corpus:
    words = [word.lower() for word in word_tokenize(text) if word.isalnum() and word.lower() not in stop_words]
    processed_corpus.append((words, label))

# Feature extraction
featuresets = [(document_features(d), c) for (d,c) in processed_corpus]

# Splitting into training and testing sets
train_set, test_set = featuresets[:int(len(featuresets)*0.8)], featuresets[int(len(featuresets)*0.8):]

# Training the MaxEnt model
classifier = MaxentClassifier.train(train_set, algorithm='gis', trace=0, max_iter=10)

# Evaluate the model
print("Accuracy:", accuracy(classifier, test_set))


Accuracy: 0.5


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import MaxentClassifier
from nltk.tokenize import word_tokenize
from nltk import FreqDist

# Prepare data
nltk.download('movie_reviews')
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Define feature extractor function
def document_features(document):
    words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in words)
    return features

# Create feature set
all_words = FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]
featuresets = [(document_features(d), c) for (d,c) in documents]

# Split data into train and test sets
train_set, test_set = featuresets[100:], featuresets[:100]

# Train the MaxEnt classifier
classifier = MaxentClassifier.train(train_set, algorithm='GIS', max_iter=10)

# Evaluate the classifier
print(nltk.classify.accuracy(classifier, test_set))

# Show most informative features
classifier.show_most_informative_features(10)

# Classify some example sentences
sentence = "This movie is great"
tokens = word_tokenize(sentence)
features = document_features(tokens)
print("Classification:", classifier.classify(features))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.526
             2          -0.68779        0.526
             3          -0.68523        0.526
             4          -0.68269        0.526
             5          -0.68017        0.526
             6          -0.67766        0.526
             7          -0.67518        0.526
             8          -0.67271        0.530
             9          -0.67027        0.538
         Final          -0.66784        0.552
0.04
  -0.012 contains(mulan)==True and label is 'neg'
  -0.012 contains(outstanding)==True and label is 'neg'
  -0.011 contains(seagal)==True and label is 'pos'
  -0.009 contains(flynt)==True and label is 'neg'
  -0.009 contains(wonderfully)==True and label is 'neg'
  -0.009 contains(damon)==True and label is 'neg'
  -0.008 contains(jedi)==True and label is 'neg'
  -0.008 contains(lame)==True and label is 'pos'
