In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Download the NLTK resources this notebook depends on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#   - 'stopwords': the stopword lists read via nltk.corpus.stopwords
# Recent NLTK releases (3.9+) load the Punkt models from 'punkt_tab' instead
# of 'punkt', so both are requested to keep this cell runnable on its own.
# nltk.download() does not raise for an unknown package id (it reports the
# problem and returns False), so the extra request should be harmless on
# older releases — NOTE(review): confirm against the pinned NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sentiment lexicons: hand-picked words whose occurrences are counted per
# document to build the positive/negative features.
positive_words = [
    'good',
    'great',
    'happy',
    'awesome',
    'excellent',
    'positive',
    'joyful',
]
negative_words = [
    'bad',
    'sad',
    'terrible',
    'awful',
    'negative',
    'poor',
    'unhappy',
]

# Preprocess function
# Stopword sets keyed by language name, filled on first use so the corpus is
# read once instead of on every preprocess_text() call.
_STOP_WORD_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stopword set for *language*, loading it only once."""
    cached = _STOP_WORD_CACHE.get(language)
    if cached is None:
        # Loaded lazily (not at import time) so a missing corpus still
        # surfaces as an error at the first call, as it did before.
        cached = frozenset(stopwords.words(language))
        _STOP_WORD_CACHE[language] = cached
    return cached


def preprocess_text(text):
    """Lowercase, tokenize and filter a piece of text.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase tokens, in their original order, keeping only
        tokens that are purely alphanumeric (``str.isalnum``) and that are
        not in NLTK's English stopword list.
    """
    stop_words = _get_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

def extract_features(text):
    """Turn a text into a two-element lexicon-count feature vector.

    Args:
        text: The raw input string; it is passed through preprocess_text().

    Returns:
        ``[positive_count, negative_count]``: how many of the preprocessed
        tokens appear in ``positive_words`` and in ``negative_words``.
    """
    positive_count = 0
    negative_count = 0
    for token in preprocess_text(text):
        # The two checks are independent: a token listed in both lexicons
        # would be counted on both sides.
        if token in positive_words:
            positive_count += 1
        if token in negative_words:
            negative_count += 1
    return [positive_count, negative_count]

# Sample labeled documents as (text, sentiment label) pairs
documents = [
    ('I am very happy today', 'positive'),
    ('This is a terrible experience', 'negative'),
    ('I am feeling great', 'positive'),
    ('This movie is awful', 'negative'),
    ('Such a positive vibe', 'positive'),
    ('I hate this product', 'negative')
]

# Extract features and labels
X = [extract_features(text) for text, _ in documents]
y = [label for _, label in documents]

# Convert string labels to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into training and testing sets.
# stratify keeps both classes present in the training and the test split;
# without it, a different seed or dataset could yield a single-class
# training set, on which LogisticRegression.fit() raises.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the sentiment of test documents
y_pred = model.predict(X_test)

# Evaluate the model.
# With six documents and test_size=0.3 the test split holds only two samples,
# so this score is a smoke test rather than a meaningful estimate.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Function to predict sentiment for a new document
def predict_sentiment(text):
    """Predict the sentiment label of a single text with the trained model.

    Args:
        text: The raw input string.

    Returns:
        The first element of ``label_encoder.inverse_transform`` applied to
        the model's prediction, i.e. the predicted label in its original
        (pre-encoding) form.
    """
    # model.predict expects a batch, so wrap the single feature vector.
    feature_rows = [extract_features(text)]
    predicted_codes = model.predict(feature_rows)
    return label_encoder.inverse_transform(predicted_codes)[0]

# Try the classifier on a sentence that is not part of the sample documents
new_document = "I am not satisfied with the service"
new_sentiment = predict_sentiment(new_document)
print(f"Sentiment: {new_sentiment}")


Accuracy: 1.00
Sentiment: negative


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import nltk
# Fetch the 'punkt_tab' tokenizer data. Presumably needed because the
# installed NLTK version's word_tokenize looks up 'punkt_tab' rather than
# the older 'punkt' package — TODO confirm against the NLTK version in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True