In [5]:
!pip install nltk




In [None]:
# Import necessary libraries
import nltk  # Natural Language Toolkit
from nltk.tokenize import word_tokenize  # To split sentences into words
from nltk.corpus import stopwords  # To remove common words like "the", "is"
from nltk.probability import FreqDist  # To calculate frequency of words
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.feature_extraction.text import CountVectorizer  # Convert text to numerical data
from sklearn.naive_bayes import MultinomialNB  # A simple classifier for sentiment analysis
from sklearn.metrics import accuracy_score, classification_report  # To evaluate the model

# Step 1: Download required NLTK resources
nltk.download('punkt')  # Tokenizer for splitting text into words
nltk.download('stopwords')  # Pre-defined list of common stopwords

nltk.download()

# Step 2: Create a small dataset
# Dataset with sentences labeled as positive or negative
data = [
    ("I love this product, it is amazing!", "positive"),
    ("This is the worst experience I've ever had.", "negative"),
    ("I feel great using this app!", "positive"),
    ("The service was horrible and disappointing.", "negative"),
    ("What a fantastic experience, I am very happy!", "positive"),
    ("I hate this, it's a complete waste of time.", "negative"),
]

# Step 3: Preprocessing function
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    stop_words = set(stopwords.words('english'))  # Get English stopwords
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(filtered_tokens)  # Return cleaned text as a single string

# Apply preprocessing to the dataset
cleaned_data = [(preprocess_text(sentence), label) for sentence, label in data]

# Step 4: Prepare data for training
# Split the sentences and their labels
sentences, labels = zip(*cleaned_data)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.3, random_state=42)

# Step 5: Convert text data to numerical format
# CountVectorizer converts text into a matrix of word counts
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)  # Fit and transform training data
X_test_vectorized = vectorizer.transform(X_test)  # Only transform testing data

# Step 6: Train a Naive Bayes classifier
classifier = MultinomialNB()  # Initialize the classifier
classifier.fit(X_train_vectorized, y_train)  # Train the model

# Step 7: Evaluate the model
# Predict sentiment for test data
y_pred = classifier.predict(X_test_vectorized)

# Print accuracy and detailed report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 8: Test with new sentences
new_sentences = ["I absolutely love this!", "This is terrible and awful."]
new_sentences_cleaned = [preprocess_text(sentence) for sentence in new_sentences]
new_sentences_vectorized = vectorizer.transform(new_sentences_cleaned)
predictions = classifier.predict(new_sentences_vectorized)

# Display results
for sentence, sentiment in zip(new_sentences, predictions):
    print(f"'{sentence}' -> Sentiment: {sentiment}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
