# **Build a natural language processing (NLP) model to perform sentiment analysis on social media posts or product reviews.**

In [3]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import re

# Download necessary NLTK data
print("Downloading necessary NLTK data...")
# NOTE: newer NLTK releases make word_tokenize load the 'punkt_tab' resource
# instead of the legacy pickled 'punkt' models; fetching both keeps the
# tokenizer working regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords', 'twitter_samples'):
    nltk.download(nltk_resource)

# 1. Load the dataset
print("Loading dataset...")
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Positive tweets come first, so the label list is built in the same order:
# 1 marks a positive tweet, 0 a negative one.
tweets = positive_tweets + negative_tweets
labels = [1] * len(positive_tweets) + [0] * len(negative_tweets)

# 2. Preprocess the text data
print("Preprocessing text data...")

# Patterns are compiled once instead of being looked up on every call.
# 'https\S+' is already covered by 'http\S+', and none of the patterns use
# '^'/'$', so no MULTILINE flag is needed.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_MENTION_OR_HASH_PATTERN = re.compile(r'@\w+|#')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so that defining the function never touches NLTK data.
_preprocess_resources = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _preprocess_resources:
        _preprocess_resources['stop_words'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
        _preprocess_resources['lemmatizer'] = WordNetLemmatizer()
    return (
        _preprocess_resources['stop_words'],
        _preprocess_resources['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize a raw tweet into a space-separated string of lemmas.

    Steps, in order: lowercase; strip URLs; strip ``@user`` mentions and
    ``#`` signs (the hashtag word itself is kept); delete every character
    that is not an ASCII letter or whitespace; tokenize; drop English
    stop words; lemmatize each remaining token.

    Non-letter characters are deleted rather than replaced by a space, so
    a contraction such as "it's" becomes the single token "its".

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # The stop-word set and lemmatizer are built once and reused; rebuilding
    # them per call re-reads the stop-word corpus for every tweet.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove user @ references and '#' from tweet
    text = _MENTION_OR_HASH_PATTERN.sub('', text)

    # Remove special characters and digits
    text = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize, drop stop words, then lemmatize what is left
    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(lemmas)

# Clean every tweet with the same routine later used for new text.
X_clean = list(map(preprocess_text, tweets))

# 3. Hold out 20% of the data for testing (fixed seed for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    labels,
    test_size=0.2,
    random_state=42,
)

# 4. Turn the cleaned text into TF-IDF features
print("Vectorizing text data...")
vectorizer = TfidfVectorizer(max_features=5000)
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# 5. Fit a multinomial Naive Bayes classifier on the training features
print("Training the model...")
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# 6. Predict labels for the held-out tweets
print("Making predictions...")
y_pred = classifier.predict(X_test_vectorized)

# 7. Report the confusion matrix and per-class metrics
print("Evaluating the model...")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)
test_report = classification_report(y_test, y_pred)
print(test_report)

# 8. Function to predict sentiment of new text
def predict_sentiment(text):
    """Classify a piece of raw text as "Positive" or "Negative".

    The text is cleaned with ``preprocess_text``, projected onto the fitted
    module-level ``vectorizer`` and scored by the trained ``classifier``.

    Args:
        text: The raw text to classify.

    Returns:
        "Positive" when the predicted label is 1, otherwise "Negative".
    """
    features = vectorizer.transform([preprocess_text(text)])
    predicted_label = classifier.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"

# 9. Try the trained model on two hand-written examples
print("\nTesting the model with example tweets:")
positive_tweet = "I love this new product! It's amazing and works perfectly."
negative_tweet = "This service is terrible. I've had nothing but problems since day one."

# Each example is classified and reported before moving on to the next.
positive_result = predict_sentiment(positive_tweet)
print(f"Positive tweet sentiment: {positive_result}")
negative_result = predict_sentiment(negative_tweet)
print(f"Negative tweet sentiment: {negative_result}")

Downloading necessary NLTK data...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


Loading dataset...
Preprocessing text data...
Vectorizing text data...
Training the model...
Making predictions...
Evaluating the model...
[[785 203]
 [285 727]]
              precision    recall  f1-score   support

           0       0.73      0.79      0.76       988
           1       0.78      0.72      0.75      1012

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000


Testing the model with example tweets:
Positive tweet sentiment: Positive
Negative tweet sentiment: Negative
