In [37]:
import re
import pandas as pd
import nltk
import random
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [38]:
# Function to preprocess text, including handling emojis

# Patterns are compiled once instead of being re-parsed for every tweet.
_URL_RE = re.compile(r'http\S+')
# Matches every character that is NOT an ASCII letter, a digit, whitespace,
# or a code point in one of the three emoji ranges listed in the class.
_DISALLOWED_CHARS_RE = re.compile(
    r'[^a-zA-Z0-9\s\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF]'
)
# Filled on first use so that defining the function does not require the
# NLTK corpora to be available yet.
_NLP_TOOLS = None


def _get_nlp_tools():
    """Return (tokenizer, stop_words, lemmatizer), building them only once."""
    global _NLP_TOOLS
    if _NLP_TOOLS is None:
        # Assigned as a single tuple so a failed corpus lookup never leaves a
        # half-initialised cache behind.
        _NLP_TOOLS = (
            TweetTokenizer(),
            frozenset(nltk.corpus.stopwords.words('english')),
            nltk.stem.WordNetLemmatizer(),
        )
    return _NLP_TOOLS


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, drop ``http...`` URLs, strip every character
    that is not an ASCII letter, digit, whitespace or an emoji from the kept
    ranges (digits are retained), tokenize with ``TweetTokenizer``, remove
    English stop words, and lemmatize with ``WordNetLemmatizer``.

    Args:
        text: The raw tweet. Missing values (``None``/NaN, e.g. an empty CSV
            cell) give an empty string; other non-string values are
            converted with ``str()`` first.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # Previously this raised AttributeError on `.lower()`.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    tokenizer, stop_words, lemmatizer = _get_nlp_tools()

    # Lowercase first so upper-case URL schemes are also caught by the pattern.
    text = _URL_RE.sub('', text.lower())
    text = _DISALLOWED_CHARS_RE.sub('', text)

    tokens = tokenizer.tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [39]:
# Load the tweet dataset; later cells read its 'text' and 'sentiment' columns
df = pd.read_csv(filepath_or_buffer='merged_file.csv')

In [40]:
# Shuffle the dataset for randomization.
# Derived frames get their own names so `df` stays exactly as loaded: this
# cell can then be re-run on its own and always reproduces the same split.
# (Reassigning `df` here would shuffle the already-shuffled rows again on
# every re-run and silently change the train/test membership.)
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preprocess the data into a new frame; the raw tweets in `df` are preserved
df_clean = df_shuffled.assign(text=df_shuffled['text'].apply(preprocess_text))

# Split data into training and testing sets (80% / 20%)
# NOTE(review): the split is not stratified by label; consider passing
# `stratify=` if class balance matters. Left unchanged to keep results comparable.
X_train, X_test, y_train, y_test = train_test_split(
    df_clean['text'], df_clean['sentiment'], test_size=0.2, random_state=42
)

# Convert text data to TF-IDF vectors.
# The vocabulary is fitted on the training split only and then reused for
# the test split, so no test-set information leaks into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7034596375617792


In [41]:
# Show per-class precision, recall and F1 for the test-set predictions
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      0.02      0.04        50
     neutral       0.67      0.89      0.77       315
    positive       0.76      0.60      0.67       242

    accuracy                           0.70       607
   macro avg       0.81      0.50      0.49       607
weighted avg       0.74      0.70      0.67       607



In [51]:
# Try the trained model on a few hand-written tweets
new_tweets = [
    "This post was Great!",
    "how are you doing?",
    "you are amazing",
]

# Same pipeline as for the training data: clean, vectorize with the already
# fitted TF-IDF vocabulary, then classify.
preprocessed_new_tweets = list(map(preprocess_text, new_tweets))
tfidf_new_tweets = tfidf_vectorizer.transform(preprocessed_new_tweets)
predicted_sentiments = classifier.predict(tfidf_new_tweets)

print("Predicted sentiments for new tweets:")
for tweet, sentiment in zip(new_tweets, predicted_sentiments):
    # One call per tweet; the trailing "\n" leaves a blank line between entries
    print(f"- Tweet: {tweet}", f"  Sentiment: {sentiment}\n", sep="\n")

Predicted sentiments for new tweets:
- Tweet: This post was Great!
  Sentiment: positive

- Tweet: how are you doing?
  Sentiment: neutral

- Tweet: you are amazing
  Sentiment: positive

