# Sentiment Analysis of Tweets using NLTK and scikit-learn in Python

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fetch the NLTK "stopwords" corpus, then keep the English list as a set
# so membership tests are constant-time.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}


## Load Dataset

In [None]:

# Read the labelled tweets from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="twitter_sentiment.csv")

# Quick sanity check: first five rows, then how many tweets each label has.
print(df.head(n=5))
print(df.loc[:, "sentiment"].value_counts())


## Text Cleaning

In [None]:

# Patterns and the punctuation table are built once instead of on every call.
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
_DIGITS_RE = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_tweet(text):
    """Normalise a raw tweet into lowercase, whitespace-separated words.

    Steps, in order: lowercase; drop URLs (``http...`` / ``www...``); drop
    @mentions and #hashtags; drop digit runs; strip ASCII punctuation;
    collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty tweet.

    Returns:
        The cleaned tweet as a ``str``; ``""`` when nothing is left.
    """
    # Missing values arrive as None/NaN, which have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # URLs, mentions and hashtags must go before punctuation is stripped,
    # otherwise their marker characters (":", "/", "@", "#") are already gone.
    text = _URL_RE.sub("", text)
    text = _MENTION_HASHTAG_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    text = text.translate(_PUNCT_TABLE)
    # Removed tokens leave gaps behind; split/join collapses them and trims.
    return " ".join(text.split())

# Run every raw tweet through clean_tweet and store the result next to it.
df["clean_tweet"] = df["tweet"].map(clean_tweet)


## EDA Charts

In [None]:

# Size features derived from the cleaned text:
#   tweet_length -> number of characters
#   word_count   -> number of whitespace-separated tokens
df["tweet_length"] = df["clean_tweet"].map(len)
df["word_count"] = df["clean_tweet"].map(lambda tweet: len(tweet.split()))

# Chart 1: how many tweets fall under each sentiment label.
sns.countplot(data=df, x="sentiment")
plt.title("Target Variable Distribution")
plt.show()

# Chart 2: histogram of cleaned-tweet lengths, 30 bins.
plt.hist(x=df["tweet_length"], bins=30)
plt.title("Tweet Length Distribution")
plt.show()


## Feature Extraction

In [None]:

y = df["sentiment"]

# Split the raw text BEFORE fitting TF-IDF. Fitting the vectorizer on the full
# corpus would let the held-out tweets influence the vocabulary and IDF
# weights, which leaks test information into training and inflates the scores.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    df["clean_tweet"], y, test_size=0.2, random_state=42
)

# Learn the vocabulary (capped at the 5000 top terms) and IDF weights from
# the training tweets only, then project the test tweets onto that space.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(tweets_train)
X_test = vectorizer.transform(tweets_test)

# Full-corpus matrix, kept so any later cell that refers to `X` still works.
# It uses the training-fitted vectorizer, so it carries no test-set leakage.
X = vectorizer.transform(df["clean_tweet"])


## Model Training

In [None]:

# Each classifier is fitted on the training split, asked to label the test
# split, and scored by plain accuracy. fit() returns the estimator itself,
# so construction and fitting are chained.

# --- Bernoulli Naive Bayes ---
nb_model = BernoulliNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_nb))

# --- Linear support vector machine ---
svm_model = LinearSVC().fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_svm))

# --- Logistic regression (iteration cap raised to 1000) ---
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_lr))


## Confusion Matrix

In [None]:

# Pin the class order explicitly so the matrix rows/columns and the tick
# labels are guaranteed to line up, even if a class is absent from y_test.
class_labels = lr_model.classes_
cm = confusion_matrix(y_test, y_pred_lr, labels=class_labels)

# Rows are the true labels, columns the predicted labels; without tick and
# axis labels the heatmap cannot be read unambiguously.
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Logistic Regression Confusion Matrix")
plt.show()

# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, y_pred_lr))


## Predict New Tweet Sentiment

In [None]:

def predict_sentiment(text):
    """Predict the sentiment label of a single raw tweet.

    The tweet is cleaned with ``clean_tweet``, vectorised with the fitted
    module-level ``vectorizer`` and classified by ``lr_model``.

    Args:
        text: The raw tweet text.

    Returns:
        The label predicted by ``lr_model`` for this tweet.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([clean_tweet(text)])
    predictions = lr_model.predict(features)
    return predictions[0]

# Smoke-test the predictor on one clearly positive and one clearly negative tweet.
for sample_tweet in (
    "I love this product!",
    "This is the worst experience ever",
):
    print(predict_sentiment(sample_tweet))
