In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset
df = pd.read_csv("twitter_training.csv")
df.drop(columns=['no', 'topic'], inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(keep='first', inplace=True)

# Map labels to numerical values
df['label'] = df['label'].map({'Positive': 1, 'Negative': 2, 'Neutral': 3, 'Irrelevant': 4})

# Define preprocessing function
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocessing(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.isalnum() and word.lower() not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to text column
df['text'] = df['text'].apply(preprocessing)

# Split data into features and labels
X = df['text']
y = df['label']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Vectorize text data using TF-IDF
tfidf = TfidfVectorizer(ngram_range=(1, 2))  # Using bigrams
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

lr = LogisticRegression()


print(f"Training...")
lr.fit(X_train_tfidf, y_train)
y_pred = lr.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy : {acc}")
print(f"Confusion Matrix :\n{confusion_matrix(y_test, y_pred)}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SVI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SVI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SVI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy : 0.8729396588791745
Confusion Matrix :
[[3435  180  162   64]
 [ 166 3927  182   35]
 [ 202  201 2883   63]
 [ 206  178  134 1936]]


In [12]:
user_input = input("Enter the text for sentiment analysis: ")

# Preprocess the user input
processed_input = preprocessing(user_input)

# Vectorize the processed input using TF-IDF
input_vectorized = tfidf.transform([processed_input])

# Predict the sentiment label using the Logistic Regression model
predicted_label = lr.predict(input_vectorized)[0]

# Map numerical label back to original sentiment
sentiment_mapping = {1: 'Positive', 2: 'Negative', 3: 'Neutral', 4: 'Irrelevant'}
predicted_sentiment = sentiment_mapping[predicted_label]

print(f"The predicted sentiment for the input text is: {predicted_sentiment}")

The predicted sentiment for the input text is: Negative
