# Sentiment Analysis with NLP
## Using TF-IDF and Logistic Regression

In [1]:

import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
# The CSV is resolved relative to the notebook's working directory rather than
# an absolute path inside one user's home folder, so the notebook can run on
# any machine. Change this single value if the data lives somewhere else.
file_path = "sentiment_tweets.csv"
df = pd.read_csv(file_path)

# Rename columns
# Positional rename: the file is expected to have exactly three columns
# (pandas raises ValueError on a length mismatch).
df.columns = ["index", "text", "label"]

# Text preprocessing function
# The patterns and the translation table are built once here instead of on
# every call, since the function is applied to each row of the dataset.
_URL_PATTERN = re.compile(r"http\S+|www\S+")  # "https..." is already covered by "http\S+"
_MENTION_OR_HASH_PATTERN = re.compile(r"@\w+|#")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Normalise one tweet for bag-of-words style feature extraction.

    Steps, in order: lowercase; drop URLs (tokens starting with ``http`` or
    ``www``); drop ``@mentions`` and the ``#`` symbol (the hashtag word itself
    is kept); drop all ASCII punctuation.

    Whitespace is left untouched, so removed tokens can leave runs of spaces.

    Args:
        text: The raw tweet. Missing values (``None``/NaN) are treated as an
            empty tweet; any other non-string value is converted with ``str``.

    Returns:
        The cleaned text as a ``str``.
    """
    if not isinstance(text, str):
        # Empty CSV cells come back from pandas as NaN (a float), which has
        # no .lower(); treat them as empty text instead of crashing.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = _URL_PATTERN.sub("", text)  # Remove URLs
    text = _MENTION_OR_HASH_PATTERN.sub("", text)  # Remove mentions and '#' symbols
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    return text

# Clean every raw tweet and store the result in its own column, leaving the
# original "text" column untouched.
df["cleaned_text"] = df["text"].map(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Turn the cleaned tweets into TF-IDF vectors, keeping at most 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# The vocabulary and IDF weights are learned from the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)

# ... and the test split is merely projected onto that fitted vocabulary.
X_test_tfidf = vectorizer.transform(X_test)

# Fit a logistic-regression classifier (library defaults) on the training
# features; fit() returns the estimator itself, so it can be bound directly.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict a label for every held-out tweet
y_pred = model.predict(X_test_tfidf)

# Score the held-out predictions: overall accuracy plus the per-class
# precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Show both results
print(f"Accuracy: {accuracy:.4f}\n")
print("Classification Report:\n", report)


Accuracy: 0.9830

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1614
           1       1.00      0.92      0.96       449

    accuracy                           0.98      2063
   macro avg       0.99      0.96      0.97      2063
weighted avg       0.98      0.98      0.98      2063

