In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK data files used by preprocess().
# 'punkt' covers older NLTK releases; newer releases (3.9+) make word_tokenize
# load the 'punkt_tab' resource instead and raise LookupError when it is
# missing, so both are requested. On a release that does not know a given
# package, nltk.download() reports the problem and returns False rather than
# raising.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize lemmatizer and stop words.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test in preprocess() is O(1).
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn one raw text value into a space-joined string of lemmatized tokens.

    Non-string inputs (for example NaN) produce an empty string. For strings,
    every non-word character is replaced by a space, the text is lowercased
    and tokenized, tokens found in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized.

    Args:
        text: The value to clean; anything that is not a ``str`` is treated
            as an empty document.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Guard clause: anything that is not text becomes an empty document.
    if not isinstance(text, str):
        return ""

    # Blank out special characters first, then normalize the case.
    cleaned = re.sub(r'\W', ' ', text).lower()

    kept = []
    for token in word_tokenize(cleaned):
        # Stop words are filtered on their surface form, before lemmatizing.
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Load the dataset and drop every row that has a missing value.
df = pd.read_csv('/content/Twitter_Data.csv')
df = df.dropna()

# Split the data into training and validation sets
training_data, validation_data = train_test_split(df, test_size=0.2, random_state=71)

# The split results are row selections of df. Take explicit copies before
# mutating them so that adding 'processed_text' below works on independent
# frames and cannot trigger pandas' SettingWithCopyWarning. df is left as loaded.
training_data = training_data.copy()
validation_data = validation_data.copy()

# Rename the columns to the names the rest of the pipeline uses.
# NOTE(review): assumes the CSV has exactly two columns, text first and label
# second — confirm against the file.
columns = ["content", "sentiment"]
training_data.columns = columns
validation_data.columns = columns

# Preprocess the text data for both training and validation sets
training_data['processed_text'] = training_data['content'].apply(preprocess)
validation_data['processed_text'] = validation_data['content'].apply(preprocess)

# Target labels for each split
y_train = training_data['sentiment']
y_val = validation_data['sentiment']

# Bag-of-words features: the vocabulary (capped at 10,000 terms) is fitted on
# the training text only and then reused unchanged for the validation text.
vectorizer = CountVectorizer(max_features=10000)
train_counts = vectorizer.fit_transform(training_data['processed_text'])
val_counts = vectorizer.transform(validation_data['processed_text'])

# Scale the counts (optional for Bag of Words, but can sometimes help).
# with_mean=False scales without centering; the scaler statistics come from
# the training split only.
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(train_counts)
X_val = scaler.transform(val_counts)

# Train the classifier; fit() returns the fitted estimator itself.
model = LogisticRegression(solver='liblinear', max_iter=200).fit(X_train, y_train)

# Predict on the held-out validation split
y_pred = model.predict(X_val)

# Report overall accuracy followed by the per-class breakdown
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_val, y_pred))