In [3]:
import numpy as np
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('twitter_samples')

# Load the Twitter Samples dataset
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Preprocess the data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_data(data):
    processed_data = []
    for tweet in data:
        tweet = word_tokenize(tweet)
        tweet = ' '.join([lemmatizer.lemmatize(word.lower()) for word in tweet if word.isalpha()])
        tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
        processed_data.append(tweet)
    return processed_data

positive_tweets = preprocess_data(positive_tweets)
negative_tweets = preprocess_data(negative_tweets)

# Combine positive and negative tweets and create labels
tweets = positive_tweets + negative_tweets
labels = np.concatenate((np.ones(len(positive_tweets)), np.zeros(len(negative_tweets))))

# Convert tweets to numerical feature vectors
vectorizer = CountVectorizer(max_features=5000)
feature_vectors = vectorizer.fit_transform(tweets).toarray()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(feature_vectors, labels, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


Accuracy: 0.745
