In [None]:
import nltk
import random
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.classify.util import accuracy as nltk_accuracy

# Download the NLTK resources this notebook relies on.
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases make word_tokenize() load the 'punkt_tab' resource
# instead of the pickled 'punkt' models; fetch both so tokenization works
# regardless of the installed NLTK version.
nltk.download('punkt_tab')

# Raw tweet texts, one string per tweet, for each sentiment class.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

import re
import string
# English stopword list from the NLTK corpus; preprocess() drops tokens found in it.
stopwords_english = stopwords.words('english')
# Single Porter stemmer instance reused by every preprocess() call.
stemmer = PorterStemmer()

# Patterns are compiled once here instead of being looked up on every call.
_TICKER_RE = re.compile(r'\$\w+')
_RETWEET_RE = re.compile(r'^RT[\s]+')
# Match only the URL itself (up to the next whitespace). The previous
# pattern used a greedy ".*" and therefore deleted everything that followed
# a link on the same line, silently discarding real tweet content.
_URL_RE = re.compile(r'https?://\S+')


def preprocess(tweet):
    """Clean a raw tweet and return its list of stemmed tokens.

    Steps, in order:
      1. strip stock tickers such as ``$GE``;
      2. strip a leading old-style retweet marker ``RT``;
      3. strip hyperlinks (only the link, not the text after it);
      4. drop the ``#`` sign from hashtags, keeping the word;
      5. tokenize with ``word_tokenize``;
      6. discard stopwords and punctuation-only tokens, stem the rest.

    Args:
        tweet: The raw tweet text.

    Returns:
        A list of stemmed token strings (possibly empty).
    """
    tweet = _TICKER_RE.sub('', tweet)
    tweet = _RETWEET_RE.sub('', tweet)
    tweet = _URL_RE.sub('', tweet)
    tweet = tweet.replace('#', '')

    tweets_clean = []
    for word in word_tokenize(tweet):
        # The stopword list is lowercase, so compare case-insensitively;
        # otherwise capitalised stopwords ("The", "And") slip through.
        if word.lower() in stopwords_english:
            continue
        # Skip tokens made up entirely of punctuation characters. A plain
        # substring test against string.punctuation misses multi-character
        # tokens such as "..." because they are not contiguous in that string.
        if all(char in string.punctuation for char in word):
            continue
        tweets_clean.append(stemmer.stem(word))

    return tweets_clean

# Run every raw tweet of each sentiment class through preprocess(),
# producing one token list per tweet.
positive_tweets_clean = [preprocess(raw_tweet) for raw_tweet in positive_tweets]
negative_tweets_clean = [preprocess(raw_tweet) for raw_tweet in negative_tweets]

def get_tweets_for_model(cleaned_tweets):
    """Lazily yield one feature dict per tweet.

    Each yielded dict maps every distinct token of the tweet to ``True``
    (a bag-of-words "token is present" representation).

    Args:
        cleaned_tweets: An iterable of token sequences, one per tweet.

    Yields:
        A ``{token: True}`` dict for each tweet, in input order.
    """
    for tokens in cleaned_tweets:
        features = dict.fromkeys(tokens, True)
        yield features

# Feature-dict generators for each class (consumed once, just below).
positive_tweets_model = get_tweets_for_model(positive_tweets_clean)
negative_tweets_model = get_tweets_for_model(negative_tweets_clean)

# Attach the class label to every feature dict.
positive_dataset = [(features, 'Positive') for features in positive_tweets_model]
negative_dataset = [(features, 'Negative') for features in negative_tweets_model]

# Merge both classes, shuffle in place, then split:
# the first 7000 examples are for training, the remainder for testing.
dataset = [*positive_dataset, *negative_dataset]
random.shuffle(dataset)
train_data, test_data = dataset[:7000], dataset[7000:]

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def tokens_to_string(tweet_tokens):
    """Return the items of *tweet_tokens* joined by single spaces.

    Any iterable of strings is accepted; when a dict is passed, its keys
    are what gets joined (iteration order).
    """
    separator = ' '
    return separator.join(tweet_tokens)

# Space-joined text for every example; each example is a
# (feature_dict, label) pair, so the dict's keys are what get joined.
train_data_str = [tokens_to_string(example[0]) for example in train_data]
test_data_str = [tokens_to_string(example[0]) for example in test_data]

# Matching class labels, in the same order as the texts above.
train_labels = [example[1] for example in train_data]
test_labels = [example[1] for example in test_data]

# Space-joined text of every cleaned tweet (all positives, then all negatives).
# Not referenced again in the visible part of this notebook.
train_data_strings = list(map(' '.join, positive_tweets_clean + negative_tweets_clean))

# Bag-of-words counts: the vocabulary is learned from the training texts only
# and then reused unchanged for the test texts.
vectorizer = CountVectorizer(
    max_features=5000,
    min_df=5,
    max_df=0.7,
)
X_train = vectorizer.fit_transform(train_data_str).toarray()
X_test = vectorizer.transform(test_data_str).toarray()

# Fit a random forest on the training matrix (fit() returns the estimator).
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(X_train, train_labels)

# Score the held-out examples and report the fraction classified correctly.
predictions = classifier.predict(X_test)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

# Sanity check: classify one hand-written sentence with the trained model.
test_string = "I love this place, it's amazing!"
test_string_clean = preprocess(test_string)

# The vectorizer expects an iterable of documents, hence the one-element list.
test_string_vector = vectorizer.transform(
    [' '.join(test_string_clean)]
).toarray()

result = classifier.predict(test_string_vector)
print("Test string result:", result[0])


Accuracy: 0.7346666666666667
Test string result: Positive
