In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

import pandas as pd
import numpy as np

# Miscellaneous
import string
from typing import List,Dict



In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/dan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv('twitter_training.csv', encoding='latin-1')

training_data = data.iloc[:,2:]
training_data.columns = ['Sentiment', 'Tweet']
training_data.dropna(inplace=True)
training_data = training_data[training_data['Sentiment'].isin(['Positive', 'Negative'])]
training_data['SentimentBinary'] = training_data['Sentiment'].apply(lambda x: 1 if x == 'Positive' else 0)
training_data.reset_index(drop=True,inplace=True) # Reset annoying index

positive_training_data = training_data[training_data['Sentiment'] == 'Positive']
negative_training_data = training_data[training_data['Sentiment'] == 'Negative']



In [4]:
training_data['SentimentBinary'][0]

1

In [5]:
# Tweet Pre-Processing
def process_tweet(tweet:str):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tweet = tweet.lower() # Convert to lower case
    tweet = word_tokenize(tweet,language='english')
    tweet = [word for word in tweet if word not in string.punctuation] # Removes punctuation
    tweet = [word for word in tweet if word not in stop_words] # Removes stopwords
    tweet = [stemmer.stem(word) for word in tweet] # Converts word to its stem
    return tweet

In [6]:
def build_freqs(data:pd.DataFrame):
    freqs = {}
    for i,tweet in enumerate(data['Tweet']):
        for word in process_tweet(tweet):
            pair = (word,data['SentimentBinary'][i])
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
            
    return freqs

In [7]:
# Get Frequence for tweet
freqs = build_freqs(training_data)

In [38]:
# Extract tweet features

def extract_features(tweet:str,freqs:Dict):
    x = np.zeros((1,3))
    # Set bias to 1
    x[:,0] = 1
    w_list = process_tweet(tweet)
    for word in w_list:
        for key,value in freqs.items():
            if (word,1) == key:
                x[:,1] += value
            if (word,0) == key:
                x[:,2] += value
    assert(x.shape == (1,3))
    return x

In [None]:
# x_train = np.zeros((len(training_data),3))
x_train = np.vstack([extract_features(tweet, freqs) for tweet in training_data['Tweet']])
y_train = np.array(training_data['SentimentBinary'])
# This thing took like 15 minutes, makes me appreciate ther power vectorized implementations

In [27]:
x_train.shape[0] == len(y_train)

True

In [28]:
# Training
model = LogisticRegression()
model.fit(x_train,y_train)

In [23]:
data2 = pd.read_csv('twitter_validation.csv', encoding='latin-1')

test_data = data2.iloc[:,2:]
test_data.columns = ['Sentiment', 'Tweet']
test_data.dropna(inplace=True)
test_data = training_data[training_data['Sentiment'].isin(['Positive', 'Negative'])]
test_data['SentimentBinary'] = training_data['Sentiment'].apply(lambda x: 1 if x == 'Positive' else 0)
test_data.reset_index(drop=True,inplace=True) # Reset annoying index

positive_test_data = training_data[training_data['Sentiment'] == 'Positive']
negative_test_data = training_data[training_data['Sentiment'] == 'Negative']

In [24]:
freq2 = build_freqs(test_data)
x_test = np.vstack([extract_features(tweet, freq2) for tweet in test_data['Tweet']])
y_test = np.array(test_data['SentimentBinary'])

In [30]:
y_predict = model.predict(x_test)
print("Model Accuracy: ",accuracy_score(y_test,y_predict))
print("Model Precision: ",precision_score(y_test,y_predict))

Model Accuracy:  0.7618804054682414
Model Precision:  0.7458443520967133


In [None]:
# Vectorized Solution
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(training_data['Tweet'])
x_test_vectorized = vectorizer.transform(test_data['Tweet'])

model2 = LogisticRegression()
model2.fit(x_train_vectorized,y_train)

y_predict2 = model2.predict(x_test_vectorized)
print("Model Accuracy: ",accuracy_score(y_test,y_predict2))
print("Model Precision: ",precision_score(y_test,y_predict2))

# The vectorized solution has better validation accuracy than the non-vectorized solution

Model Accuracy:  0.9503859388077746
Model Precision:  0.9434441145484149


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def make_prediction_non_vectorized(tweet:str,freqs:Dict):
    x = extract_features(tweet,freqs)
    return model.predict(x)[0]
sample_tweet = input("Enter a tweet: ")
print("Sentiment: ", "Positive" if make_prediction_non_vectorized(sample_tweet,freqs) == 1 else "Negative")


Sentiment:  Positive


In [48]:
def make_prediction_vectorized(tweet):
    x = vectorizer.transform([tweet])
    return model2.predict(x)[0]
sample_tweet = input("Enter a tweet: ")
print("Sentiment: ", "Positive" if make_prediction_vectorized(sample_tweet) == 1 else "Negative")

Sentiment:  Positive
