In [198]:
import os
from glob import glob
import numpy
import re
import nltk

In [199]:
# loading the data, initializing empty lists of train and test data to store once they are read from the files.

# Data handling process, loading and preparing dataset 
data_path = "C:\\Users\\ashri\\Documents\\AIT-526\\tweet\\tweet"
train_data = []
train_labels = []
test_data = []
test_labels = []

In [200]:
## Organizing data for training and testing.
## In here filepath is used for joining the base path. Later, it iterates over the two types of sentimetns within each subset.
## If statement cehecks if the subset id being processed or not, if so hten it appends the content of file to train_data and it not it appends to test data.

for subset in ["train", "test"]:
    subset_path = os.path.join(data_path, subset)
    for label in ["positive", "negative"]:
        label_path = os.path.join(subset_path, label)
        for filepath in glob(os.path.join(label_path, "*.txt")):
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
                if subset == "train":
                    train_data.append(text)
                    train_labels.append("pos" if label == "positive" else "neg")
                else:
                    test_data.append(text)
                    test_labels.append("pos" if label == "positive" else "neg")

In [201]:
train_data[:5], train_labels[:5]

(['@SouthwestAir I would appreciate that.  Thank you.\n',
  '@USAirways thank you very much.\n',
  "@JetBlue I'm all set. About to fly. Not bad for a first date with a giant metal bird machine. She even brought snacks.\n",
  '@SouthwestAir I got a flight at 11:55am on Thursday but looking for something tomorrow anything available?\n',
  "@AmericanAir you're my early frontrunner for best airline! #oscars2016\n"],
 ['pos', 'pos', 'pos', 'pos', 'pos'])

In [202]:
train_data[2000:2004], train_labels[2002:2004]

(['@SouthwestAir wifi on my plane but I gotta pay for it? Help your broke homegirl out✈️📱\n',
  "@united we're stuck at OGG looks like flight will be Cancelled Flightled. Can you help? =)\n",
  '@united WTH be honest with your customers.  This better be the last change or we are driving home.  Has our plane left or not!\n',
  '@united Freakin"\n'],
 ['neg', 'neg'])

In [203]:
print(len(train_data))
print(train_data[1])

4181
@USAirways thank you very much.



In [204]:
from nltk.stem import SnowballStemmer, PorterStemmer
from bs4 import BeautifulSoup
import emoji
from nltk.tokenize import WordPunctTokenizer
from collections import defaultdict
from collections import Counter

In [205]:
# store tweets in a list
preprocessed_tweets = []

#stemmer = PorterStemmer()

def preprocess(tweet, stem=False, stemmer_type = 'porter'):
    # removing HTML tags
    soup = BeautifulSoup(tweet, "html.parser")
    tweet = soup.get_text()
    
    #lowere case words starting with capital letter
    tweet = re.sub(r'\b([A-Z][a-z]+)\b', lambda m: m.group(0).lower(), tweet)
    
    # trabslaye emojis to text
    tweet = emoji.demojize(tweet)
    
    #Tokenize using WordPunctTokenizer to remove the whitespaces
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(tweet)
    # replace unnecessary punctuations with whitespaces
    
    tweet = re.sub(r'[^\w\s]', ' ', tweet)
    
    #tokens = word_tokenize(sample_tweet)
    preprocessed_tweets = ' '.join(tokens)
    
    #stemming
    if stem:
        if stemmer_type == 'porter':
            stemmer = PorterStemmer()
        elif stemmer_type == 'snowball':
            stemmer = SnowballStemmer('english')
        else:
            raise ValueError("invalid")
            #tokens = [stemmer.stem(token) for token in tokens]

        preprocessed_tweets = ' '.join([stemmer.stem(token) for token in preprocessed_tweets.split()])
    preprocessed_tweet = ' '.join(tokens)
    return preprocessed_tweets


In [206]:
## If stem is True then it applies stemming based on the specified type by reducing word to its root form.
# Assuming preprocessed_tweets is a list of preprocessed tweet texts
preprocessed_tweets = [preprocess(tweet, stem=True) for tweet in train_data]

# Print the preprocessed text of the first 5 tweets
for i, preprocessed_text in enumerate(preprocessed_tweets[:5]):  # Limiting to first 5 for demonstration
    print(f"Preprocessed Tweet {i+1}: {preprocessed_text}\n")


  soup = BeautifulSoup(tweet, "html.parser")


Preprocessed Tweet 1: @ southwestair i would appreci that . thank you .

Preprocessed Tweet 2: @ usairway thank you veri much .

Preprocessed Tweet 3: @ jetblu i ' m all set . about to fli . not bad for a first date with a giant metal bird machin . she even brought snack .

Preprocessed Tweet 4: @ southwestair i got a flight at 11 : 55am on thursday but look for someth tomorrow anyth avail ?

Preprocessed Tweet 5: @ americanair you ' re my earli frontrunn for best airlin ! # oscars2016



In [207]:

def create_vocabularies(preprocessed_tweets, stemming= False):
    positive_words = set()
    negative_words = set()

    # Loop through each preprocessed tweet
    for tweet in preprocessed_tweets:
        words = tweet.split()
        
        # Update positive or negative word sets based on certain keywords
        if 'good' in words or 'positive' in words:
            positive_words.update(words)
        elif 'bad' in words or 'negative' in words:
            negative_words.update(words)

    # Create vocabularies with binary sentiment values
    positive_vocabulary = {word: 1 for word in positive_words}
    negative_vocabulary = {word: 0 for word in negative_words}

    return positive_vocabulary, negative_vocabulary

In [208]:
# Assuming preprocessed_tweets is defined
#preprocessed_tweets = [...]  # Your preprocessed tweets here

# Generate positive and negative vocabularies
positive_vocabulary, negative_vocabulary = create_vocabularies(preprocessed_tweets, stemming=False)

print(list(positive_vocabulary.items())[1:10])

print(list(negative_vocabulary.items())[1:10])

[('agent', 1), ('look', 1), ('garbag', 1), ('sylvi', 1), ('poteettj', 1), ('took', 1), ('have', 1), ('nc0es6e4lf', 1), ('day', 1)]
[('agent', 0), ('look', 0), ('bicycl', 0), ('rant', 0), ('tag', 0), ('took', 0), ('day', 0), ('have', 0), ('socialtantrum', 0)]


In [227]:
def train_nb(train_data, train_labels):
    positive_words = defaultdict(int)
    negative_words = defaultdict(int)
    n_pos = n_neg = 0

    # Instead of using a separate vocabulary creation function,
    # directly count word occurrences in positive and negative tweets
    for text, label in zip(train_data, train_labels):
        tokens = preprocess(text, stem=True)
        if label == 'pos':
            n_pos += 1
            for token in tokens:
                positive_words[token] += 1
        else:
            n_neg += 1
            for token in tokens:
                negative_words[token] += 1

    # Combine positive and negative words to form a complete vocabulary
    vocab = set(positive_words.keys()).union(set(negative_words.keys()))
    n_total = n_pos + n_neg
    prior_pos = n_pos / n_total
    prior_neg = n_neg / n_total

    # Calculate likelihoods using word counts
    pos_likelihoods = {word: (positive_words[word] + 1) / (sum(positive_words.values()) + len(vocab)) for word in vocab}
    neg_likelihoods = {word: (negative_words[word] + 1) / (sum(negative_words.values()) + len(vocab)) for word in vocab}
    
    return vocab, prior_pos, prior_neg, pos_likelihoods, neg_likelihoods


In [228]:
# Classify text
def classify_naive_bayes(text, vocab, p_pos, p_neg, pos_likelihoods, neg_likelihoods):
    tokens = preprocess(text, stem=True)
    
    p_pos_text = prior_pos
    p_neg_text = prior_neg
    
    for token in tokens:
        if token in vocab:
            p_pos_text *= pos_likelihoods[token]
            p_neg_text *= neg_likelihoods[token]
        else:
            # Ignore words not in the vocabulary
            pass
    
    return 'pos' if p_pos_text > p_neg_text else 'neg'

In [229]:
def calculate_confusion_matrix(predicted, actual):
    confusion_matrix = defaultdict(int)
    
    for pred, act in zip(predicted, actual):
        confusion_matrix[(pred, act)] += 1
    
    return confusion_matrix


In [230]:
def calculate_accuracy(confusion_matrix):
    total = sum(confusion_matrix.values())
    correct = confusion_matrix[('pos', 'pos')] + confusion_matrix[('neg', 'neg')]
    return correct / total

def calculate_precision(confusion_matrix, class_index):
    true_pos = confusion_matrix.get((class_index, class_index), 0)
    false_neg = sum(count for (pred, act), count in confusion_matrix.items() if pred != class_index and act == class_index)
    return true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0


def calculate_recall(confusion_matrix, class_label):
    true_pos = confusion_matrix.get((class_label, class_label),0)
    false_neg = sum(count for (pred, act), count in confusion_matrix.items() if pred != class_label and act == class_label)
    return true_pos / (true_pos + false_neg)

def calculate_f1_score(precision, recall):
    return 2 * precision * recall / (precision + recall)


In [231]:
vocab, prior_pos, prior_neg, pos_likelihoods, neg_likelihoods = train_nb(train_data, train_labels)


  soup = BeautifulSoup(tweet, "html.parser")


In [233]:
predictions = []
for text in test_data:
    prediction = classify_naive_bayes(text, vocab, prior_pos, prior_neg, pos_likelihoods, neg_likelihoods)
    predictions.append(prediction)

  soup = BeautifulSoup(tweet, "html.parser")


In [234]:
confusion_matrix = calculate_confusion_matrix(predictions, test_labels)
accuracy = calculate_accuracy(confusion_matrix)
#print(accuracy)
precision_pos = calculate_precision(confusion_matrix, 'pos')
precision_neg = calculate_precision(confusion_matrix, 'neg')
recall_pos = calculate_recall(confusion_matrix, 'pos')
recall_neg = calculate_recall(confusion_matrix, 'neg')
f1_pos = calculate_f1_score(precision_pos, recall_pos)
f1_neg = calculate_f1_score(precision_neg, recall_neg)

In [235]:
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision (Positive): {precision_pos:.2f}")
print(f"Precision (Negative): {precision_neg:.2f}")
print(f"Recall (Positive): {recall_pos:.2f}")
print(f"Recall (Negative): {recall_neg:.2f}")
print(f"F1-Score (Positive): {f1_pos:.2f}")
print(f"F1-Score (Negative): {f1_neg:.2f}")
print("Confusion Matrix:")
print(confusion_matrix)

Accuracy: 78.62%
Precision (Positive): 0.51
Precision (Negative): 0.90
Recall (Positive): 0.51
Recall (Negative): 0.90
F1-Score (Positive): 0.51
F1-Score (Negative): 0.90
Confusion Matrix:
defaultdict(<class 'int'>, {('pos', 'pos'): 599, ('neg', 'pos'): 583, ('neg', 'neg'): 2689, ('pos', 'neg'): 311})


In [236]:
# save the file :
outputs = f""" 
model Performance: Navie Bayes
Accuracy: {accuracy*100:.2f}%
Precision (Positive): {precision_pos:.2f}
Precision (Negative): {precision_neg:.2f}
Recall (Positive): {recall_pos:.2f}
Recall (Negative): {recall_neg:.2f}
F1-Score (Positive): {f1_pos:.2f}
F1-Score (Negative): {f1_neg:.2f}
Confusion Matrix:{confusion_matrix}
"""

# Replace 'your_directory_path' with your actual directory path and choose a suitable filename
filename = "C:\\Users\\ashri\\Documents\\AIT-526\\tweet\\ouputs\\model_performance.txt"

# Write to the file
with open(filename, "w") as file:
    file.write(output_content)

print(f"Output saved to {filename}")


Output saved to C:\Users\ashri\Documents\AIT-526\tweet\ouputs\model_performance.txt


#### Bonus point: 
how would the results change if you used term frequency x inverse document frequency instead of binary representation for Naïve Bayes?  
How do your results change if you regularize your logistic regression?

In [237]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [238]:
#tf idf
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_data)
X_train_tf = tf_idf.transform(train_data)

In [239]:
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 4181, n_features: 7132


In [240]:
X_test_tf = tf_idf.transform(test_data)

print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 4182, n_features: 7132


In [241]:
#naive bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_labels)
y_pred = naive_bayes_classifier.predict(X_test_tf) # predicted Y

In [242]:
print(metrics.classification_report(test_labels, y_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.78      1.00      0.87      3000
    Negative       0.99      0.27      0.42      1182

    accuracy                           0.79      4182
   macro avg       0.88      0.63      0.65      4182
weighted avg       0.84      0.79      0.75      4182



In [243]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_labels, y_pred))

Confusion matrix:
[[2997    3]
 [ 864  318]]


In [244]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model with L2 regularization
model_l2 = LogisticRegression(penalty='l2', C=1.0)  # C is the inverse of regularization strength

# With L1 regularization
model_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)

# Train the model using the same training data as above
model_l2.fit(X_train_tf,train_labels)
model_l1.fit(X_train_tf, train_labels)

# Assuming X_train_tfidf is your TF-IDF transformed training data
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data)

In [245]:
from sklearn.metrics import classification_report
# Transform the test data with the TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(test_data)

#predict the labels using ridge and lasso regression such that it will minimize both loss term and Regularization term
y_pred_l2 = model_l2.predict(X_test_tfidf)
y_pred_l1 = model_l1.predict(X_test_tfidf)

# Print the classification reports for L1 and L2 Regularized Logistic Regression
print("Logistic Regression with L2 Regularization Classification Report:")
print(classification_report(test_labels, y_pred_l2, target_names=['Positive', 'Negative']))

print("Logistic Regression with L1 Regularization Classification Report:")
print(classification_report(test_labels, y_pred_l1, target_names=['Positive', 'Negative']))


Logistic Regression with L2 Regularization Classification Report:
              precision    recall  f1-score   support

    Positive       0.88      0.98      0.92      3000
    Negative       0.93      0.65      0.76      1182

    accuracy                           0.89      4182
   macro avg       0.90      0.81      0.84      4182
weighted avg       0.89      0.89      0.88      4182

Logistic Regression with L1 Regularization Classification Report:
              precision    recall  f1-score   support

    Positive       0.88      0.96      0.92      3000
    Negative       0.86      0.68      0.76      1182

    accuracy                           0.88      4182
   macro avg       0.87      0.82      0.84      4182
weighted avg       0.88      0.88      0.87      4182



#### Conclusion

So, to summarize this project,right after loading the data, it involves preprocessing the data, cleaning it, calculating frequencies of words individually, and then using those frequencies to compute the likelihoods, prior probabilities to make predictions. 

After predictions are made we assesed model performance, by calculating confusion matrix and printing classification report.

Finally, we got an accuracy of 78% with overall presicion, recall, and F-1 score for positive(51%), negative(90%).

But, on the other hand while doing it with Term frequency, IDF anf Logistic regression we found that model was balanced more then above report by accuracy of 79% with TF-IDF representation
and L2(ridge regression) regularization imporoved accuracy to 89%.