In [1]:
# Multinomial Logistic Regression Analysis for tweets.

import os
import re, string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Location of the airline-tweets CSV. Set the TWEETS_CSV environment variable to
# run the notebook on another machine without editing code; the fallback is the
# original local path, so existing runs keep reading the same file.
TWEETS_CSV = os.environ.get(
    "TWEETS_CSV",
    "C:/Users/coryg/OneDrive/Desktop/STAT_574_Data_Mining/hw4STAT574S25/USAirlinesTweets.csv",
)

# Keep only the label and the raw text, and drop rows where either is missing.
tweet_df = pd.read_csv(TWEETS_CSV)
tweet_df = tweet_df[['sentiment', 'tweet']].dropna()

def preprocessing_tweets(tweets):
    """Normalise one raw tweet for bag-of-words counting.

    Steps, in order: lowercase the text, delete URLs, delete @mentions and
    bare '#' characters (the hashtag word itself is kept), then delete every
    character listed in ``string.punctuation``. Whitespace is left untouched,
    so removed tokens can leave leading or doubled spaces behind.

    Parameters
    ----------
    tweets : str
        The text of a single tweet.

    Returns
    -------
    str
        The cleaned text.
    """
    # Translation table that maps every ASCII punctuation character to nothing.
    punctuation_stripper = str.maketrans('', '', string.punctuation)

    lowered = tweets.lower()
    without_links = re.sub(r"http\S+|www\S+|https\S+", '', lowered, flags=re.MULTILINE)
    without_mentions = re.sub(r'\@[\w]+|\#', '', without_links)
    return without_mentions.translate(punctuation_stripper)

# Clean every tweet with preprocessing_tweets before counting words.
tweet_df['cleaned_tweets'] = tweet_df['tweet'].map(preprocessing_tweets)

# Text Vectorization to extract top 50 words from tweets
# (English stop words are excluded before the 50 most frequent terms are kept).

vectorization = CountVectorizer(stop_words='english', max_features=50)
X_mat = vectorization.fit_transform(tweet_df['cleaned_tweets'])

# Dense word-count table, one column per retained word. It gets a fresh
# 0..n-1 index, so it lines up with tweet_df by row position, not by index label.
X_df = pd.DataFrame(
    data=X_mat.toarray(),
    columns=vectorization.get_feature_names_out(),
)

# Encode the sentiment labels as integers; labeler.classes_ keeps the
# original label for each integer code.
y = tweet_df['sentiment']
labeler = LabelEncoder().fit(y)
y_encode = labeler.transform(y)

In [3]:
# Split into 80% training and 20% testing sets and run the model.
# The fixed random_state makes the partition reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_encode, test_size=0.2, random_state=5720255
)

# `multi_class` is intentionally not passed: the argument is deprecated in
# scikit-learn 1.5+ and raises a FutureWarning. With the lbfgs solver and more
# than two classes the default behaviour is already the multinomial (softmax)
# model, so the fitted model is unchanged.
multi_logistic = LogisticRegression(solver='lbfgs', max_iter=900, random_state=987022)
multi_logistic.fit(X_train, y_train)




In [4]:
# Words most associated with positive and with negative tweets.

# One row of coefficients per sentiment class, one column per vocabulary word.
coefs = pd.DataFrame(
    multi_logistic.coef_,
    index=labeler.classes_,
    columns=vectorization.get_feature_names_out(),
)


def _top_words(sentiment, how_many=10):
    """Return the `how_many` largest coefficients in the row for `sentiment`."""
    return coefs.loc[sentiment].sort_values(ascending=False).head(how_many)


top_pos = _top_words('positive')
top_neg = _top_words('negative')

print("Positive tweets associated with the following words:", top_pos, sep="\n")
print("\nNegative tweets associated with the following words:", top_neg, sep="\n")


Positive tweets associated with the following words:
thank      2.010026
great      1.656956
thanks     1.540171
good       0.899002
airline    0.605125
guys       0.456910
got        0.424672
really     0.392620
flying     0.349597
home       0.331437
Name: positive, dtype: float64

Negative tweets associated with the following words:
hours        1.339060
hold         1.234786
delayed      1.215004
hour         1.163527
delay        0.990636
waiting      0.865747
bag          0.842824
cancelled    0.809081
phone        0.794711
dont         0.785270
Name: negative, dtype: float64


In [5]:
# Computing prediction accuracy on testing set.
# In the confusion matrix, rows are the true classes and columns the predicted ones.

y_pred = multi_logistic.predict(X_test)

confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("accuracy: ", round(accuracy, 4))
print("confusion matrix: \n", confusion)

accuracy:  0.68
confusion matrix: 
 [[1735   33   51]
 [ 543   41   43]
 [ 256   11  215]]
