In [14]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Word lists and lexicons in nltk: https://www.nltk.org/howto/corpus.html#word-lists-and-lexicons
# Download the 'vader_lexicon' resource. The call logs its progress to the cell
# output and reports "already up-to-date" when a local copy exists.
nltk.download('vader_lexicon')
# Analyzer instance reused by the later cell that scores every tweet.
sia = SentimentIntensityAnalyzer()
# Smoke test on a sample sentence: the result is a dict with the keys
# 'neg', 'neu', 'pos' and 'compound'.
sia.polarity_scores("This is a really great tweet!")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danli\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'neg': 0.0, 'neu': 0.461, 'pos': 0.539, 'compound': 0.6893}

In [15]:
import pandas as pd
# Read the AWS tweets CSV and discard its existing 'Sentiment' column in a
# single chained expression, so no frame is mutated in place.
df_tweets = (
    pd.read_csv('https://www.ishelp.info/data/tweets_aws.csv')
    .drop(columns=['Sentiment'])
)

# Preview the first rows as the cell's output.
df_tweets.head()
      
# Output
# # See the output in your own notebook


Unnamed: 0,Gender,Weekday,Hour,Day,Reach,RetweetCount,Klout,text
0,Male,Monday,23,2,4037,1,52,Amazon Web Services is becoming a nice predict...
1,Unknown,Friday,12,4,524418,21,72,Announcing four new VPN features in our Sao Pa...
2,Unknown,Tuesday,9,31,1748,1,46,Are you an @awscloud user? Use #Zadara + #AWS ...
3,Unknown,Saturday,3,27,1179,1,0,AWS CloudFormation Adds Support for Amazon VPC...
4,Unknown,Saturday,3,27,1179,1,0,AWS CloudFormation Adds Support for Amazon VPC...


In [16]:
# Score every tweet with the VADER analyzer and store the four scores as new
# columns of df_tweets.
#
# Maps each key of the polarity_scores() result to the column that receives it.
# The order matches the order in which the columns are appended to the frame.
score_columns = {
    'compound': 'sentiment_overall',
    'neg': 'sentiment_neg',
    'neu': 'sentiment_neu',
    'pos': 'sentiment_pos',
}

# Select the tweet text by column name instead of by tuple position, so the
# cell keeps scoring the right column if the frame's columns are added,
# removed or reordered upstream.
tweet_scores = [sia.polarity_scores(tweet_text) for tweet_text in df_tweets['text']]

# Assign each score as a whole column. List assignment is positional, which
# avoids one label-based .loc write per cell and cannot touch the wrong rows
# when index labels repeat.
for score_key, column_name in score_columns.items():
    df_tweets[column_name] = [scores[score_key] for scores in tweet_scores]

df_tweets.head()

Unnamed: 0,Gender,Weekday,Hour,Day,Reach,RetweetCount,Klout,text,sentiment_overall,sentiment_neg,sentiment_neu,sentiment_pos
0,Male,Monday,23,2,4037,1,52,Amazon Web Services is becoming a nice predict...,0.7757,0.0,0.508,0.492
1,Unknown,Friday,12,4,524418,21,72,Announcing four new VPN features in our Sao Pa...,0.0,0.0,1.0,0.0
2,Unknown,Tuesday,9,31,1748,1,46,Are you an @awscloud user? Use #Zadara + #AWS ...,0.0,0.0,1.0,0.0
3,Unknown,Saturday,3,27,1179,1,0,AWS CloudFormation Adds Support for Amazon VPC...,0.6249,0.0,0.711,0.289
4,Unknown,Saturday,3,27,1179,1,0,AWS CloudFormation Adds Support for Amazon VPC...,0.6249,0.0,0.711,0.289


In [21]:
import pandas as pd
import statsmodels.api as sm

# Convert categorical variables to dummy variables. drop_first=True leaves out
# the first level of each variable, which then acts as the reference category.
df_dummy = pd.get_dummies(df_tweets, columns=['Gender', 'Weekday'], drop_first=True)

# Define dependent and independent variables
y = df_dummy['RetweetCount']

# The raw text and the sentiment score columns are excluded from the predictors.
X = df_dummy.drop(columns=['text', 'RetweetCount', 'sentiment_overall', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos'])

# Coerce every predictor to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
X = X.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values BEFORE casting. An integer cast raises on NaN, so
# casting first would abort the cell instead of letting this step remove the
# offending rows.
X = X.dropna()

# Cast to float so the bool dummy columns become 0.0/1.0 and the design matrix
# has a single numeric dtype. Unlike an int cast, this does not truncate
# fractional values.
X = X.astype(float)

# Add constant term
X = sm.add_constant(X)

# Align y with X so rows dropped from the predictors are dropped from the
# response as well.
y = y.loc[X.index]

# Fit and summarize model. The fitted result is kept in a variable so that
# coefficients and diagnostics can be inspected in later cells.
ols_results = sm.OLS(y, X).fit()
print(ols_results.summary())


                            OLS Regression Results                            
Dep. Variable:           RetweetCount   R-squared:                       0.246
Model:                            OLS   Adj. R-squared:                  0.236
Method:                 Least Squares   F-statistic:                     24.73
Date:                Thu, 27 Mar 2025   Prob (F-statistic):           4.08e-52
Time:                        09:04:20   Log-Likelihood:                -4321.1
No. Observations:                1000   AIC:                             8670.
Df Residuals:                     986   BIC:                             8739.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -8.3119      5.79

In [19]:
# Diagnostic: print the dtype of every predictor column to identify which
# columns are not numeric.
# NOTE(review): this cell's execution count (19) is lower than that of the
# regression cell (21), and its saved output still lists the dummy columns as
# bool. It appears to have been run before the dummies were cast to a numeric
# dtype; re-run it after the regression cell to see the current dtypes.
print(X.dtypes)


const                float64
Hour                   int64
Day                    int64
Reach                  int64
Klout                  int64
Gender_Male             bool
Gender_Unisex           bool
Gender_Unknown          bool
Weekday_Monday          bool
Weekday_Saturday        bool
Weekday_Sunday          bool
Weekday_Thursday        bool
Weekday_Tuesday         bool
Weekday_Wednesday       bool
dtype: object
