In [242]:
import pandas as pd
import numpy as np

df = pd.read_csv('X_Dec_2023.csv')

selected_columns = ['username', 'timestamp', 'text', 'likes', 'replies', 'retweets', 'quotes']
df = df[selected_columns]
# drop duplicates
df = df.drop_duplicates(subset=['text'])
# df = df.dropna()
df.head(10)

Unnamed: 0,username,timestamp,text,likes,replies,retweets,quotes
0,@BrotherMingGame,2023-12-19 06:58:00+00:00,Hasboro laying off 1100 employees right before...,40,3,13,0
1,@Fiendlyg,2023-12-19 06:56:00+00:00,I follow back in 1 SECOND all below 83K \r\n(T...,45,19,20,0
2,@NekoSurfAI,2023-12-19 06:53:00+00:00,Good morning Master. Come we have to wake up.....,24,1,4,0
3,@rowancheung,2023-12-19 06:51:00+00:00,A new DeepMind paper just revealed that LLMs s...,17,1,1,0
4,@AssTransformers,2023-12-19 06:50:00+00:00,"Oh, that's Hasbro laying off 1100 employees an...",63,4,11,0
5,@rowancheung,2023-12-19 06:50:00+00:00,A new AI study found that automated bots can n...,19,2,3,2
6,@rowancheung,2023-12-19 06:50:00+00:00,Jailed opposition leader Imran Khan just used ...,39,1,8,5
7,@_svs_,2023-12-19 06:50:00+00:00,So many Gen AI startups hiring now. \r\n\r\nTh...,28,4,1,1
8,@rowancheung,2023-12-19 06:50:00+00:00,OpenAI just published a new safety preparednes...,15,2,3,0
9,@rowancheung,2023-12-19 06:50:00+00:00,Huge developments in the world of AI today.\r\...,78,7,18,1


## Processing text

In [243]:
import re

# add number of words in each tweet
df['num_words'] = df['text'].apply(lambda x: len(x.split()))
# add number of characters in each tweet
df['num_chars'] = df['text'].apply(lambda x: len(x))

# add length of each tweet
print(df['num_words'].max())
print(df['num_chars'].max())
print(df['num_words'].mean())
print(df['num_chars'].mean())
df['length'] = np.minimum(df['num_words'] * 1.0 / 50, 10) + np.minimum(df['num_chars'] * 1.0 / 500, 10)
df['length'] = np.maximum(df['length'], 1)

# add number of sentences and new line in each tweet
df['num_sentences'] = df['text'].apply(lambda x: len(re.split('\n|\.', x)))
# add complexity of each tweet
df['complexity'] = df['num_words'] / (df['num_sentences'] + 1)

# group by username and calculate the time diff between each tweet
df['timestamp'] = pd.to_datetime(df['timestamp'])                           

1932
12367
32.89126592875081
205.26797507208633


In [244]:
df = df.sort_values(by=['username', 'timestamp'])
df['time_diff'] = df.groupby('username')['timestamp'].diff().fillna(pd.Timedelta(seconds=0))
# transform time_diff to minutes
df['time_diff'] = df['time_diff'].apply(lambda x: x.total_seconds() / 60)
# only keep the users who have more than 3 tweet
df = df.groupby('username').filter(lambda x: len(x) > 10)
df = df.groupby('username').filter(lambda x:  100 < x['likes'].mean() < 120)

In [245]:
df['next_length'] = df.groupby('username')['length'].shift(-1)
df['next_complexity'] = df.groupby('username')['complexity'].shift(-1)
df['next_tweet_time_gap'] = df.groupby('username')['time_diff'].shift(-1)
df = df.dropna(subset=['next_length', 'next_complexity'])
print(df.shape)
df.head(10)

(23, 16)


Unnamed: 0,username,timestamp,text,likes,replies,retweets,quotes,num_words,num_chars,length,num_sentences,complexity,time_diff,next_length,next_complexity,next_tweet_time_gap
91,@_akhaliq,2023-12-19 05:41:00+00:00,Catwalk: A Unified Language Model Evaluation F...,16,0,4,1,165,1176,5.652,16,9.705882,0.0,4.742,8.0,10.0
28,@_akhaliq,2023-12-19 05:51:00+00:00,Cascade Speculative Drafting for Even Faster L...,40,0,5,0,136,1011,4.742,16,8.0,10.0,7.028,12.058824,545.0
604,@_akhaliq,2023-12-19 14:56:00+00:00,An In-depth Look at Gemini's Language Abilitie...,57,3,9,3,205,1464,7.028,16,12.058824,545.0,8.584,13.0,689.0
545,@_akhaliq,2023-12-20 02:25:00+00:00,A Challenger to GPT-4V? Early Explorations of ...,20,0,5,1,247,1822,8.584,18,13.0,689.0,6.858,10.526316,1505.0
1972,@_akhaliq,2023-12-20 19:30:00-08:00,PowerInfer: Fast Large Language Model Serving ...,169,5,28,7,200,1429,6.858,18,10.526316,1505.0,5.508,10.333333,4.0
1783,@_akhaliq,2023-12-20 19:34:00-08:00,Mini-GPTs: Efficient Large Language Models thr...,150,0,42,5,155,1204,5.508,14,10.333333,4.0,9.084,17.866667,7142.0
3228,@_akhaliq,2023-12-26 02:36:00+00:00,LLM4VG: Large Language Models Evaluation for V...,59,0,16,0,268,1862,9.084,14,17.866667,7142.0,5.218,10.333333,20.0
3239,@_akhaliq,2023-12-26 02:56:00+00:00,Exploiting Novel GPT-4 APIs\r\n\r\npaper page:...,127,4,20,5,155,1059,5.218,14,10.333333,20.0,2.042,5.272727,33.0
3227,@_akhaliq,2023-12-26 03:29:00+00:00,InternVL: Scaling up Vision Foundation Models ...,82,1,11,1,58,441,2.042,10,5.272727,33.0,8.998,13.684211,83.0
3154,@_akhaliq,2023-12-26 04:52:00+00:00,Generative AI Beyond LLMs: System Implications...,63,0,19,0,260,1899,8.998,18,13.684211,83.0,9.9,13.714286,1341.0


## Linear Regression

In [246]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Independent variables
X = df[['likes', 'replies', 'retweets', 'quotes']]

# Dependent variable, assuming you start with predicting 'next_length'
y1 = df['next_length']
y2 = df['next_complexity']
y3 = df['next_tweet_time_gap']

# Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('R²: %.2f' % r2_score(y_test, y_pred))
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)

Mean squared error: 8.21
R²: 0.38
Coefficients: [-0.03551273 -0.4268764   0.21733195  0.1929561 ]
Intercept: 4.559222828033974


## Log-Log Linear Regression

In [247]:
df = df.replace(0, np.finfo(float).eps)
df['log_likes'] = np.log(df['likes'])
df['log_replies'] = np.log(df['replies'])
df['log_quotes'] = np.log(df['quotes'])
df['log_retweets'] = np.log(df['retweets'])
df['log_next_length'] = np.log(df['next_length'])
df['log_next_complexity'] = np.log(df['next_complexity'])
df['log_next_tweet_time_gap'] = np.log(df['next_tweet_time_gap'])

X_log = df[['log_likes', 'log_replies', 'log_retweets', 'log_quotes']]

# Dependent variable for the example
y1_log = df['log_next_length']
y2_log = df['log_next_complexity']
y3_log = df['log_next_tweet_time_gap']

# Splitting dataset into training and testing sets
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_log, y1_log, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model_log = LinearRegression()
model_log.fit(X_train_log, y_train_log)

# Predictions
y_pred_log = model_log.predict(X_test_log)

# Evaluate the model
print("Mean squared error: %.2f" % mean_squared_error(y_test_log, y_pred_log))
print('R²: %.2f' % r2_score(y_test_log, y_pred_log))
print('Coefficients:', model_log.coef_)

Mean squared error: 0.47
R²: 0.40
Coefficients: [-0.33999488  0.01649804  0.0568638  -0.01823643]


## Linear Regression with Lag

In [248]:

df = df.sort_values(by=['username', 'timestamp'])
k = 3

# Create lagged features for the past three tweets
for lag in range(1, k):  # 1, 2
    df[f'likes_lag_{lag}'] = df.groupby('username')['likes'].shift(lag)
    df[f'replies_lag_{lag}'] = df.groupby('username')['replies'].shift(lag)
    df[f'retweets_lag_{lag}'] = df.groupby('username')['retweets'].shift(lag)
    df[f'quotes_lag_{lag}'] = df.groupby('username')['quotes'].shift(lag)

df.dropna(inplace=True)

feature_columns = ['likes', 'replies', 'retweets', 'quotes'] + \
                  [f'{metric}_lag_{lag}' for metric in ['likes', 'replies', 'retweets', 'quotes'] for lag in range(1, k)]

X = df[feature_columns] 

In [249]:
y1 = df['next_length']
y2 = df['next_complexity']
y3 = df['next_tweet_time_gap']

X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('R²: %.2f' % r2_score(y_test, y_pred))
print('Coefficients:', model.coef_) 
print('Intercept:', model.intercept_)

Mean squared error: 63.13
R²: -5.89
Coefficients: [ 0.00253965 -0.20041647  0.08347753  0.25164765  0.01964136 -0.02434141
  1.32869883  0.31767713 -0.23437426  0.46436111  0.00534071  0.26578767]
Intercept: -1.4294423825984737


## Log-Log Linear Regression with Lag

In [250]:
for lag in range(1, k):  # 1, 2
    df[f'log_likes_lag_{lag}'] = df.groupby('username')['log_likes'].shift(lag)
    df[f'log_replies_lag_{lag}'] = df.groupby('username')['log_replies'].shift(lag)
    df[f'log_retweets_lag_{lag}'] = df.groupby('username')['log_retweets'].shift(lag)
    df[f'log_quotes_lag_{lag}'] = df.groupby('username')['log_quotes'].shift(lag)

df.dropna(inplace=True)

feature_columns = ['likes', 'replies', 'retweets', 'quotes'] + \
                  [f'{metric}_lag_{lag}' for metric in ['likes', 'replies', 'retweets', 'quotes'] for lag in range(1, k)]

X = df[feature_columns] 

In [251]:
y1_log = df['log_next_length']
y2_log = df['log_next_complexity']
y3_log = df['log_next_tweet_time_gap']

X_train, X_test, y_train, y_test = train_test_split(X, y1_log, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('R²: %.2f' % r2_score(y_test, y_pred))
print('Coefficients:', model.coef_) 
print('Intercept:', model.intercept_)

Mean squared error: 2.22
R²: -3.78
Coefficients: [-0.00630158  0.19346499  0.0527344  -0.13083759  0.01454434  0.01513702
  0.44895178  0.29500037 -0.04551227  0.02010813 -0.05279361 -0.25852628]
Intercept: -1.612512747522549


In [252]:
df.head(10)

Unnamed: 0,username,timestamp,text,likes,replies,retweets,quotes,num_words,num_chars,length,...,retweets_lag_2,quotes_lag_2,log_likes_lag_1,log_replies_lag_1,log_retweets_lag_1,log_quotes_lag_1,log_likes_lag_2,log_replies_lag_2,log_retweets_lag_2,log_quotes_lag_2
1972,@_akhaliq,2023-12-20 19:30:00-08:00,PowerInfer: Fast Large Language Model Serving ...,169,5.0,28.0,7.0,200,1429,6.858,...,9.0,3.0,2.995732,-36.043653,1.609438,0.0,4.043051,1.098612,2.197225,1.098612
1783,@_akhaliq,2023-12-20 19:34:00-08:00,Mini-GPTs: Efficient Large Language Models thr...,150,2.220446e-16,42.0,5.0,155,1204,5.508,...,5.0,1.0,5.129899,1.609438,3.332205,1.94591,2.995732,-36.043653,1.609438,0.0
3228,@_akhaliq,2023-12-26 02:36:00+00:00,LLM4VG: Large Language Models Evaluation for V...,59,2.220446e-16,16.0,2.220446e-16,268,1862,9.084,...,28.0,7.0,5.010635,-36.043653,3.73767,1.609438,5.129899,1.609438,3.332205,1.94591
3239,@_akhaliq,2023-12-26 02:56:00+00:00,Exploiting Novel GPT-4 APIs\r\n\r\npaper page:...,127,4.0,20.0,5.0,155,1059,5.218,...,42.0,5.0,4.077537,-36.043653,2.772589,-36.043653,5.010635,-36.043653,3.73767,1.609438
3227,@_akhaliq,2023-12-26 03:29:00+00:00,InternVL: Scaling up Vision Foundation Models ...,82,1.0,11.0,1.0,58,441,2.042,...,16.0,2.220446e-16,4.844187,1.386294,2.995732,1.609438,4.077537,-36.043653,2.772589,-36.043653
3154,@_akhaliq,2023-12-26 04:52:00+00:00,Generative AI Beyond LLMs: System Implications...,63,2.220446e-16,19.0,2.220446e-16,260,1899,8.998,...,20.0,5.0,4.406719,0.0,2.397895,0.0,4.844187,1.386294,2.995732,1.609438
3708,@_akhaliq,2023-12-26 19:13:00-08:00,Gemini vs GPT-4V: A Preliminary Comparison and...,203,6.0,44.0,6.0,288,2070,9.9,...,11.0,1.0,4.143135,-36.043653,2.944439,-36.043653,4.406719,0.0,2.397895,0.0
1895,@futuristflower,2023-12-21 15:29:00+00:00,"1) When I had closed beta access to DALL-E 2, ...",85,2.220446e-16,2.0,2.220446e-16,54,280,1.64,...,3.0,2.0,3.044522,0.693147,-36.043653,-36.043653,4.744932,-36.043653,1.098612,0.693147
1894,@futuristflower,2023-12-21 15:33:00+00:00,I mean even MJ V5 looks better in faces and ph...,91,2.220446e-16,5.0,1.0,50,280,1.56,...,2.220446e-16,2.220446e-16,4.442651,-36.043653,0.693147,-36.043653,3.044522,0.693147,-36.043653,-36.043653
2685,@futuristflower,2023-12-23 01:52:00+00:00,GPT-4 when doing a web search is the closest w...,137,2.220446e-16,6.0,8.0,15,69,1.0,...,2.0,2.220446e-16,4.51086,-36.043653,1.609438,0.0,4.442651,-36.043653,0.693147,-36.043653
