In [None]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# Load the preprocessed tweet sample; the path is relative to the notebook's working directory.
data = pd.read_csv('sampled_data_preprocessed.csv')

In [10]:
# Display the loaded frame (the front end elides the middle rows and columns).
data

Unnamed: 0,tweet_id,language,tweet_type,original_tweet_id,tweet_text,user_id,place_id,tweet_creation,hashtags,mentions,...,hashtags_count,mentions_tokens,mentions_count,user_influence,tweet_day,tweet_hour,is_weekend,hashtags_embedding,mentions_embedding,tweet_text_embedding
0,76,en,Original,0,kinda twttring around and such,13,0,2006-03-22 00:00:51+00:00,,,...,0,[],0,5248566,2,0,False,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.01654781773686409, 0.0477893240749836, 0.0..."
1,175531324168351745,en,Original,0,"""Hydrogen peroxide added in small amounts to d...",503368661,0,2012-03-02 10:41:43+00:00,,,...,0,[],0,46,4,10,False,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.025313112884759903, 0.051009830087423325, ..."
2,286039083011170304,ja,Original,0,"‘Kato Shahi, love you! in TV, bring the topic ...",436851409,0,2013-01-01 09:20:06+00:00,,,...,0,[],0,2747,1,9,False,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.07220662385225296, -0.019841067492961884, ..."
3,286462102284148738,ca,Original,0,Hydrogen peroxide vapor enhances hospital disi...,119054506,0,2013-01-02 13:21:02+00:00,,,...,0,[],0,955,2,13,False,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.003000939264893532, -0.010538204573094845,..."
4,286834468126347264,ja,Original,0,CNIC News 2011/10/13 3 aircraft explosion is a...,88442750,0,2013-01-03 14:00:40+00:00,,,...,0,[],0,327,3,14,False,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.01976289041340351, 0.07890097796916962, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10005,1608989763708801025,ja,Reply,1608989365975547904,@gomaaburaseizin I laughed and laughed how muc...,1605132914915946497,0,2022-12-31 00:53:47+00:00,,|gomaaburaseizin,...,0,['gomaaburaseizin'],1,536,5,0,True,"[-0.11883842945098877, 0.04829872027039528, -0...","[0.006109271198511124, 0.162232905626297, -0.0...","[-0.01719362847507, 0.030254265293478966, 0.03..."
10006,1609073470541041665,qme,Reply,1609064727212326917,@irgxf @adndotcom @AlaskaBeacon https://t.co/j...,58257535,0,2022-12-31 06:26:25+00:00,,|irgxf|adndotcom|AlaskaBeacon,...,0,"['irgxf', 'adndotcom', 'alaskabeacon']",3,89,5,6,True,"[-0.11883842945098877, 0.04829872027039528, -0...","[0.010872884653508663, -0.0415927991271019, -0...","[-0.05069870129227638, 0.01034287828952074, 0...."
10007,1609160340889010177,en,Original,0,Cummins Participates in the 6th Foshan Hydroge...,1059819391,0,2022-12-31 12:11:36+00:00,,,...,0,[],0,2,5,12,True,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.04631892591714859, 0.008540322072803974, -..."
10008,1609237428476297216,zxx,Original,0,https://t.co/dOggyMSw5F,173511443,0,2022-12-31 17:17:55+00:00,,,...,0,[],0,535,5,17,True,"[-0.11883842945098877, 0.04829872027039528, -0...","[-0.11883842945098877, 0.04829872027039528, -0...","[-0.04486410692334175, -0.006326509173959494, ..."


In [34]:
# Predictor columns considered in the analysis
features = [
    'sentiment',
    'hashtags_count',
    'mentions_count',
    'user_influence',
    'tweet_day',
    'tweet_hour',
    'is_weekend',
]

# Engagement columns to explain: likes, retweets and replies
target = ['like_count', 'retweet_count', 'reply_count']

# Feature matrix (missing values replaced by 0) and the frame of targets
X = data[features].fillna(0)
y = data[target]

In [35]:
# Correlation analysis
correlation_results = {}
for col in features:
    for target_col in target:
        corr, _ = pearsonr(X[col], data[target_col])
        correlation_results[f"{col} vs {target_col}"] = corr

# Turn the relevance results into DataFrame and sort them
correlation_df = pd.DataFrame(correlation_results.items(), columns=['Feature vs Target', 'Correlation'])
correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)

In [36]:
# Rich display of the correlation table, sorted by coefficient in descending order.
correlation_df

Unnamed: 0,Feature vs Target,Correlation
9,user_influence vs like_count,0.2701
11,user_influence vs reply_count,0.23528
10,user_influence vs retweet_count,0.225489
18,is_weekend vs like_count,0.018557
19,is_weekend vs retweet_count,0.017426
12,tweet_day vs like_count,0.017351
20,is_weekend vs reply_count,0.016788
14,tweet_day vs reply_count,0.016687
13,tweet_day vs retweet_count,0.01642
2,sentiment vs reply_count,0.013099


In [37]:
# Plain-text rendering of the same sorted correlation table.
print(correlation_df)

                  Feature vs Target  Correlation
9      user_influence vs like_count     0.270100
11    user_influence vs reply_count     0.235280
10  user_influence vs retweet_count     0.225489
18         is_weekend vs like_count     0.018557
19      is_weekend vs retweet_count     0.017426
12          tweet_day vs like_count     0.017351
20        is_weekend vs reply_count     0.016788
14         tweet_day vs reply_count     0.016687
13       tweet_day vs retweet_count     0.016420
2          sentiment vs reply_count     0.013099
0           sentiment vs like_count     0.010845
1        sentiment vs retweet_count     0.009714
17        tweet_hour vs reply_count     0.007964
15         tweet_hour vs like_count     0.007922
16      tweet_hour vs retweet_count     0.007918
4   hashtags_count vs retweet_count     0.002398
3      hashtags_count vs like_count     0.002302
5     hashtags_count vs reply_count     0.001948
6      mentions_count vs like_count    -0.003956
8     mentions_count

In [58]:
# Features fed to the regression models. All candidate features are kept here;
# no filtering on the correlation results is applied.
selected_features = [
    'sentiment', 'hashtags_count', 'mentions_count', 'user_influence',
    'tweet_day', 'tweet_hour', 'is_weekend'
]

# Target variables
targets = ['like_count', 'retweet_count', 'reply_count']

# Min-max scale every feature to [0, 1].
# The scaled values go into a brand-new float DataFrame instead of being
# assigned back into a frame selected from `data`: that assignment triggers
# SettingWithCopyWarning and forces float values into int/bool columns.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split below. Fit it on the training split only if strict train/test
# isolation is required.
scaler_X = MinMaxScaler()
X = pd.DataFrame(
    scaler_X.fit_transform(data[selected_features]),
    columns=selected_features,
    index=data.index,
)

# Model and evaluate each target variable.
# The loop variable is deliberately not called `target`, so the `target` list
# defined in the feature-selection cell is not overwritten by a string.
for target_name in targets:
    print(f"\nAnalyzing {target_name}:\n")

    # Extract the current target variable
    y = data[target_name]

    # Data set division (80% training set, 20% test set); the fixed seed gives
    # the same row split for every target.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the linear regression model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = lr_model.predict(X_test)

    # Evaluate the performance of the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")

    # Coefficients are expressed per unit of the *scaled* feature, i.e. the
    # change in the target when a feature moves from its minimum to its maximum.
    coefficients = pd.DataFrame({
        'Feature': selected_features,
        'Coefficient': lr_model.coef_
    }).sort_values(by='Coefficient', ascending=False)

    print("Feature Coefficients:")
    print(coefficients)



Analyzing like_count:

Mean Squared Error (MSE): 3209536609.915892
R-squared (R2): 0.029613205459983916
Feature Coefficients:
          Feature    Coefficient
3  user_influence  111948.729412
4       tweet_day      57.869640
5      tweet_hour      -4.011349
0       sentiment     -12.215329
6      is_weekend     -88.134453
1  hashtags_count    -180.852874
2  mentions_count    -276.850610

Analyzing retweet_count:

Mean Squared Error (MSE): 261573878.9196706
R-squared (R2): 0.006396734350141875
Feature Coefficients:
          Feature  Coefficient
3  user_influence  7074.069282
5      tweet_hour    21.428001
4       tweet_day    20.055681
0       sentiment   -12.868481
6      is_weekend   -38.704625
1  hashtags_count   -77.686352
2  mentions_count  -140.478090

Analyzing reply_count:

Mean Squared Error (MSE): 50287788.14901828
R-squared (R2): 0.013498359025094997
Feature Coefficients:
          Feature  Coefficient
3  user_influence  6305.429303
4       tweet_day    32.850604
0       se