In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/Shareddrives/CIS 522/code/input/TikTok Data/batch_1_100/fyp_1649361969234.csv")
df.columns


Index(['id', 'secretID', 'text', 'createTime', 'authorMeta.id',
       'authorMeta.secUid', 'authorMeta.name', 'authorMeta.nickName',
       'authorMeta.verified', 'authorMeta.signature', 'authorMeta.avatar',
       'authorMeta.following', 'authorMeta.fans', 'authorMeta.heart',
       'authorMeta.video', 'authorMeta.digg', 'musicMeta.musicId',
       'musicMeta.musicName', 'musicMeta.musicAuthor',
       'musicMeta.musicOriginal', 'musicMeta.musicAlbum', 'musicMeta.playUrl',
       'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge',
       'musicMeta.duration', 'covers.default', 'covers.origin',
       'covers.dynamic', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark',
       'videoApiUrlNoWaterMark', 'videoMeta.height', 'videoMeta.width',
       'videoMeta.duration', 'diggCount', 'shareCount', 'playCount',
       'commentCount', 'downloaded', 'mentions', 'hashtags', 'effectStickers'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 44 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       100 non-null    int64  
 1   secretID                 100 non-null    int64  
 2   text                     100 non-null    object 
 3   createTime               100 non-null    int64  
 4   authorMeta.id            100 non-null    int64  
 5   authorMeta.secUid        100 non-null    object 
 6   authorMeta.name          100 non-null    object 
 7   authorMeta.nickName      100 non-null    object 
 8   authorMeta.verified      100 non-null    bool   
 9   authorMeta.signature     96 non-null     object 
 10  authorMeta.avatar        100 non-null    object 
 11  authorMeta.following     100 non-null    int64  
 12  authorMeta.fans          100 non-null    int64  
 13  authorMeta.heart         100 non-null    int64  
 14  authorMeta.video         10

In [None]:
#keep only response variables and features (hashtags or effects only)
icols_to_keep = [2, 8, 11, 12, 13, 14, 15, 19, 25, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43]
df = df.iloc[:,icols_to_keep]
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       100 non-null    int64 
 1   text                     100 non-null    object
 2   createTime               100 non-null    int64 
 3   authorMeta.verified      100 non-null    bool  
 4   authorMeta.following     100 non-null    int64 
 5   authorMeta.fans          100 non-null    int64 
 6   authorMeta.heart         100 non-null    int64 
 7   authorMeta.video         100 non-null    int64 
 8   authorMeta.digg          100 non-null    int64 
 9   musicMeta.musicOriginal  100 non-null    bool  
 10  musicMeta.duration       100 non-null    int64 
 11  videoMeta.height         100 non-null    int64 
 12  videoMeta.width          100 non-null    int64 
 13  videoMeta.duration       100 non-null    int64 
 14  diggCount                100 non-null    in

In [None]:
# parse effects and hashtags
hashtags = []
effects = []
df['hashtags'] = df['hashtags'].apply(json.loads)
df['effectStickers'] = df['effectStickers'].apply(json.loads)
for _, row in df.iterrows():
  for hashtag in row['hashtags']:
    if hashtag not in hashtags:
      hashtags.append(hashtag)
  for effect in row['effectStickers']:
    print(effect)
    if effect not in effects:
      effects.append(effect)

{'id': '479381', 'name': 'Face Zoom'}
{'id': '451761', 'name': 'Desaturación Parcial'}
{'id': '275816', 'name': '얼굴드래그'}
{'id': '454747', 'name': 'Green Screen Video'}
{'id': '263840', 'name': 'Green Screen'}


In [None]:
print(effects)

[{'id': '479381', 'name': 'Face Zoom'}, {'id': '451761', 'name': 'Desaturación Parcial'}, {'id': '275816', 'name': '얼굴드래그'}, {'id': '454747', 'name': 'Green Screen Video'}, {'id': '263840', 'name': 'Green Screen'}]


In [None]:
new_hashtag_cols = [d['id'] for d in hashtags]
new_effects_cols = [d['id'] for d in effects]

In [None]:
for col in new_hashtag_cols:
  df["hashtag_"+col] = False
for col in new_effects_cols:
  df["effect_"+col] = False

for i, row in df.iterrows():
  for hashtag in row['hashtags']:
    df.loc[i,"hashtag_"+hashtag['id']] = True
  for effect in row['effectStickers']:
    df.loc[i,"effect_"+effect['id']] = True

In [None]:
df.shape

(100, 332)

In [None]:
df.iloc[:, 1:30].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   text                      100 non-null    object
 1   createTime                100 non-null    int64 
 2   authorMeta.verified       100 non-null    bool  
 3   authorMeta.following      100 non-null    int64 
 4   authorMeta.fans           100 non-null    int64 
 5   authorMeta.heart          100 non-null    int64 
 6   authorMeta.video          100 non-null    int64 
 7   authorMeta.digg           100 non-null    int64 
 8   musicMeta.musicOriginal   100 non-null    bool  
 9   musicMeta.duration        100 non-null    int64 
 10  videoMeta.height          100 non-null    int64 
 11  videoMeta.width           100 non-null    int64 
 12  videoMeta.duration        100 non-null    int64 
 13  diggCount                 100 non-null    int64 
 14  shareCount                1

# Regression Models

In [None]:
X = df.drop(columns=['text',
                     'mentions',
                     'hashtags',
                     'effectStickers', 
                     "diggCount", 
                     'shareCount',
                     'playCount',
                     'commentCount',
                     'authorMeta.verified',
                     'authorMeta.following',
                     'authorMeta.fans',
                     'authorMeta.heart',
                     'authorMeta.video',
                     'authorMeta.digg'])
y = df[['playCount']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
print(mean_squared_error(y_pred, y_test))
print(r2_score(y_pred, y_test))

-4.780823803529614