# SIADS Milestone I

- **Cydia Tsang (cydia@umich.edu)**, School of Information, University of Michigan
- **Yao Tong (tonyao@umich.edu)**, School of Information, University of Michigan



In [41]:
import sys, os
import json
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from scipy.stats import pearsonr
from boto3.dynamodb.types import TypeDeserializer
sys.path.append(os.getcwd() + "/src")
from src.pj_config.dynamo_db_client import get_dynamo_db_client
from decimal import Decimal

# 1. Data Improt

In [75]:
def convert_dynamodb_response(response):
    deserializer = TypeDeserializer()
    def parse_item(item):
        return {key: deserializer.deserialize(value) for key, value in item.items()}
  
    items_temp = [parse_item(item) for item in items]
    df = pd.DataFrame(items_temp)
    df = df.applymap(lambda x: x if not isinstance(x, dict) else x.get('S', x))
    return df
    
dynamodb = get_dynamo_db_client()
response = dynamodb.scan(TableName='tiktok_trending')

items=response["Items"]

while 'LastEvaluatedKey' in response and response['LastEvaluatedKey'] != "":  
    response = dynamodb.scan(TableName='tiktok_trending', ExclusiveStartKey=response['LastEvaluatedKey'])
    items.extend(response["Items"])
df = convert_dynamodb_response(items)


                                              hashtags diggCount  \
0    [{'name': 'vjp', 'cover': '', 'id': '30368892'...     22100   
1    [{'name': 'kesfet', 'cover': '', 'id': '715598...     25200   
2                                                   []      1339   
3    [{'name': 'nlunatics', 'cover': '', 'id': '166...    426700   
4                                                   []      1411   
..                                                 ...       ...   
995  [{'name': 'hetclubhuis', 'cover': '', 'id': '1...     22400   
996  [{'name': 'fitness', 'cover': 'https://p16-sg....       700   
997  [{'name': 'fy', 'cover': '', 'id': '153828', '...      2008   
998                                                 []       779   
999                                                 []      8146   

                                           webVideoUrl videoUrlNoWaterMark  \
0    https://www.tiktok.com/@ivarvanleijsen/video/6...                       
1    https://www.tiktok.com

# 2. Data Cleaning & Data Manipulation

In [8]:
###### User
user_df = df[['id', 'createTime', 'authorMeta', 'diggCount', 'commentCount', 'shareCount', 'playCount', 'downloaded']]

In [9]:
###### Music
music_df = df[['id', 'createTime', 'musicMeta', 'diggCount', 'commentCount', 'shareCount', 'playCount', 'downloaded']]

In [46]:
###### Video
video_df = df[['id', 'createTime', 'videoMeta', 'diggCount', 'commentCount', 'shareCount', 'playCount', 'downloaded']]
video_df = video_df.copy()

video_df['videoMeta'] = video_df['videoMeta'].apply(lambda x: {'width': int(x['width']), 'duration': int(x['duration']), 'height': int(x['height'])})
video_df[['video_width', 'video_duration', 'video_height']] = pd.DataFrame(video_df['videoMeta'].tolist())

video_df['video_duration'] = video_df['video_duration'].astype(float)
video_df['diggCount'] = video_df['diggCount'].astype(float)
video_df['shareCount'] = video_df['shareCount'].astype(float)
video_df['playCount'] = video_df['playCount'].astype(float)
video_df['commentCount'] = video_df['commentCount'].astype(float)
print(len(video_df))

393


In [47]:
###### Text
text_df = df[['id', 'createTime', 'text', 'mentions', 'diggCount', 'commentCount', 'shareCount', 'playCount', 'downloaded']]

def process_numeric_column(df, column_name):
    df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
    df_copy = df.copy()
    quantiles = df_copy[column_name].quantile([0, 1/5, 2/5, 3/5, 4/5, 1])
    df_copy[f'{column_name}Category'] = pd.cut(df_copy[column_name], bins=quantiles, labels=['very low', 'low', 'medium', 'high', 'very high'], include_lowest=True)
    return df_copy
    
text_df = process_numeric_column(text_df, 'diggCount')
text_df = process_numeric_column(text_df, 'commentCount')
text_df = process_numeric_column(text_df, 'shareCount')
text_df = process_numeric_column(text_df, 'playCount')

def extract_hashtags(text):
    return re.findall(r'#(\w+)', text)

unique_hashtags = set()
for text in text_df['text']:
    unique_hashtags.update(extract_hashtags(text))
print(len(video_df))

393


# 3. Analysis

In [12]:
###### User

In [13]:
###### Music

In [14]:
###### Video
# Define the correlation function
def correlation_analysis(column_name):
    correlation_coefficient, _ = pearsonr(video_df['video_duration'], video_df[column_name])
    return correlation_coefficient

# Compute correlation coefficients for each variable
digg_corr = correlation_analysis('diggCount')
share_corr = correlation_analysis('shareCount')
play_corr = correlation_analysis('playCount')
comment_corr = correlation_analysis('commentCount')
print("Correlation Coefficient Calculated by pearsonr")
print(f"Correlation Coefficient (video_duration vs diggCount): {digg_corr}")
print(f"Correlation Coefficient (video_duration vs shareCount): {share_corr}")
print(f"Correlation Coefficient (video_duration vs playCount): {play_corr}")
print(f"Correlation Coefficient (video_duration vs commentCount): {comment_corr}")

X = video_df[['diggCount', 'shareCount', 'playCount', 'commentCount']]
y = video_df['video_duration']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict video duration for the test set
y_pred = model.predict(X_test)

# Assess the model fit
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"R-squared: {metrics.r2_score(y_test, y_pred)}")
print(f"P-values: {model.coef_}")  # Note: scikit-learn doesn't directly provide p-values


Correlation Coefficient Calculated by pearsonr
Correlation Coefficient (video_duration vs diggCount): -0.05453050846293269
Correlation Coefficient (video_duration vs shareCount): -0.026638553770916902
Correlation Coefficient (video_duration vs playCount): -0.06466827066002559
Correlation Coefficient (video_duration vs commentCount): -0.03163982716350423
Coefficients: [-3.10448381e-06 -5.11184556e-06  6.61897793e-08  4.28275604e-05]
Intercept: 18.23572224710861
R-squared: -0.021456205129115835
P-values: [-3.10448381e-06 -5.11184556e-06  6.61897793e-08  4.28275604e-05]


In [44]:
###### Text
def conditional_probability(df, condition_column, condition_value, target_column, target_value):
    filtered_data = df[df[condition_column].apply(lambda x: condition_value in x)]
    total_occurrences = len(filtered_data)
    target_occurrences = len(filtered_data[filtered_data[target_column] == target_value])
    
    if total_occurrences == 0:
        probability = 0  # Avoid division by zero
    else:
        probability = target_occurrences / total_occurrences
    if(probability > 0):
        result_df = pd.DataFrame([[target_column, target_value, condition_value, probability]],
                                 columns=['TargetColumn', 'TargetValue', 'Hashtag', 'Probability'])
        return result_df

result_dfs = []
for category in ['diggCountCategory', 'shareCountCategory', 'playCountCategory', 'commentCountCategory']:
    for hashtag in unique_hashtags:
        for target_value in ['very low', 'low', 'medium', 'high', 'very high']:
            result_df = conditional_probability(text_df, 'text', hashtag, category, target_value)
            result_dfs.append(result_df)
results = pd.concat(result_dfs, ignore_index=True)
print(results)

              TargetColumn TargetValue          Hashtag  Probability
0        diggCountCategory    very low           cardio          1.0
1        diggCountCategory         low       blowthisup          1.0
2        diggCountCategory         low         cleaning          1.0
3        diggCountCategory         low      foryourpage          0.5
4        diggCountCategory      medium      foryourpage          0.5
...                    ...         ...              ...          ...
6766  commentCountCategory        high  halloweenishere          1.0
6767  commentCountCategory    very low        waitforit          1.0
6768  commentCountCategory    very low        dlaciebie          1.0
6769  commentCountCategory    very low         farmlife          1.0
6770  commentCountCategory        high       broscience          1.0

[6771 rows x 4 columns]


# 4. Data Visualisation

In [None]:
###### User

In [None]:
###### Music

In [None]:
###### Video
num_cols = ["diggCount", "commentCount", "shareCount", "playCount"]
plt.figure(figsize=(16, 12))
for i, col in enumerate(num_cols, start=1):
    plt.subplot(3, 2, i)
    sns.scatterplot(x=col, y='video_duration', data=video_df)
    plt.title(f'Scatter Plot: video_duration vs {col}')

plt.tight_layout()
plt.show()


# Correlation heatmap
video_new_df = video_df[["diggCount", "commentCount", "shareCount", "playCount", "video_duration"]]
plt.figure(figsize=(10, 8))
correlation_matrix = video_new_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()


In [None]:
###### Text