In [37]:
import json
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import datetime
import calendar

In [73]:
def get_datetime(time_stamp):
    """Split a ``'%Y-%m-%d %H:%M:%S'`` timestamp string into weekday and hour.

    Parameters
    ----------
    time_stamp : str
        Timestamp text in the exact format ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    tuple
        ``(weekday, hour)`` where ``weekday`` is the value of
        ``datetime.weekday()`` (Monday is 0, Sunday is 6) and ``hour`` is the
        hour of day plus the minutes expressed as a fraction of an hour.
        Seconds do not contribute to ``hour``.

    Raises
    ------
    ValueError
        If ``time_stamp`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(time_stamp, '%Y-%m-%d %H:%M:%S')
    fractional_hour = parsed.hour + parsed.minute / 60
    return parsed.weekday(), fractional_hour

def _load_unique_videos(path):
    """Read a JSON Lines file and return its records, first one per ``videoId``."""
    unique_videos = []
    seen_ids = set()
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            if not line.strip():
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            video = json.loads(line)
            video_id = video['videoId']
            if video_id not in seen_ids:
                seen_ids.add(video_id)
                unique_videos.append(video)
    return unique_videos


def make_features(path='TiktokData.json'):
    """Build the per-video feature table from a JSON Lines data file.

    Each line of the file is parsed as one JSON record; records whose
    ``videoId`` was already seen are dropped (the first occurrence wins).

    The resulting columns are, in order:

    * ``log10`` of the record's ``followerCount``;
    * the weekday and fractional hour returned by ``get_datetime`` for the
      record's ``createDate``;
    * one count column per vocabulary term that ``CountVectorizer`` learns
      from the record's space-joined ``hashtags``.

    Parameters
    ----------
    path : str, optional
        Location of the JSON Lines file. Defaults to ``'TiktokData.json'``.

    Returns
    -------
    pandas.DataFrame
        One row per unique video, with default integer column labels.
    """
    video_list = _load_unique_videos(path)

    followers = np.array([video['followerCount'] for video in video_list])
    time_stamps = np.array([get_datetime(video['createDate']) for video in video_list])
    hashtags = [' '.join(video['hashtags']) for video in video_list]

    countvec = CountVectorizer(strip_accents='ascii', ngram_range=(1, 1))
    count_data = countvec.fit_transform(hashtags).toarray()

    # NOTE(review): a followerCount of 0 becomes -inf here -- confirm whether
    # such records can occur and how downstream code should treat them.
    log_followers = np.log10(followers)

    # column_stack turns the 1-D follower vector into a single column and
    # appends the 2-D timestamp and hashtag-count blocks to its right.
    features = np.column_stack([log_followers, time_stamps, count_data])
    return pd.DataFrame(features)
    
# Build the feature table and render it in the notebook output.
df = make_features()
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5040,5041,5042,5043,5044,5045,5046,5047,5048,5049
0,5.349666,6.0,11.983333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.790707,4.0,23.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.086360,2.0,15.783333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.415808,6.0,15.400000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.414973,2.0,12.600000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1862,6.278754,4.0,10.250000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1863,6.255273,1.0,0.933333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1864,5.971601,1.0,11.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1865,5.203305,3.0,13.233333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
