In [7]:
import json
import pandas as pd
from urllib.parse import urlparse
import matplotlib.pyplot as plt

In [35]:
def getID(url):
    """
    Helper Function: Given an url string, returns the video id embedded in the url
    """
    parsed_url = urlparse(url)
    path_segments = parsed_url.path.split('/')
    return int(path_segments[3])

def processJson(jsond): #for Sec2Gr2 files
    """
    Helper Function: Given a json datafile name, return the file in dataframe format 
    """
    jsondf = jsond
    jsondf['video_id'] = jsondf['Link'].apply(getID)
    return jsondf

def processJson2(filename): #for Sec1Gr1 files
    with open(filename, 'r') as file:
        jsondata = json.load(file)

    jsondict = {'Date':[],'Link':[],'video_id':[]}
    for video in jsondata['data']:
        jsondict['Link'].append(video[1])
        jsondict['video_id'].append(getID(video[1])) 
        jsondict['Date'].append(video[0])

    jsondf = pd.DataFrame(jsondict)
    return jsondf
    
    
    

### Check which json files contain relevant months

In [37]:
sec1files = ["Sec1Gr1_10824.json","Sec1Gr1_12345.json","Sec1Gr1_50405.json"]
sec1grp3file = "Sec1Gr3_11111.json"
sec2files = ["Sec2Gr2_26301.json","Sec2Gr2_33534.json","Sec2Gr2_38129.json","Sec2Gr2_69117.json"]

In [40]:
for file in sec2files:
    print(file)
    jsond = pd.read_json(file)
    jsonfile = processJson(jsond)
    jsonfile['Date'] = pd.to_datetime(jsonfile['Date'])
    monthlycounts = jsonfile.groupby(jsonfile['Date'].dt.to_period('M')).size()
    print(monthlycounts)
    print()

Sec2Gr2_26301.json
Date
2023-09     6400
2023-10    11335
2023-11     6786
2023-12    15406
2024-01    17619
2024-02    19742
2024-03     5083
Freq: M, dtype: int64

Sec2Gr2_33534.json
Date
2023-09    5075
2023-10    5652
2023-11    7317
2023-12    8746
2024-01    9758
2024-02    3706
Freq: M, dtype: int64

Sec2Gr2_38129.json
Date
2024-02    25
2024-03     7
Freq: M, dtype: int64

Sec2Gr2_69117.json
Date
2024-02    141
Freq: M, dtype: int64



In [30]:
for file in sec1files:
    print(file)
    jsonfile = processJson2(file)
    jsonfile['Date'] = pd.to_datetime(jsonfile['Date'])
    monthlycounts = jsonfile.groupby(jsonfile['Date'].dt.to_period('M')).size()
    print(monthlycounts)
    print()

Sec1Gr1_10824.json
Date
2023-11    14114
2023-12     2940
2024-02     9213
Freq: M, dtype: int64

Sec1Gr1_12345.json
Date
2023-09    2985
2023-11    1964
2023-12    5558
2024-01    5299
2024-02    4038
2024-03     777
Freq: M, dtype: int64

Sec1Gr1_50405.json
Date
2023-08    10053
2023-09     1044
2023-10     1540
2023-11     1003
2023-12     6047
2024-01     6295
2024-02      135
Freq: M, dtype: int64



### Merge original json and csv files' dataframes

#### sample:

In [59]:
jsondf = processJson2("Sec1Gr1_50405.json")

#jsond = pd.read_json("Sec2Gr2_33534.json")
#jsondf = processJson(jsond)

jsondf.head()

Unnamed: 0,Date,Link,video_id
0,2023-11-18 01:27:59,https://www.tiktokv.com/share/video/7298064289...,7298064289510804779
1,2023-11-18 01:26:28,https://www.tiktokv.com/share/video/7302601347...,7302601347327905070
2,2024-02-01 03:08:00,https://www.tiktokv.com/share/video/7326753111...,7326753111513632043
3,2024-02-01 03:07:35,https://www.tiktokv.com/share/video/7325244024...,7325244024623746346
4,2024-02-01 03:07:29,https://www.tiktokv.com/share/video/7330343097...,7330343097244618026


In [60]:
csvdf = pd.read_csv("Sec1Gr1_50405.csv")
#csvdf = pd.read_csv("Sec2Gr2_33534.csv")
csvdf.head()

Unnamed: 0,video_id,video_timestamp,video_duration,video_locationcreated,suggested_words,video_diggcount,video_sharecount,video_commentcount,video_playcount,video_description,video_is_ad,video_stickers,author_username,author_name,author_followercount,author_followingcount,author_heartcount,author_videocount,author_diggcount,author_verified
0,7302601347327905070,2023-11-17T20:03:14,66.0,US,,19400.0,40.0,86.0,147000.0,Talk about a humbling experience. Got a dress ...,False,,divineontheroad,Sydney Ferbrache,,,,,,False
1,7326753111513632043,2024-01-21T22:04:30,13.0,US,"yeslydimate, yesly and tony, I Look Like A Pen...",395800.0,525.0,259.0,3300000.0,we’ve reached full mini bob stage,False,,yeslydimate,yeslydimate,,,,,,False
2,7325244024623746346,2024-01-17T20:28:27,11.0,US,"yeslydimate, tony and yesly, Yesly, yesly sure...",828300.0,2128.0,750.0,7100000.0,,False,,yeslydimate,yeslydimate,,,,,,False
3,7330343097244618026,2024-01-31T14:15:49,85.0,US,,56000.0,8358.0,1994.0,1500000.0,"#stitch with @Casey Smith If it works for you,...",False,,_sydneymoran,_sydneymoran,,,,,,False
4,7329927237359193386,2024-01-30T11:21:56,69.0,US,"jenna marbles, Neon Genesis Evangelion, doorkn...",199900.0,2854.0,1352.0,1200000.0,Opportunity,False,,mburkez,Doorknob Girl,,,,,,False


In [61]:
result = jsondf.merge(csvdf, on=['video_id'])
result['Date'] = pd.to_datetime(result['Date'])
monthlycounts = result.groupby(result['Date'].dt.to_period('M')).size()
print(monthlycounts)
print()

Date
2023-08    10459
2023-09     1176
2023-10     1550
2023-11     1080
2023-12     6637
2024-01     6694
2024-02      127
Freq: M, dtype: int64



In [62]:
result.to_csv("merged_50405.csv")

In [47]:
## look at the videos in the json
jsondf['Date'] = pd.to_datetime(jsondf['Date'])
monthlycounts = jsondf.groupby(jsondf['Date'].dt.to_period('M')).size()
print(monthlycounts)

Date
2023-09    5075
2023-10    5652
2023-11    7317
2023-12    8746
2024-01    9758
2024-02    3706
Freq: M, dtype: int64


In [48]:
jsondf['Date'] = pd.to_datetime(jsondf['Date'])
jsondf.drop_duplicates(subset=['video_id'], inplace = True)
monthlycounts = jsondf.groupby(jsondf['Date'].dt.to_period('M')).size()
print(monthlycounts)

Date
2023-09    4726
2023-10    5268
2023-11    6875
2023-12    8227
2024-01    9337
2024-02    3583
Freq: M, dtype: int64
