Add imports

In [101]:
import os
import pandas as pd
import re

Read raw output .csv's from obtained by crawling. Add new columns `Search word` and `Most watched`

In [102]:
folder_path = "../outputs/"
dataframes = []
pattern = r"(.+?)(?:_(most_watched))?\.csv"

for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):

        match = re.search(pattern, filename)
        if match:
            word = match.group(1)
            most_watched = match.group(2) is not None
            file_path = os.path.join(folder_path, filename)
            file_df = pd.read_csv(file_path)
            file_df["Search word"] = word
            file_df["Most watched"] = most_watched
            dataframes.append(file_df)

df = pd.concat(dataframes, ignore_index=True)
df.head()

Unnamed: 0,Video Id,Title,View Count,Length,Channel,Search word,Most watched
0,ssFgYNn0BUA,Mallrat - Groceries (Official Video),2 900 865 katselukertaa,3.35,/channel/UCK2codDA94XHf66dbfSy6vw,groceries,False
1,34djOMgVq-0,Pi'erre Bourne - Groceries (Official Music Video),1 644 500 katselukertaa,4.1,/channel/UCN27za3wzItmyRV2O3N91vw,groceries,False
2,ha4tRQwKIUg,I Bought Everything In A Store - Challenge,144 516 730 katselukertaa,16.56,/@MrBeast,groceries,False
3,8oZNzedMhUo,"Roommate WON'T BUY Groceries, What Happens Is ...",20 756 555 katselukertaa,8.14,/@DharMann,groceries,False
4,WClN_VrYosc,HOW WE WENT BROKE GROCERY SHOPPING FOR FOOD **...,721 509 katselukertaa,40.56,/@TheOfficialPrinceFamily,groceries,False


In [103]:
df_copy = df.copy()

Remove NaN's and create new column `Views` as type `int`

In [104]:
df.dropna(subset=["View Count", "Length"], inplace=True)
df["Views"] = df["View Count"].str.extract("(\d+(?:\s\d+)*)").replace(r"\s+", "", regex=True).fillna("0").astype(int)
df.describe()

Unnamed: 0,Views
count,737419.0
mean,50843560.0
std,215845600.0
min,0.0
25%,208541.5
50%,3095021.0
75%,26333100.0
max,13355170000.0


Change `Search word` and `Most watched` types

In [105]:
df["Search word"] = df["Search word"].apply(lambda x: [x])
df["Most watched"] = df["Most watched"].map({True: "Y", False: "N"})
df.head()

Unnamed: 0,Video Id,Title,View Count,Length,Channel,Search word,Most watched,Views
0,ssFgYNn0BUA,Mallrat - Groceries (Official Video),2 900 865 katselukertaa,3.35,/channel/UCK2codDA94XHf66dbfSy6vw,[groceries],N,2900865
1,34djOMgVq-0,Pi'erre Bourne - Groceries (Official Music Video),1 644 500 katselukertaa,4.1,/channel/UCN27za3wzItmyRV2O3N91vw,[groceries],N,1644500
2,ha4tRQwKIUg,I Bought Everything In A Store - Challenge,144 516 730 katselukertaa,16.56,/@MrBeast,[groceries],N,144516730
3,8oZNzedMhUo,"Roommate WON'T BUY Groceries, What Happens Is ...",20 756 555 katselukertaa,8.14,/@DharMann,[groceries],N,20756555
4,WClN_VrYosc,HOW WE WENT BROKE GROCERY SHOPPING FOR FOOD **...,721 509 katselukertaa,40.56,/@TheOfficialPrinceFamily,[groceries],N,721509


Group by `Video Id` and aggregate `Search words` and `Most watched` information

In [106]:
def append_words(series):
    return list(set([item for sublist in series for item in sublist]))

def append_search_type(series):
    return set(series)

aggregations = {
    "Title": "first",
    "Views": "first",
    "Length": "first",
    "Channel": "first",
    "Search word": append_words,
    "Most watched": append_search_type
}

df = df.groupby("Video Id").agg(aggregations).reset_index()

Convert `Most watched` into three categorial values: ```Y = Most watched | N = Normal | B = Both```

In [120]:
def convert_most_watched(values):
    if values == {"N", "Y"}:
        return "B"
    elif "Y" in values:
        return "Y"
    else:
        return "N"
    
df["Most watched"] = df["Most watched"].apply(convert_most_watched).astype("category")

Check results. To be continued...

In [121]:
df.head()

Unnamed: 0,Video Id,Title,Views,Length,Channel,Search word,Most watched
0,---AKxmFRWI,Dharmesh Yalande Sir Dance Bastar Dist Chhatti...,8867396,0.27,/@bastarfiles,[dist],Y
1,---jp8dVvkY,Fed Will Likely Hike in September - Market Pul...,144,26.55,/@FXPesa,[pulse],N
2,--0HXqi8xTk,Hyundai i20 at 25% Discount,798201,0.59,/@bekifaayati,[discounted],Y
3,--0Qq5EmpGg,PSY - GANGNAM STYLE (CONVERSELY),424,4.13,/@maxchet,[conversely],N
4,--14w5SOEUs,Migos - Avalanche (Official Video),21229658,3.59,/channel/UC9YcTIQuhwgoOQqYMKYqW9A,[avalanche],Y


In [122]:
df.tail()

Unnamed: 0,Video Id,Title,Views,Length,Channel,Search word,Most watched
524554,zzwWzLY7kEo,NewJeans (뉴진스) 'ETA' Dance Practice,3131516,2.39,/channel/UCMki_UkHb4qSc0qyEcOHHJw,[eta],N
524555,zzwXTkQ9n0k,Latest trend with Horse chestnut #asmr #oddlys...,68243230,0.19,/@hoofmaestro1130,[chestnut],Y
524556,zzwmEo-mzDc,Meryl - La Brume ft. Le Motif,5000660,3.13,/@MERYLytchn,[motif],Y
524557,zzyDz6jhX30,All I Ever Need - Austin Mahone (Lyrics),13613316,3.34,/@SuperbLyricsOfficial,[necessity],Y
524558,zzzIpC39WUg,How To Calculate Speeds and Feeds (Inch Versio...,267679,14.26,/@haasautomation,[feeds],N
