Before our analysis we collected several datasets produced by the src.data.make_datasets.py script.
In this notebook we merge all of them into one in order to carry out our analysis with it.

In [56]:
import os
import pandas as pd
import numpy as np
from ast import literal_eval

# Folder and file names of the per-streamer CSV dumps
data_folder = "../data"
datasets = [
    "auron.csv",
    "data_ibai.csv",
    "jaimeAltozano.csv",
    "rubius.csv",
    "data_asmr.csv",
]

# Read each CSV and stack all of them into a single frame
dfs = []
for dataset_name in datasets:
    csv_path = os.path.join(data_folder, dataset_name)
    # Rows are split on "\n" only, so a stray "\r" can end up in the header
    # names (presumably CRLF files); strip it from every column label.
    df = pd.read_csv(csv_path, lineterminator="\n").rename(
        columns=lambda label: label.replace("\r", "")
    )
    dfs.append(df)

df_streamers = pd.concat(dfs, axis=0)

# Remove duplicates and preprocess.
# A streamer can appear in several source files. The frame is sorted in
# descending order first, so for each id drop_duplicates(keep="first") keeps
# the top row under that ordering (missing values sort last by default).
df_streamers = (
    df_streamers
    .sort_values(["user_follows", "num_followers", "view_count"], ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=["id"], keep="first")
    .sort_values(["num_followers", "view_count"], ascending=False)
    # An explicit unit is required: pandas >= 2.0 rejects the unit-less
    # "datetime64" dtype in astype().
    .astype({"id": str, "created_at": "datetime64[ns]"})
)

# clean user_follows and convert each list to array
def _parse_user_follows(raw):
    """Parse one raw ``user_follows`` cell into a Python object.

    The cell is expected to hold the text of a Python literal (e.g. a list).
    Carriage-return characters are removed and surrounding whitespace is
    stripped before ``ast.literal_eval`` is applied.

    Returns ``None`` for missing values, non-string values and blank strings.
    Raises whatever ``literal_eval`` raises for text that is not a valid literal.
    """
    if not isinstance(raw, str):
        return None
    cleaned = raw.replace("\r", "").strip()
    if not cleaned:
        return None
    return literal_eval(cleaned)


# Parsing cell by cell avoids Series.replace("", None), whose meaning depends on
# the pandas version (older releases pad-fill with the previous row's value
# instead of inserting None).
df_streamers["user_follows"] = df_streamers["user_follows"].apply(_parse_user_follows)


# remove those ids in user_follows that are not in df_streamers.id by exploding the dataframe:
# one row per (streamer, followed id) pair, keeping only the pairs whose
# followed id is itself a streamer present in this dataset.
df_streamers_exploded = df_streamers.explode("user_follows")
df_streamers_exploded = df_streamers_exploded[
    df_streamers_exploded["user_follows"].isin(df_streamers["id"])
]

# Collapse the surviving pairs back into one list per streamer id
user_follows_arrays = (
    df_streamers_exploded.groupby("id")["user_follows"].apply(list).reset_index()
)
follows_by_id = dict(
    zip(user_follows_arrays["id"], user_follows_arrays["user_follows"])
)

df_streamers = df_streamers.set_index("id")

# Streamers without any surviving follow (missing value, empty list or only
# unknown ids) get an empty list. The default [] is evaluated per row, so each
# row owns its own list object instead of sharing a single one.
df_streamers["user_follows"] = pd.Series(
    [follows_by_id.get(streamer_id, []) for streamer_id in df_streamers.index],
    index=df_streamers.index,
    dtype=object,
)
df_streamers

Unnamed: 0_level_0,name,num_followers,broadcaster_type,description,lang,last_game_played_name,view_count,profile_image_url,created_at,user_follows
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
459331509,auronplay,10885764.0,partner,feliz pero no mucho,es,Just Chatting,212445226,https://static-cdn.jtvnw.net/jtv_user_pictures...,2019-09-03 14:02:49,"[210708721, 431460701, 77649106, 130065491, 42..."
39276140,Rubius,10499836.0,partner,se contar hasta patata,es,Fortnite,223159671,https://static-cdn.jtvnw.net/jtv_user_pictures...,2013-01-12 18:46:56,"[31478096, 91136321, 198363811, 210708721, 133..."
48878319,TheGrefg,8734633.0,partner,"Hola, me llamo David, me quedé calvo delante d...",es,Just Chatting,199223874,https://static-cdn.jtvnw.net/jtv_user_pictures...,2013-09-12 00:45:27,"[143776262, 675347177, 33734881, 248222879, 18..."
83232866,ibai,8543962.0,partner,Si lees esto que sepas que te aprecio,es,Sports,287415258,https://static-cdn.jtvnw.net/jtv_user_pictures...,2015-02-20 16:47:56,"[35980866, 145908612, 205218019, 91136321, 524..."
121510236,juansguarnizo,6027436.0,partner,"Juan Sebastián Guarnizo Algarra, más conocido ...",es,Minecraft,128134942,https://static-cdn.jtvnw.net/jtv_user_pictures...,2016-04-11 03:34:11,"[43419527, 517536651, 476005292, 109492660, 55..."
...,...,...,...,...,...,...,...,...,...,...
246096970,alejandrofutbolista,,,A jugar!!,es,Grand Theft Auto V,1,https://static-cdn.jtvnw.net/user-default-pict...,2018-08-07 02:27:26,[]
223850627,miiigueel04,,,,es,FIFA 21,1,https://static-cdn.jtvnw.net/jtv_user_pictures...,2018-05-17 17:17:12,[]
560453787,coraje_rchiflado,,,24 🍯Amateur,es,Starlit Adventures,1,https://static-cdn.jtvnw.net/user-default-pict...,2020-07-30 22:22:50,[]
578613648,futbolnelson,,,,es,Among Us,1,https://static-cdn.jtvnw.net/jtv_user_pictures...,2020-09-05 06:50:24,[]


In [7]:
# Persist the merged table. "id" is moved from the index back to a regular
# column first, since to_feather needs a default index.
(
    df_streamers
    .reset_index()
    .to_feather("../data/streamers.feather")
)

In [58]:
# Read the merged streamers table back from disk
df = pd.read_feather(path="../data/streamers.feather")