Before our analysis we collected several datasets with the src.data.make_datasets.py script.
In this notebook we merge all of them into one so that we can carry out our analysis on it.

In [46]:
import os
import pandas as pd
import numpy as np
from ast import literal_eval

# Folder holding the per-streamer CSV exports, and the files to load from it
data_folder = "../data"
datasets = ["auron.csv", "data_ibai.csv", "jaimeAltozano.csv", "rubius.csv"]

# Read every export and stack them into a single frame.
# lineterminator='\n' tells the parser to break the file into rows on '\n'.
dfs = [
    pd.read_csv(os.path.join(data_folder, dataset_name), lineterminator='\n')
    for dataset_name in datasets
]
df_streamers = pd.concat(dfs)

# Remove duplicates and preprocess.
# The ascending sort places NaN counts last, so for every duplicated id the
# row kept by keep='first' is the one with the smallest known
# (num_followers, view_count); rows with missing counts lose to rows with values.
# NOTE(review): confirm that keeping the smallest counts (rather than the
# largest) is what is wanted for duplicated streamers.
df_streamers = (
    df_streamers
    .sort_values(["num_followers", "view_count"])
    .reset_index(drop=True)
    .drop_duplicates(subset=["id"], keep="first")
    # Final ordering: biggest streamers first
    .sort_values(["num_followers", "view_count"], ascending=False)
    # The datetime dtype needs an explicit unit: pandas >= 2.0 raises on the
    # unit-less "datetime64" spelling, while "datetime64[ns]" works on 1.x too.
    .astype({"id": str, "created_at": "datetime64[ns]"})
)

# Clean user_follows and convert each cell from its text form to a Python object
def _parse_user_follows(raw):
    """Turn one raw ``user_follows`` cell into a Python object.

    Parameters
    ----------
    raw : object
        Cell value as read from the CSV; expected to be the text form of a
        Python literal, possibly containing stray carriage returns.

    Returns
    -------
    object or None
        The evaluated literal, or ``None`` when the cell is not a string
        (e.g. NaN) or is blank once carriage returns and surrounding
        whitespace are removed.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    if not isinstance(raw, str):
        # Missing values arrive as NaN, not as text
        return None
    cleaned = raw.replace("\r", "").strip()
    if not cleaned:
        return None
    return literal_eval(cleaned)


# Blank and missing cells are handled explicitly per cell instead of through
# Series.replace("", None): on pandas < 1.4 that call pad-fills, so a blank
# cell would silently inherit the previous row's follow list.
df_streamers["user_follows"] = df_streamers["user_follows"].apply(_parse_user_follows)


# Remove those ids in user_follows that are not in df_streamers.id.
# NOTE(review): ids are cast to str earlier in this cell, so a followed id only
# matches when it was parsed as a str as well — confirm against the CSV contents.
known_ids = set(df_streamers["id"])


def _known_follows(follows):
    """Return the entries of ``follows`` that are ids of streamers in the dataset.

    Parameters
    ----------
    follows : list, tuple, set, scalar or None
        Parsed ``user_follows`` cell of one streamer.

    Returns
    -------
    list
        A new list (one distinct object per call) holding the followed ids
        found in ``known_ids``, in their original order. Missing cells
        (``None`` / NaN) and cells without any known id give ``[]``.
    """
    if isinstance(follows, (list, tuple, set)):
        candidates = follows
    elif follows is None or pd.isna(follows):
        candidates = ()
    else:
        # A lone scalar is treated as a one-element collection
        candidates = (follows,)
    return [followed for followed in candidates if followed in known_ids]


# Every row receives its own list; building the empty rows with `[[]] * n`
# would store one shared list object in all of them.
df_streamers["user_follows"] = df_streamers["user_follows"].apply(_known_follows)
df_streamers = df_streamers.set_index("id")

In [47]:
# Persist the merged dataset. The id index is moved back into a regular
# column first, so the frame is written with a default index.
streamers_flat = df_streamers.reset_index()
streamers_flat.to_feather("../data/streamers.feather")

In [48]:
# Read the saved file back and display it to check the round trip
streamers_reloaded = pd.read_feather("../data/streamers.feather")
streamers_reloaded

Unnamed: 0,id,name,num_followers,broadcaster_type,description,lang,last_game_played_name,view_count,profile_image_url,created_at,user_follows
0,459331509,auronplay,10874205.0,partner,feliz pero no mucho,es,Minecraft,212166590,https://static-cdn.jtvnw.net/jtv_user_pictures...,2019-09-03 14:02:49,"[210708721, 431460701, 77649106, 130065491, 42..."
1,39276140,Rubius,10483570.0,partner,se contar hasta patata,es,Propnight,222921576,https://static-cdn.jtvnw.net/jtv_user_pictures...,2013-01-12 18:46:56,"[91136321, 198363811, 210708721, 133528221, 70..."
2,48878319,TheGrefg,8708686.0,partner,"Hola, me llamo David, me quedé calvo delante d...",es,Five Nights at Freddy's,197916916,https://static-cdn.jtvnw.net/jtv_user_pictures...,2013-09-12 00:45:27,"[143776262, 675347177, 33734881, 248222879, 18..."
3,83232866,ibai,8536480.0,partner,Si lees esto que sepas que te aprecio,es,VALORANT,286994651,https://static-cdn.jtvnw.net/jtv_user_pictures...,2015-02-20 16:47:56,"[35980866, 145908612, 205218019, 91136321, 524..."
4,121510236,juansguarnizo,6013625.0,partner,"Juan Sebastián Guarnizo Algarra, más conocido ...",es,Just Chatting,127964660,https://static-cdn.jtvnw.net/jtv_user_pictures...,2016-04-11 03:34:11,"[43419527, 517536651, 476005292, 109492660, 55..."
...,...,...,...,...,...,...,...,...,...,...,...
75625,246096970,alejandrofutbolista,,,A jugar!!,es,Grand Theft Auto V,1,https://static-cdn.jtvnw.net/user-default-pict...,2018-08-07 02:27:26,[]
75626,223850627,miiigueel04,,,,es,FIFA 21,1,https://static-cdn.jtvnw.net/jtv_user_pictures...,2018-05-17 17:17:12,[]
75627,560453787,coraje_rchiflado,,,24 🍯Amateur,es,Starlit Adventures,1,https://static-cdn.jtvnw.net/user-default-pict...,2020-07-30 22:22:50,[]
75628,578613648,futbolnelson,,,,es,Among Us,1,https://static-cdn.jtvnw.net/jtv_user_pictures...,2020-09-05 06:50:24,[]
