In [58]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')
from scipy import stats
import math 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

Hemos elegido dos datasets sobre el top de influencers de instagram.


In [59]:
df= pd.read_csv('datos/top_insta_influencers_data.csv')
df.reset_index(drop= True, inplace= True)
df.head()


Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   rank               200 non-null    int64 
 1   channel_info       200 non-null    object
 2   influence_score    200 non-null    int64 
 3   posts              200 non-null    object
 4   followers          200 non-null    object
 5   avg_likes          200 non-null    object
 6   60_day_eng_rate    200 non-null    object
 7   new_post_avg_like  200 non-null    object
 8   total_likes        200 non-null    object
 9   country            138 non-null    object
dtypes: int64(2), object(8)
memory usage: 15.8+ KB


In [61]:
df1 = pd.read_csv('datos/instagram_global_top_1000.csv')
df1.head()


Unnamed: 0,Country,Rank,Account,Title,Link,Category,Followers,Audience Country,Authentic engagement,Engagement avg,Scraped
0,All,1,cristiano,Cristiano Ronaldo,https://www.instagram.com/cristiano/,Sports with a ball,400100000.0,India,7800000.0,9500000.0,2022-02-07 16:50:24.798803
1,All,2,kyliejenner,Kylie 🤍,https://www.instagram.com/kyliejenner/,Fashion|Modeling|Beauty,308800000.0,United States,6200000.0,10100000.0,2022-02-07 16:50:24.798803
2,All,3,leomessi,Leo Messi,https://www.instagram.com/leomessi/,Sports with a ball|Family,306300000.0,Argentina,4800000.0,6500000.0,2022-02-07 16:50:24.798803
3,All,4,kendalljenner,Kendall,https://www.instagram.com/kendalljenner/,Modeling|Fashion,217800000.0,United States,3400000.0,5400000.0,2022-02-07 16:50:24.798803
4,All,5,selenagomez,Selena Gomez,https://www.instagram.com/selenagomez/,Music|Lifestyle,295800000.0,United States,2700000.0,3600000.0,2022-02-07 16:50:24.798803


In [62]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country               1000 non-null   object 
 1   Rank                  1000 non-null   int64  
 2   Account               1000 non-null   object 
 3   Title                 983 non-null    object 
 4   Link                  1000 non-null   object 
 5   Category              909 non-null    object 
 6   Followers             1000 non-null   float64
 7   Audience Country      993 non-null    object 
 8   Authentic engagement  1000 non-null   float64
 9   Engagement avg        1000 non-null   float64
 10  Scraped               1000 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 86.1+ KB


Vamos a seleccionar varias columnas del segundo dataset, y luego uniremos esas columnas junto con el primer dataset.

In [63]:
df1 = df1[['Account','Link', 'Category', 'Audience Country']]
df1.reset_index(drop= True, inplace= True)
df1.head()

Unnamed: 0,Account,Link,Category,Audience Country
0,cristiano,https://www.instagram.com/cristiano/,Sports with a ball,India
1,kyliejenner,https://www.instagram.com/kyliejenner/,Fashion|Modeling|Beauty,United States
2,leomessi,https://www.instagram.com/leomessi/,Sports with a ball|Family,Argentina
3,kendalljenner,https://www.instagram.com/kendalljenner/,Modeling|Fashion,United States
4,selenagomez,https://www.instagram.com/selenagomez/,Music|Lifestyle,United States


In [64]:
df_completo= df.merge(df1, left_on= 'channel_info', right_on = "Account")
df_completo.head()

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country,Account,Link,Category,Audience Country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain,cristiano,https://www.instagram.com/cristiano/,Sports with a ball,India
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States,kyliejenner,https://www.instagram.com/kyliejenner/,Fashion|Modeling|Beauty,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,,leomessi,https://www.instagram.com/leomessi/,Sports with a ball|Family,Argentina
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States,selenagomez,https://www.instagram.com/selenagomez/,Music|Lifestyle,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States,therock,https://www.instagram.com/therock/,Cinema|Actors/actresses|Fitness|Gym,India


In [65]:
df_completo.shape

(172, 14)

In [66]:
df_completo.isnull().sum()

rank                  0
channel_info          0
influence_score       0
posts                 0
followers             0
avg_likes             0
60_day_eng_rate       0
new_post_avg_like     0
total_likes           0
country              55
Account               0
Link                  0
Category              2
Audience Country      2
dtype: int64

In [67]:
df_completo.head(5)

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country,Account,Link,Category,Audience Country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain,cristiano,https://www.instagram.com/cristiano/,Sports with a ball,India
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States,kyliejenner,https://www.instagram.com/kyliejenner/,Fashion|Modeling|Beauty,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,,leomessi,https://www.instagram.com/leomessi/,Sports with a ball|Family,Argentina
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States,selenagomez,https://www.instagram.com/selenagomez/,Music|Lifestyle,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States,therock,https://www.instagram.com/therock/,Cinema|Actors/actresses|Fitness|Gym,India


In [68]:
df_completo.drop(['Account'], axis=1, inplace= True)


In [69]:
df_completo.head()

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country,Link,Category,Audience Country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain,https://www.instagram.com/cristiano/,Sports with a ball,India
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States,https://www.instagram.com/kyliejenner/,Fashion|Modeling|Beauty,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,,https://www.instagram.com/leomessi/,Sports with a ball|Family,Argentina
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States,https://www.instagram.com/selenagomez/,Music|Lifestyle,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States,https://www.instagram.com/therock/,Cinema|Actors/actresses|Fitness|Gym,India


In [70]:
df_completo.Category.unique()

array(['Sports with a ball', 'Fashion|Modeling|Beauty',
       'Sports with a ball|Family', 'Music|Lifestyle',
       'Cinema|Actors/actresses|Fitness|Gym', 'Fashion|Beauty', 'Music',
       'Music|Fashion', 'Clothing|Outfits|Lifestyle', 'Modeling|Fashion',
       'Nature  landscapes|Science|Photography',
       'Fitness|Gym|Shopping|Retail|Clothing|Outfits',
       'Music|Cinema|Actors/actresses', 'Lifestyle',
       'Cinema|Actors/actresses', 'Cinema|Actors/actresses|Fashion',
       'Music|Beauty', 'Shows|Cinema|Actors/actresses',
       'Science|Photography', 'Lifestyle|Music', 'Modeling|Lifestyle',
       'Family|Sports with a ball', 'Finance|Economics|Business|Careers',
       'Literature|Journalism|Cinema|Actors/actresses|Fashion', 'Beauty',
       'Cinema|Actors/actresses|Comics  sketches|Shows',
       'Cinema|Actors/actresses|Music', 'Shows', 'Humor|Fun|Happiness',
       'Modeling', 'Cinema|Actors/actresses|Family',
       'Cinema|Actors/actresses|Art|Artists',
       'Beaut

In [71]:
len(df_completo.Category.unique())

60

In [72]:
df_completo = df_completo.assign(Category=df_completo.Category.str.split("|")).explode('Category')

In [73]:
df_completo.head(10)

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country,Link,Category,Audience Country
0,1,cristiano,92,3.3k,475.8m,8.7m,1.39%,6.5m,29.0b,Spain,https://www.instagram.com/cristiano/,Sports with a ball,India
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States,https://www.instagram.com/kyliejenner/,Fashion,United States
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States,https://www.instagram.com/kyliejenner/,Modeling,United States
1,2,kyliejenner,91,6.9k,366.2m,8.3m,1.62%,5.9m,57.4b,United States,https://www.instagram.com/kyliejenner/,Beauty,United States
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,,https://www.instagram.com/leomessi/,Sports with a ball,Argentina
2,3,leomessi,90,0.89k,357.3m,6.8m,1.24%,4.4m,6.0b,,https://www.instagram.com/leomessi/,Family,Argentina
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States,https://www.instagram.com/selenagomez/,Music,United States
3,4,selenagomez,93,1.8k,342.7m,6.2m,0.97%,3.3m,11.5b,United States,https://www.instagram.com/selenagomez/,Lifestyle,United States
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States,https://www.instagram.com/therock/,Cinema,India
4,5,therock,91,6.8k,334.1m,1.9m,0.20%,665.3k,12.5b,United States,https://www.instagram.com/therock/,Actors/actresses,India


In [74]:
df_completo.Category.unique()

array(['Sports with a ball', 'Fashion', 'Modeling', 'Beauty', 'Family',
       'Music', 'Lifestyle', 'Cinema', 'Actors/actresses', 'Fitness',
       'Gym', 'Clothing', 'Outfits', 'Nature  landscapes', 'Science',
       'Photography', 'Shopping', 'Retail', 'Shows', 'Finance',
       'Economics', 'Business', 'Careers', 'Literature', 'Journalism',
       'Comics  sketches', 'Humor', 'Fun', 'Happiness', 'Art', 'Artists',
       nan, 'Accessories', 'Jewellery', 'Food', 'Cooking', 'Travel',
       'Cars', 'Motorbikes', 'Luxury'], dtype=object)

In [75]:
df_completo.to_csv('datos/categorias_instagram.csv')

In [76]:
df_completo.posts.unique()

array(['3.3k', '6.9k', '0.89k', '1.8k', '6.8k', '5.6k', '5.0k', '2.0k',
       '4.1k', '7.4k', '0.66k', '10.0k', '0.95k', '3.2k', '1.4k', '6.4k',
       '4.4k', '1.2k', '5.3k', '8.2k', '3.5k', '1.6k', '0.08k', '4.8k',
       '2.3k', '5.2k', '7.3k', '0.69k', '1.3k', '1.7k', '3.6k', '0.87k',
       '1.5k', '1.9k', '12.9k', '2.5k', '0.86k', '0.54k', '0.26k', '2.9k',
       '0.34k', '1.0k', '7.0k', '17.5k', '5.1k', '0.83k', '2.4k', '0.82k',
       '9.9k', '2.7k', '0.28k', '0.88k', '4.6k', '2.1k', '2.2k', '0.84k',
       '10.3k', '0.42k', '0.60k', '1.1k', '0.06k', '0.15k', '8.9k',
       '6.5k', '0.17k', '0.98k', '0.59k', '4.2k', '0.16k', '16.9k',
       '0.68k', '0.97k', '4.9k', '0.58k', '3.0k', '0.01k', '0.67k',
       '0.09k', '0.30k', '0.73k', '13.4k', '0.50k', '0.11k', '0.03k',
       '0.94k', '6.3k', '9.3k', '4.5k', '0.37k', '3.8k'], dtype=object)

In [78]:
column = ['posts', 'avg_likes', 'new_post_avg_like']

for i in column: 
    df_completo[i] = df_completo[i].str.replace("k", "")

In [79]:
column = ['total_likes']

for i in column: 
    df_completo[i] = df_completo[i].str.replace("b", "")

In [85]:
column = ['total_likes']

for i in column: 
    df_completo[i] = df_completo[i].str.replace("m", "")

In [80]:
column = ['followers', 'avg_likes', 'new_post_avg_like']

for i in column: 
    df_completo[i] = df_completo[i].str.replace("m", "")

In [86]:
df_completo.head(5)

Unnamed: 0,rank,channel_info,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country,Link,Category,Audience Country
0,1,cristiano,92,3.3,475.8,8.7,1.39%,6.5,29.0,Spain,https://www.instagram.com/cristiano/,Sports with a ball,India
1,2,kyliejenner,91,6.9,366.2,8.3,1.62%,5.9,57.4,United States,https://www.instagram.com/kyliejenner/,Fashion,United States
1,2,kyliejenner,91,6.9,366.2,8.3,1.62%,5.9,57.4,United States,https://www.instagram.com/kyliejenner/,Modeling,United States
1,2,kyliejenner,91,6.9,366.2,8.3,1.62%,5.9,57.4,United States,https://www.instagram.com/kyliejenner/,Beauty,United States
2,3,leomessi,90,0.89,357.3,6.8,1.24%,4.4,6.0,,https://www.instagram.com/leomessi/,Sports with a ball,Argentina


In [87]:
df_completo.total_likes.unique()

array(['29.0', '57.4', '6.0', '11.5', '12.5', '19.9', '18.4', '7.4',
       '9.8', '13.9', '3.7', '3.0', '313.6', '5.3', '4.9', '13.5', '7.7',
       '1.6', '14.1', '1.5', '4.3', '20.6', '5.0', '91.3', '17.9', '4.2',
       '6.8', '9.0', '3.4', '11.6', '5.9', '2.6', '2.3', '2.5', '5.6',
       '5.1', '2.0', '8.2', '1.9', '2.8', '3.5', '4.8', '8.8', '4.4',
       '3.3', '419.0', '6.6', '639.0', '1.7', '5.4', '3.8', '2.9', '2.2',
       '6.7', '1.1', '1.8', '10.1', '1.3', '669.3', '3.6', '8.9', '453.6',
       '421.7', '2.4', '987.4', '6.1', '654.4', '1.0', '824.0', '283.0',
       '753.1', '2.7', '773.5', '3.2', '1.2', '925.7', '820.1', '782.5',
       '949.9', '710.7', '67.5', '767.0', '452.0', '957.9', '500.5',
       '563.9', '1.4', '199.8', '594.7', '672.1', '955.4', '730.1',
       '18.7', '739.5', '810.0', '561.3', '21.2', '4.6', '3.1', '391.9',
       '969.1'], dtype=object)

In [83]:
df_completo.dtypes

rank                  int64
channel_info         object
influence_score       int64
posts                object
followers            object
avg_likes            object
60_day_eng_rate      object
new_post_avg_like    object
total_likes          object
country              object
Link                 object
Category             object
Audience Country     object
dtype: object

Transformamos algunas columnas a float.

In [88]:
df_completo = df_completo.astype({'posts':'float64', 'followers':'float64', 'avg_likes':'float64', 'new_post_avg_like':'float64', 'total_likes': 'float64'})

In [89]:
df_completo.dtypes

rank                   int64
channel_info          object
influence_score        int64
posts                float64
followers            float64
avg_likes            float64
60_day_eng_rate       object
new_post_avg_like    float64
total_likes          float64
country               object
Link                  object
Category              object
Audience Country      object
dtype: object

Renombramos algunas columnas.

In [90]:
df_completo.rename(columns={'channel_info':'influencer', 'Link':'link_ig', 'Category':'category', 'Audience Country':'audience_country'}, inplace = True)

In [91]:
df_completo.head(2)

Unnamed: 0,rank,influencer,influence_score,posts,followers,avg_likes,60_day_eng_rate,new_post_avg_like,total_likes,country,link_ig,category,audience_country
0,1,cristiano,92,3.3,475.8,8.7,1.39%,6.5,29.0,Spain,https://www.instagram.com/cristiano/,Sports with a ball,India
1,2,kyliejenner,91,6.9,366.2,8.3,1.62%,5.9,57.4,United States,https://www.instagram.com/kyliejenner/,Fashion,United States


In [92]:
df_completo.to_csv('datos/top_influencers.csv')

In [None]:
# Ojo dataset categorías
# total likes