In [27]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import networkx as nx

In [28]:
# Read the influencer CSV (path is relative to the notebook's working directory)
path = 'social media influencers-TIKTOK - ---DEC 2022 (1).csv'
df_dec = pd.read_csv(path)

# Quick inspection, printed in this order:
#   1. the first few rows
#   2. summary statistics
#   3. missing-value count per column
#   4. data type of each column
for report in (
    df_dec.head(),
    df_dec.describe(),
    df_dec.isnull().sum(),
    df_dec.dtypes,
):
    print(report)

   Rank Tiktoker name Tiktok name followers views(avg) likes(avg.)  \
0     1       mrbeast     MrBeast     60.3M      29.2M        3.5M   
1     2        karolg     Karol G     42.4M      23.7M        3.4M   
2     3         yzn47  يزن الأسمر      8.9M      48.9M      998.4K   
3     4    centralcee  CentralCee      4.4M      19.8M        3.6M   
4     5      adinross        adin      6.1M      21.1M        3.3M   

  comments(avg.) shares(avg.)  
0          30.8K         7.2K  
1          21.7K        25.7K  
2          16.3K        60.9K  
3          23.3K        24.2K  
4          17.5K        25.3K  
              Rank
count  1000.000000
mean    500.500000
std     288.819436
min       1.000000
25%     250.750000
50%     500.500000
75%     750.250000
max    1000.000000
Rank              0
Tiktoker name     0
Tiktok name       0
followers         0
views(avg)        0
likes(avg.)       0
comments(avg.)    0
shares(avg.)      0
dtype: int64
Rank               int64
Tiktoker name     

In [29]:
#Cleaning

# Function to convert string values like '60.3M' to numerical values
def convert_to_numeric(x):
    """Convert an abbreviated count such as '60.3M' or '998.4K' to a float.

    Parameters
    ----------
    x : str, number or None
        A string made of a number optionally followed by a magnitude suffix:
        'K' (thousand), 'M' (million) or 'B' (billion), in either case and
        with optional surrounding whitespace. Values that are not strings
        (already-converted numbers, NaN) are passed through ``float()``;
        ``None`` becomes NaN.

    Returns
    -------
    float
        The value scaled by its suffix, or the plain float when no suffix
        is present.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed by ``float()``.
    """
    if x is None:
        return float('nan')

    # Non-string input (NaN from missing cells, or floats produced by an
    # earlier run of the cleaning cell) has no suffix to strip; `'M' in x`
    # would raise TypeError on it, so convert directly instead.
    if not isinstance(x, str):
        return float(x)

    suffix_multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    text = x.strip()
    if text and text[-1].upper() in suffix_multipliers:
        return float(text[:-1]) * suffix_multipliers[text[-1].upper()]
    return float(text)

# Apply the conversion function to the relevant columns.
# Columns that are already numeric are skipped: the conversion is written for
# the abbreviated strings read from the CSV, so this guard keeps the cell safe
# to re-run after the columns have been converted once.
count_columns = ['followers', 'views(avg)', 'likes(avg.)', 'comments(avg.)', 'shares(avg.)']
for column in count_columns:
    if not pd.api.types.is_numeric_dtype(df_dec[column]):
        df_dec[column] = df_dec[column].apply(convert_to_numeric)

# Check for any missing values
missing_values = df_dec.isnull().sum()

# Summary statistics for the cleaned data
summary_stats = df_dec.describe()

# Create tables for the most popular TikTokers by views, likes, comments,
# shares, and followers (top 5 rows each; later cells display these by name)
most_views = df_dec.nlargest(5, 'views(avg)')[['Tiktoker name', 'views(avg)']]
most_likes = df_dec.nlargest(5, 'likes(avg.)')[['Tiktoker name', 'likes(avg.)']]
most_comments = df_dec.nlargest(5, 'comments(avg.)')[['Tiktoker name', 'comments(avg.)']]
most_shares = df_dec.nlargest(5, 'shares(avg.)')[['Tiktoker name', 'shares(avg.)']]
most_followers = df_dec.nlargest(5, 'followers')[['Tiktoker name', 'followers']]


# Compiling the results. 'most_followers' is included so that every table
# computed above is part of the summary.
cleaning_results = {
    'missing_values': missing_values,
    'summary_stats': summary_stats,
    'most_views': most_views,
    'most_likes': most_likes,
    'most_comments': most_comments,
    'most_shares': most_shares,
    'most_followers': most_followers,
}

cleaning_results


{'missing_values': Rank              0
 Tiktoker name     0
 Tiktok name       0
 followers         0
 views(avg)        0
 likes(avg.)       0
 comments(avg.)    0
 shares(avg.)      0
 dtype: int64,
 'summary_stats':               Rank     followers    views(avg)   likes(avg.)  comments(avg.)  \
 count  1000.000000  1.000000e+03  1.000000e+03  1.000000e+03     1000.000000   
 mean    500.500000  8.071826e+06  3.111146e+06  3.384920e+05     2180.803000   
 std     288.819436  1.215482e+07  3.209477e+06  3.402547e+05     3406.895917   
 min       1.000000  1.030000e+04  5.319000e+05  2.320000e+04        0.000000   
 25%     250.750000  1.800000e+06  1.600000e+06  1.746000e+05      816.000000   
 50%     500.500000  4.400000e+06  2.300000e+06  2.502500e+05     1400.000000   
 75%     750.250000  9.525000e+06  3.300000e+06  3.795250e+05     2300.000000   
 max    1000.000000  1.531000e+08  4.890000e+07  3.600000e+06    64800.000000   
 
        shares(avg.)  
 count    1000.00000  
 mean

In [30]:
# Plain-text view of the top-views table; most_views was built with
# nlargest(5, 'views(avg)'), so head() returns every row of it.
print(most_views.head())


              Tiktoker name  views(avg)
2                     yzn47  48900000.0
0                   mrbeast  29200000.0
7  yailinlamasviraloficial_  25000000.0
1                    karolg  23700000.0
6                mishayoung  21500000.0


In [31]:
# Display the top-likes table (built with nlargest(5, 'likes(avg.)')).
most_likes.head()

Unnamed: 0,Tiktoker name,likes(avg.)
3,centralcee,3600000.0
0,mrbeast,3500000.0
1,karolg,3400000.0
4,adinross,3300000.0
5,thebrandonrobert,2900000.0


In [32]:
# Display the top-comments table (built with nlargest(5, 'comments(avg.)')).
most_comments.head()


Unnamed: 0,Tiktoker name,comments(avg.)
41,xelitobelek,64800.0
11,enhypen,35000.0
0,mrbeast,30800.0
3,centralcee,23300.0
1,karolg,21700.0


In [33]:
# Display the top-shares table (built with nlargest(5, 'shares(avg.)')).
most_shares.head()

Unnamed: 0,Tiktoker name,shares(avg.)
136,hf37777,80400.0
2,yzn47,60900.0
5,thebrandonrobert,50500.0
14,im.camber,37500.0
176,sromero29,34400.0


In [34]:
# Display the top-followers table (built with nlargest(5, 'followers')).
most_followers.head()

Unnamed: 0,Tiktoker name,followers
16,khaby.lame,153100000.0
37,charlidamelio,149000000.0
91,bellapoarch,92600000.0
168,addisonre,88700000.0
126,willsmith,73000000.0


In [35]:

# Rescale followers, likes and views with min/max scaling.
# NOTE: this overwrites the three columns of df_dec in place, so every later
# cell sees the scaled values rather than the raw counts.
scaler = MinMaxScaler()
metrics_columns = ['followers', 'likes(avg.)', 'views(avg)']

# Learn each column's min/max first, then apply the scaling in a second step
scaler.fit(df_dec[metrics_columns])
df_dec[metrics_columns] = scaler.transform(df_dec[metrics_columns])

df_dec.head(10)


Unnamed: 0,Rank,Tiktoker name,Tiktok name,followers,views(avg),likes(avg.),comments(avg.),shares(avg.)
0,1,mrbeast,MrBeast,0.393819,0.592707,0.972042,30800.0,7200.0
1,2,karolg,Karol G,0.276895,0.478995,0.944084,21700.0,25700.0
2,3,yzn47,يزن الأسمر,0.058069,1.0,0.272646,16300.0,60900.0
3,4,centralcee,CentralCee,0.028674,0.398364,1.0,23300.0,24200.0
4,5,adinross,adin,0.039779,0.425241,0.916126,17500.0,25300.0
5,6,thebrandonrobert,Brandon Robert,0.075705,0.26191,0.804294,5700.0,50500.0
6,7,mishayoung,Міша Городецький 🇺🇦,0.029327,0.433511,0.664505,16300.0,24500.0
7,8,yailinlamasviraloficial_,Yailin La Más Viral,0.039779,0.505873,0.468799,16600.0,5300.0
8,9,daniel.labelle,Daniel LaBelle,0.185445,0.373554,0.496757,7500.0,19000.0
9,10,amauryguichon,Amaury Guichon,0.120124,0.323935,0.440841,7800.0,21400.0


In [36]:
# Per-metric similarity thresholds, passed to is_similar as the largest
# relative difference at which two influencers still count as similar
followers_threshold = 0.1
likes_threshold = 0.1
views_threshold = 0.1

# Undirected graph with one node per DataFrame row, keyed by the row's index label
G = nx.Graph()

# Register all influencers in a single call; each node carries the influencer's
# name and the three metrics as attributes
G.add_nodes_from(
    (
        idx,
        {
            'name': row['Tiktoker name'],
            'followers': row['followers'],
            'likes': row['likes(avg.)'],
            'views': row['views(avg)'],
        },
    )
    for idx, row in df_dec.iterrows()
)

# Function to check similarity 
def is_similar(value1, value2, threshold):
    """Return True when two values differ by at most `threshold`, relatively.

    The relative difference is ``|value1 - value2|`` divided by the larger of
    the two magnitudes.

    Parameters
    ----------
    value1, value2 : float
        The values to compare.
    threshold : float
        Largest relative difference that still counts as similar.

    Returns
    -------
    bool
        True if the relative difference is less than or equal to `threshold`.
    """
    # Normalise by the larger magnitude (not the larger signed value) so the
    # ratio is never negative when the inputs are negative.
    scale = max(abs(value1), abs(value2))

    # Both values are exactly zero: they are identical, so the relative
    # difference is taken as 0 instead of dividing by zero.
    if scale == 0:
        return 0.0 <= threshold

    return abs(value1 - value2) / scale <= threshold

# Connect every unordered pair of influencers that is similar in followers,
# likes, or views.
#
# An nx.Graph holds at most one edge per node pair, and calling add_edge again
# on an existing edge replaces its attributes. All matching metrics are
# therefore collected first and stored together in a single comma-separated
# `reason` string (e.g. 'followers,views'), so no match is lost.
metric_thresholds = {
    'followers': followers_threshold,
    'likes': likes_threshold,
    'views': views_threshold,
}

# Snapshot the nodes once; pairing each node only with the nodes after it
# visits every unordered pair exactly once.
node_items = list(G.nodes(data=True))
for position, (idx1, attrs1) in enumerate(node_items):
    for idx2, attrs2 in node_items[position + 1:]:
        reasons = [
            metric
            for metric, threshold in metric_thresholds.items()
            if is_similar(attrs1[metric], attrs2[metric], threshold)
        ]
        if reasons:
            G.add_edge(idx1, idx2, reason=','.join(reasons))

nx.write_graphml(G, 'influencer_network.graphml')

# Build a DataFrame with one row per graph node and one column per node attribute
nodes_df = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')

# Ten highest-ranked influencers for each metric
top_followers = nodes_df.sort_values(by='followers', ascending=False).iloc[:10]
top_likes = nodes_df.sort_values(by='likes', ascending=False).iloc[:10]
top_views = nodes_df.sort_values(by='views', ascending=False).iloc[:10]

# Print each ranking under its heading, showing only the name and the metric
for heading, table, metric in (
    ("Top 10 Influencers by Followers:", top_followers, 'followers'),
    ("\nTop 10 Influencers by Likes:", top_likes, 'likes'),
    ("\nTop 10 Influencers by Views:", top_views, 'views'),
):
    print(heading)
    print(table[['name', metric]])


Top 10 Influencers by Followers:
                name  followers
16        khaby.lame   1.000000
37     charlidamelio   0.973218
91       bellapoarch   0.604807
168        addisonre   0.579332
126        willsmith   0.476777
99          zachking   0.470898
30   kimberly.loaiza   0.458487
469         cznburak   0.433012
59           therock   0.419948
62          domelipa   0.404924

Top 10 Influencers by Likes:
                name     likes
3         centralcee  1.000000
0            mrbeast  0.972042
1             karolg  0.944084
4           adinross  0.916126
5   thebrandonrobert  0.804294
6         mishayoung  0.664505
12         conangray  0.608589
19          juandamc  0.552673
10      urbantheory_  0.524715
13     elissadeheart  0.524715

Top 10 Influencers by Views:
                        name     views
2                      yzn47  1.000000
0                    mrbeast  0.592707
7   yailinlamasviraloficial_  0.505873
1                     karolg  0.478995
6                 m

In [37]:
# Closeness centrality of every node in the similarity graph
closeness_centrality = nx.closeness_centrality(G)

# Same mapping, re-ordered from the highest to the lowest centrality score
sorted_centrality = dict(
    sorted(closeness_centrality.items(), key=lambda pair: pair[1], reverse=True)
)

# Report the ten most central influencers, looking up each name by node id
print("Top Influencers by Closeness Centrality:")
for idx, score in list(sorted_centrality.items())[:10]:
    print(f"{df_dec.loc[idx, 'Tiktoker name']}: {score}")


Top Influencers by Closeness Centrality:
noholito: 0.5900767867690491
itsceceh: 0.5893805309734513
kunaguero: 0.5890330188679245
murphslife: 0.588339222614841
herranwalt: 0.588339222614841
esen_alva: 0.588339222614841
axelviewoficial: 0.5873015873015873
douhalaribiii: 0.5873015873015873
thejoeyswoll: 0.5866118614210217
andresgjohnson: 0.5862676056338029
