In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file under the read-only Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import scipy.stats

In [6]:
# Read the three CSV files of the "ig-data" input dataset, in the order
# followers_views, usersData, likes_views, and bind them to flwrs / usrs / likes.
flwrs, usrs, likes = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ig-data/followers_views.csv",
        "../input/ig-data/usersData.csv",
        "../input/ig-data/likes_views.csv",
    )
)

In [7]:
# Preview the first five rows of the followers/views table.
flwrs.head(n=5)

In [8]:
# Preview the first five rows of the per-user table.
usrs.head(n=5)

In [9]:
# Preview the first five rows of the likes/views table.
likes.head(n=5)

In [10]:
# Count missing values per column of the followers/views table.
flwrs.isna().sum()

In [11]:
# Count missing values per column of the per-user table.
usrs.isna().sum()

In [12]:
# Count missing values per column of the likes/views table.
likes.isna().sum()

All three dataframes are loaded and none of them contains null values. Next, let's reproduce the graphs from the paper:
1. number of Instagrammers per log average views (histogram)
2. views versus followers
3. views versus likes


In [13]:
# Histogram of the log-transformed average views per user.
# NOTE: `df` is a pandas Series, not a DataFrame; the name is kept because the
# distribution-fit cell further down reads it.
df = np.log(usrs.avg_views)

ax = df.hist(bins=100)

# The values are already log-transformed, so the x-axis stays linear.
# Putting a log scale on top would apply the logarithm twice and distort the
# shape that the paper describes as close to normal.
ax.set_xlabel("Log average views")
ax.set_ylabel("Number of instagrammers")

# Mark the mean of the log-values with a dashed red line.
ax.axvline(x=df.mean(), linestyle='--', color='red')

# Arithmetic mean of the raw views.
print(usrs.avg_views.mean())
# exp(mean(log(views))) is the *geometric* mean of the raw views. It differs
# from the arithmetic mean above because averaging and log/exp do not commute;
# exp(log(mean(views))) would simply return the arithmetic mean again.
print(np.exp(df.mean()))

plt.show()

From the paper, we can fit a normal distribution to this data:
"Furthermore, as this distribution is so close to normal, we ascertain that our selection of sampled Instagrammers is a good semblance of real-world
influence with micro-influencers populating the dense mean and casual users and celebrities appearing at the distribution extremes"

In [14]:
import seaborn as sns
from scipy.stats import norm

# Density histogram of the log-views with a maximum-likelihood Gaussian overlaid.
# sns.distplot is deprecated (scheduled for removal in seaborn 0.14), so the
# histogram and the fitted curve are drawn explicitly with matplotlib and scipy,
# which works regardless of the installed seaborn version.
mu, sigma = norm.fit(df)

fig, ax = plt.subplots()
ax.hist(df, bins="auto", density=True, alpha=0.4)

# Evaluate the fitted normal density across the observed range of log-views.
grid = np.linspace(df.min(), df.max(), 200)
ax.plot(grid, norm.pdf(grid, loc=mu, scale=sigma), color="black")

ax.set_xlabel("Log average views")
ax.set_ylabel("Density")
plt.show()

For the statistics, we need to adjust the data used according to the paper:
"To avoid these sorts of odd behaviors, we performed univariate
outliers removal, ignoring the top and bottom posts for users with
posts statistics above 2 standard deviations."

In [15]:
from scipy.stats import zscore

# Z-score every view count against the whole `views` column, then keep only the
# rows lying strictly within two standard deviations of the mean.
views_z = scipy.stats.zscore(flwrs.views)
within_two_sd = np.abs(views_z) < 2
new_df = flwrs[within_two_sd]

# Mean views before and after the outlier filter, for comparison.
for frame in (flwrs, new_df):
    print(frame.views.mean())

In [16]:
# Log-log scatter of views against followers for the outlier-filtered rows.
fig, ax = plt.subplots()
ax.scatter(new_df.followers, new_df.views)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Followers")
ax.set_ylabel("Views")

# Fixed axis windows: 0.1 to 1e9 followers, 0.1 to 1e8 views.
ax.set_xlim(0.1, 10**9)
ax.set_ylim(0.1, 10**8)

plt.show()

It seems the paper used all of the data for these graphs, without removing the outliers beyond 2 standard deviations. Let's therefore plot the unfiltered data as well:

In [22]:
# Figure 1: log-log scatter of views against followers using every row
# (no outlier filtering).
fig, ax = plt.subplots()
ax.scatter(flwrs.followers, flwrs.views)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Followers")
ax.set_ylabel("Views")
ax.set_xlim(0.1, 10**9)
ax.set_ylim(0.1, 10**8)
plt.show()

# Figure 2: the same points with a degree-1 least-squares polynomial
# (views = m * followers + b) drawn on top. The fit is computed on the raw
# values; only the axes are logarithmic.
x, y = flwrs.followers, flwrs.views
m, b = np.polyfit(x, y, 1)

fig, ax = plt.subplots()
ax.plot(x, y, '.')
ax.plot(x, m*x + b)
ax.set_ylim(0.1, 0.5*10**8)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Followers")
ax.set_ylabel("Views")

plt.show()

In [24]:
# Same two-standard-deviation filter as for the followers table, applied to the
# `views` column of the likes table.
likes_views_z = scipy.stats.zscore(likes.views)
new_df_likes_views = likes[np.abs(likes_views_z) < 2]

# Mean views before and after the outlier filter.
print(likes.views.mean())
print(new_df_likes_views.views.mean())

In [31]:
# Figure 1: views against likes for the outlier-filtered rows, log-log axes.
fig, ax = plt.subplots()
ax.plot(new_df_likes_views.likes, new_df_likes_views.views, ".")
ax.set_xlabel("Engagement")
ax.set_ylabel("Views")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim(0.1, 10**7)
ax.set_ylim(0.1, 10**7)
plt.show()

# Figure 2: the same plot for every row (no outlier filtering); the y-window
# extends to 1e8 here.
fig, ax = plt.subplots()
ax.plot(likes.likes, likes.views, ".")
ax.set_xlabel("Engagement")
ax.set_ylabel("Views")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim(0.1, 10**7)
ax.set_ylim(0.1, 10**8)
plt.show()

# Figure 3: all rows plus a degree-1 least-squares polynomial
# (views = m * likes + b) fitted on the raw values.
x, y = likes.likes, likes.views
m, b = np.polyfit(x, y, 1)

fig, ax = plt.subplots()
ax.plot(x, y, '.')
ax.plot(x, m*x + b)
ax.set_ylim(0.1, 10**8)
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Likes")
ax.set_ylabel("Views")
plt.show()

In [37]:
# Scatter of views against likes with seaborn's linear regression line.
# ci=None means no confidence band is drawn.
# The columns are read straight from `likes` instead of the notebook-level
# x / y, because `y` is rebound by a later cell and would silently mismatch
# if cells were re-run out of order.
g = sns.regplot(x=likes.likes, y=likes.views, fit_reg=True, ci=None)

# Axes.set takes keyword arguments (xscale=..., xlim=(lo, hi)); calling
# ylim(...) / xscale(...) as bare functions raises NameError.
# Scales are set before the limits, and the lower limits are 0.1 rather than 0
# because a logarithmic axis cannot start at zero.
g.set(xscale="log", yscale="log")
g.set(xlim=(0.1, 10**7), ylim=(0.1, 5*10**7))

plt.show()

In [43]:
import sklearn
from sklearn.model_selection import train_test_split

# Features: every column of `usrs` except the target; target: average views.
# `columns=` replaces the positional axis argument (`drop("avg_views", 1)`),
# which pandas 2.0 no longer accepts.
X, y = usrs.drop(columns="avg_views"), usrs.avg_views

# Seed the 80/20 split so the scores below are reproducible on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a random-forest regressor on the training portion.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# R^2 on the training data, then on the held-out test data.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))


In [44]:
from sklearn.linear_model import Ridge

# Fit a ridge regression on the same train/test split (this rebinds `model`).
model = Ridge(random_state=0)
model.fit(X_train, y_train)

# R^2 on the training data, then on the held-out test data.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, target))