In [92]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
df1 = pd.read_csv("Datasets/instagram_reach.csv")
df1.sample()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
96,9,21,dvlp_search,Credit @tristankappel To find more dvlp follow...,450,#workspace #work #developer#development #devel...,3 hours,42


In [3]:
df1.reset_index(drop=True)
df1["Time"] = df1["Time since posted"].apply(lambda x: int(x.split(" ")[0]))
df1.drop(columns=["Unnamed: 0","S.No", "Caption", "USERNAME","Time since posted"], inplace=True)

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Followers  100 non-null    int64 
 1   Hashtags   100 non-null    object
 2   Likes      100 non-null    int64 
 3   Time       100 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 3.2+ KB


In [5]:
df1.describe()

Unnamed: 0,Followers,Likes,Time
count,100.0,100.0,100.0
mean,961.96,46.48,3.46
std,1014.62567,55.08698,3.394648
min,11.0,8.0,2.0
25%,252.75,19.0,2.0
50%,612.0,29.0,2.0
75%,1197.0,46.0,3.0
max,4496.0,349.0,24.0


In [6]:
print(df1.loc[:0]["Hashtags"])

0    #MachineLearning #AI #DataAnalytics #DataScien...
Name: Hashtags, dtype: object


In [7]:
def convert_hastag(text):
    ps = PorterStemmer()
    wordnet = WordNetLemmatizer()
    
    tags = nltk.sent_tokenize(text)
    corpus = []
    tag = re.sub("[^a-zA-Z]", " ", text)
    tag = tag.lower()
    tag = tag.split()
#     tag = [wordnet.lemmatize(word) for word in tag if not word in set(stopwords.words("english"))]
    
    return tag

convert_hastag(df1["Hashtags"][0])

['machinelearning', 'ai', 'dataanalytics', 'datascienc', 'datalake']

In [8]:
hastag = pd.DataFrame()
df1["Hashtags"]= df1["Hashtags"].apply(convert_hastag)

In [9]:
print(df1[["Hashtags"]])

                                             Hashtags
0   [machinelearning, ai, dataanalytics, datascien...
1   [deck, mac, macintosh, sayhello, apple, stevej...
2   [whoiswho, aitrading, ai, aitradingteam, insta...
3   [iot, cre, workplace, cdo, bigdata, technology...
4   [instamachinelearning, instabigdata, instamark...
..                                                ...
95  [beverlyhills, realestate, losangelesrealestat...
96  [workspace, work, developer, development, deve...
97  [books, book, motivation, inspiration, life, b...
98  [heavyequipment, underconstruction, dozer, rea...
99  [marketing, programming, development, desarrol...

[100 rows x 1 columns]


In [10]:
unique_hashtag = set(tag for row in df1["Hashtags"] for tag in row)
len(unique_hashtag)

1155

In [11]:
for hashtag in unique_hashtag:
    df1[hashtag] = 0 # initialize 0 value for every hashtag


In [12]:
for index, row in enumerate(df1["Hashtags"]): # enumerate hashtags 
    for tag in row: # Row wise hashtag 
        df1.at[index, tag] = 1 # appply 1 value when index and tag match 


In [13]:
df1.drop(columns=["Hashtags"], inplace=True)

In [18]:
x1 = df1.drop(columns=["Likes", "Time"])
# x1 = np.array(x1)

In [31]:
# y_like = df1['Likes']
# y_time = df1['Time']
y1 = df1[["Likes", "Time"]]

In [83]:
# x1_train, x1_test, y_like_train, y_like_test, y_time_train, y_time_test = train_test_split(x1, y_like, y_time, test_size=0.2, random_state=11)
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.1)

In [84]:
print(f"X_train shape = {x1_train.shape}")
print(f"X_test shape = {x1_test.shape}")
print(f"y_train shape = {y1_train.shape}")
print(f"y_test shape = {y1_test.shape}")

X_train shape = (90, 1156)
X_test shape = (10, 1156)
y_train shape = (90, 2)
y_test shape = (10, 2)


In [88]:
y1_test

Unnamed: 0,Likes,Time
72,20,2
38,18,2
15,53,3
65,29,2
89,16,2
27,148,20
2,25,2
24,10,2
20,198,5
97,10,3


In [93]:
rf = RandomForestRegressor()

In [94]:
rf.fit(x1_train, y1_train)

RandomForestRegressor()

In [101]:
y1_pred = rf.predict(x1_test)
y1_pred

array([[31.85,  2.43],
       [26.41,  2.15],
       [31.57,  2.48],
       [38.67,  3.01],
       [32.08,  2.15],
       [66.26,  4.69],
       [23.76,  3.24],
       [30.75,  3.71],
       [35.94,  3.27],
       [18.37,  2.18]])

In [96]:
mse1 = mean_squared_error(y1_test, y_pred)
mse1

1735.6771250000002

In [97]:
rmse1 = np.sqrt(mse)
rmse1

41.66145850783431

In [100]:
r2score1 = r2_score(y1_test, y_pred)
r2score1

0.12314420265467868