# Question-1

Imagine you have a dataset where you have different Instagram features
like u sername , Caption , Hashtag , Followers , Time_Since_posted , and likes , now your task is
to predict the number of likes and Time Since posted and the rest of the features are
your input features. Now you have to build a model which can predict the
number of likes and Time Since posted.

Data_link - https://www.kaggle.com/datasets/rxsraghavagrawal/instagram-reach

In [1]:
#              Importing necessary libraries
import numpy as np     # for numerical operation
import pandas as pd    # for data analysis
import seaborn as sns    # for visualisation
import matplotlib.pyplot as plt   # for visualisation
%matplotlib inline

from sklearn.model_selection import train_test_split    # splitting data
#             Importing nessary ML algorithms
from sklearn.linear_model import LinearRegression        
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('instagram_reach.csv')   #read the data

In [3]:
data.head()                # give 1st five rows

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [4]:
data.info()    # gives information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB


In [5]:
data.isnull().sum()      # Checking the null values

Unnamed: 0           0
S.No                 0
USERNAME             0
Caption              6
Followers            0
Hashtags             0
Time since posted    0
Likes                0
dtype: int64

In [6]:
# Remove any non-digit characters except for '-' character
data['Time since posted'] = data['Time since posted'].str.replace(r'[^0-9-]+', '')

In [7]:
# Convert the 'Time since posted' column to numeric format
data['Time since posted'] = pd.to_numeric(data['Time since posted'], errors='coerce')

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3,30


In [9]:
data.info()     # gives information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    int64 
 7   Likes              100 non-null    int64 
dtypes: int64(5), object(3)
memory usage: 6.4+ KB


In [10]:
# Feature: Length of the caption
data['caption_length'] = data['Caption'].apply(lambda x: len(x) if isinstance(x, str) else 0)

In [11]:
# Feature: Number of hashtags
data['hashtag_count'] = data['Hashtags'].apply(lambda x: len(x.split()))

In [12]:
data.head()    # give 1st five rows

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes,caption_length,hashtag_count
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11,139,149,5
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2,23,454,19
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2,25,704,20
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3,49,4,20
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3,30,322,18


In [13]:
data1 = data.drop(['Caption', 'Hashtags'], axis=1)     # Drop the original columns

In [14]:
data1 = data1.drop(['USERNAME','S.No','Unnamed: 0'], axis=1)     # Drop unnecessary column

In [15]:
data1.head()             # give 1st five rows

Unnamed: 0,Followers,Time since posted,Likes,caption_length,hashtag_count
0,1600,11,139,149,5
1,880,2,23,454,19
2,255,2,25,704,20
3,340,3,49,4,20
4,304,3,30,322,18


In [16]:
data1.info()  # gives information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Followers          100 non-null    int64
 1   Time since posted  100 non-null    int64
 2   Likes              100 non-null    int64
 3   caption_length     100 non-null    int64
 4   hashtag_count      100 non-null    int64
dtypes: int64(5)
memory usage: 4.0 KB


In [19]:
import copy
data2 = copy.deepcopy(data1)  # copy of data

In [20]:
# Split the data into input features (X) and target variables (y)
X = data1[['caption_length', 'hashtag_count', 'Followers']]
y_likes = data1['Likes']
# y_time_since_posted = data1['Time since posted']

In [21]:
#                                    Importing nessary ML algorithms
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y_likes,test_size=0.20,random_state=42) # get train test data

In [23]:
models={
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForest': RandomForestRegressor()
}    # models inside dictionary

In [24]:
def evaluate_model(X_train,X_test,y_train,y_test,models):   # function for getting best model
    report={}
    for i in range(len(models)):
        model=list(models.values())[i]
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        score_r2=round(r2_score(y_test,y_pred)*100,2)
        report[list(models.keys())[i]]=score_r2
        best_score=max(list(report.values()))
        best_model=list(filter(lambda x:report[x]==best_score,report))[0]
    print(f'The best model is {best_model} with r2 score: {best_score}')
    return report
    

In [25]:
evaluate_model(X_train,X_test,y_train,y_test,models)  # call function

The best model is RandomForest with r2 score: 32.54


{'LinearRegression': 18.73,
 'Lasso': 18.67,
 'Ridge': 18.73,
 'ElasticNet': 18.68,
 'RandomForest': 32.54}

In [26]:
# Split the data into input features (X) and target variables (y)
X = data1[['caption_length', 'hashtag_count', 'Followers']]
# y_likes = data1['Likes']
y_time_since_posted = data1['Time since posted']

In [27]:
X_train,X_test,y_train,y_test=train_test_split(X,y_time_since_posted,test_size=0.20,random_state=42)  # get train test data

In [28]:
models={
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForest': RandomForestRegressor()
} # models inside dictionary

In [29]:
def evaluate_model(X_train,X_test,y_train,y_test,models): # function for getting best model
    report={}
    for i in range(len(models)):
        model=list(models.values())[i]
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        score_r2=round(r2_score(y_test,y_pred)*100,2)
        report[list(models.keys())[i]]=score_r2
        best_score=max(list(report.values()))
        best_model=list(filter(lambda x:report[x]==best_score,report))[0]
    print(f'The best model is {best_model} with r2 score: {best_score}')
    return report
    

In [30]:
evaluate_model(X_train,X_test,y_train,y_test,models) # call function

The best model is RandomForest with r2 score: 62.91


{'LinearRegression': 2.34,
 'Lasso': 2.2,
 'Ridge': 2.34,
 'ElasticNet': 2.2,
 'RandomForest': 62.91}