In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [28]:
df = pd.read_csv(r'D:\gpu_libraries\assienment\archive\instagram_reach.csv',index_col= 'S.No')
df.drop(columns = ['Unnamed: 0'],inplace=True)
df.head()

Unnamed: 0_level_0,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 25
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   USERNAME           100 non-null    object
 1   Caption            94 non-null     object
 2   Followers          100 non-null    int64 
 3   Hashtags           100 non-null    object
 4   Time since posted  100 non-null    object
 5   Likes              100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 5.5+ KB


In [30]:
import re
df['Hours_Since_Posted'] = df['Time since posted'].apply(lambda x: re.search(r'\d+', str(x)).group() if re.search(r'\d+', str(x)) else None)
df['Hours_Since_Posted'] = pd.to_numeric(df['Hours_Since_Posted'], errors='coerce')

print(df[['Time since posted', 'Hours_Since_Posted']])


     Time since posted  Hours_Since_Posted
S.No                                      
1             11 hours                  11
2              2 hours                   2
3              2 hours                   2
4              3 hours                   3
5              3 hours                   3
...                ...                 ...
19             3 hours                   3
21             3 hours                   3
22             3 hours                   3
24             3 hours                   3
25             3 hours                   3

[100 rows x 2 columns]


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 25
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   USERNAME            100 non-null    object
 1   Caption             94 non-null     object
 2   Followers           100 non-null    int64 
 3   Hashtags            100 non-null    object
 4   Time since posted   100 non-null    object
 5   Likes               100 non-null    int64 
 6   Hours_Since_Posted  100 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 6.2+ KB


In [32]:
df.drop(columns = ['Time since posted'],inplace = True)

# Handling Missing values in captions 

In [33]:
df['Caption'].fillna('Missing', inplace=True)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 25
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   USERNAME            100 non-null    object
 1   Caption             100 non-null    object
 2   Followers           100 non-null    int64 
 3   Hashtags            100 non-null    object
 4   Likes               100 non-null    int64 
 5   Hours_Since_Posted  100 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 5.5+ KB


In [35]:
columns1 = df.select_dtypes(include = 'object').columns
print(columns1)

Index(['USERNAME', 'Caption', 'Hashtags'], dtype='object')


In [36]:
df[columns1] = df[columns1].astype('category')

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 25
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   USERNAME            100 non-null    category
 1   Caption             100 non-null    category
 2   Followers           100 non-null    int64   
 3   Hashtags            100 non-null    category
 4   Likes               100 non-null    int64   
 5   Hours_Since_Posted  100 non-null    int64   
dtypes: category(3), int64(3)
memory usage: 11.8 KB


In [38]:
X_likes_train, X_likes_test, y_likes_train, y_likes_test= train_test_split(df.drop(columns = ['Likes'],axis = 1),
                                                                            df['Likes'],
                                                                             test_size=0.2,
                                                                              random_state=42)

X_train, X_test, y_time_train, y_time_test = train_test_split(df.drop(columns = ['Hours_Since_Posted'],axis = 1),
                                                                df['Hours_Since_Posted'], 
                                                                test_size=0.2, 
                                                                random_state=42)

In [40]:
from sklearn.preprocessing import OneHotEncoder
encoder1 = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_likes_train= encoder1.fit_transform(X_likes_train)
X_likes_test= encoder1.transform(X_likes_test)



In [41]:
encoder2 = OneHotEncoder(sparse=False, handle_unknown='ignore')
x_time_train = encoder2.fit_transform(X_train)
x_time_test= encoder2.transform(X_test)



In [43]:
# Create and train a RandomForestRegressor for predicting likes
model_likes = RandomForestRegressor()
model_likes.fit(X_likes_train, y_likes_train)

In [44]:
# Create and train a RandomForestRegressor for predicting time since posted
model_time = RandomForestRegressor()
model_time.fit(x_time_train, y_time_train)

In [45]:
likes_predictions = model_likes.predict(X_likes_test)
time_predictions = model_time.predict(x_time_test)

In [46]:
likes_mse = mean_squared_error(y_likes_test, likes_predictions)
time_mse = mean_squared_error(y_time_test, time_predictions)
print("Likes MSE:", likes_mse)
print("Time MSE:", time_mse)

Likes MSE: 1639.564185
Time MSE: 15.43492
