In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
# Load the Instagram reach dataset; the path is relative to the notebook's working directory.
df1 = pd.read_csv("Datasets/instagram_reach.csv")
# Peek at one randomly chosen row (no seed is passed, so the row differs between runs).
df1.sample()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
72,10,25,kamalalshehabi,,955,#qoute #success#motivation #entrepreneur #insp...,2 hours,20


In [3]:
# reset_index returns a new frame; without assigning it back the call has no effect.
df1 = df1.reset_index(drop=True)
# "Time since posted" values look like "2 hours": keep only the leading integer.
# NOTE(review): assumes every row uses the same unit — confirm against the raw data.
df1["Time"] = df1["Time since posted"].apply(lambda x: int(x.split(" ")[0]))
# Drop the columns that are not used as features or targets. Assigning the result
# (instead of inplace=True) keeps the statement free of hidden in-place mutation.
df1 = df1.drop(columns=["Unnamed: 0","S.No", "Caption", "USERNAME","Time since posted"])

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Followers  100 non-null    int64 
 1   Hashtags   100 non-null    object
 2   Likes      100 non-null    int64 
 3   Time       100 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 3.2+ KB


In [5]:
df1.describe()

Unnamed: 0,Followers,Likes,Time
count,100.0,100.0,100.0
mean,961.96,46.48,3.46
std,1014.62567,55.08698,3.394648
min,11.0,8.0,2.0
25%,252.75,19.0,2.0
50%,612.0,29.0,2.0
75%,1197.0,46.0,3.0
max,4496.0,349.0,24.0


In [6]:
print(df1.loc[:0]["Hashtags"])

0    #MachineLearning #AI #DataAnalytics #DataScien...
Name: Hashtags, dtype: object


In [7]:
def convert_hastag(text):
    """Split a raw hashtag string into a list of lowercase alphabetic tokens.

    Every character outside ``a-z`` / ``A-Z`` (``#``, digits, punctuation,
    whitespace) acts as a separator, so ``"#AI #Data2020"`` becomes
    ``["ai", "data"]``. No stemming, lemmatisation or stop-word removal is
    applied.

    Parameters
    ----------
    text : str
        Raw hashtag text of one post. Non-string values (for example ``NaN``
        from a missing cell) are treated as "no hashtags".

    Returns
    -------
    list of str
        Lowercased tokens in their original order; duplicates are kept.
    """
    # Missing cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return []

    # Replace every non-letter with a space, then split on whitespace runs.
    return re.sub("[^a-zA-Z]", " ", text).lower().split()

convert_hastag(df1["Hashtags"][0])

['machinelearning', 'ai', 'dataanalytics', 'datascienc', 'datalake']

In [8]:
# NOTE(review): `hastag` is created empty and is not referenced by any visible cell — confirm whether it is still needed.
hastag = pd.DataFrame()
# Replace each raw hashtag string with its token list from convert_hastag.
# The column is overwritten, so re-running this cell would pass lists (not strings) to convert_hastag.
df1["Hashtags"]= df1["Hashtags"].apply(convert_hastag)

In [9]:
print(df1[["Hashtags"]])

                                             Hashtags
0   [machinelearning, ai, dataanalytics, datascien...
1   [deck, mac, macintosh, sayhello, apple, stevej...
2   [whoiswho, aitrading, ai, aitradingteam, insta...
3   [iot, cre, workplace, cdo, bigdata, technology...
4   [instamachinelearning, instabigdata, instamark...
..                                                ...
95  [beverlyhills, realestate, losangelesrealestat...
96  [workspace, work, developer, development, deve...
97  [books, book, motivation, inspiration, life, b...
98  [heavyequipment, underconstruction, dozer, rea...
99  [marketing, programming, development, desarrol...

[100 rows x 1 columns]


In [10]:
# Merge the per-post token lists into a single set of distinct hashtags.
unique_hashtag = set().union(*df1["Hashtags"])
# Number of distinct hashtags across all posts.
len(unique_hashtag)

1155

In [11]:
# Create one 0-filled indicator column per distinct hashtag.
# Sorting makes the column order deterministic (set iteration order is arbitrary),
# and building the block once avoids inserting ~1k columns one at a time, which
# fragments the frame and triggers pandas' PerformanceWarning.
hashtag_columns = sorted(unique_hashtag)
hashtag_flags = pd.DataFrame(0, index=df1.index, columns=hashtag_columns)
# Dropping any indicator columns left over from a previous run keeps the cell re-runnable.
df1 = pd.concat(
    [df1.drop(columns=hashtag_columns, errors="ignore"), hashtag_flags],
    axis=1,
)


In [12]:
# Mark each post's hashtags with 1 in the matching indicator column.
# Iterate over the real index labels: `.at` is label-based, so using an
# enumerate() position would write to the wrong row (or append a new one)
# whenever the index is not exactly 0..n-1.
for index, row in df1["Hashtags"].items():
    for tag in row:  # every token present in this post
        df1.at[index, tag] = 1


In [13]:
# The token lists are now represented by the indicator columns, so remove the list column.
df1 = df1.drop(columns="Hashtags")

In [14]:
# Feature matrix: every column except the two regression targets.
x1 = df1.drop(["Likes", "Time"], axis="columns")

In [15]:
# Target frame with both regression targets, in this column order.
y1 = df1.loc[:, ["Likes", "Time"]]

In [16]:
# Hold out 10% of the posts for evaluation. A fixed random_state makes the
# split, and therefore every metric computed from it, reproducible after a kernel restart.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.1, random_state=11
)

In [17]:
print(f"X_train shape = {x1_train.shape}")
print(f"X_test shape = {x1_test.shape}")
print(f"y_train shape = {y1_train.shape}")
print(f"y_test shape = {y1_test.shape}")

X_train shape = (90, 1156)
X_test shape = (10, 1156)
y_train shape = (90, 2)
y_test shape = (10, 2)


In [18]:
y1_test

Unnamed: 0,Likes,Time
9,18,7
60,16,2
57,10,2
18,30,4
96,42,3
35,13,2
44,16,2
20,198,5
94,24,3
41,8,2


In [19]:
# Seed the forest so bootstrap sampling and feature selection, and hence the
# reported scores, are reproducible between runs.
rf = RandomForestRegressor(random_state=11)

In [20]:
# Fit a single forest on both target columns at once (y1_train holds "Likes" and "Time").
rf.fit(x1_train, y1_train)

RandomForestRegressor()

In [21]:
y1_pred = rf.predict(x1_test)
y1_pred

array([[43.89 ,  3.96 ],
       [34.12 ,  3.74 ],
       [36.66 ,  3.29 ],
       [18.53 ,  2.9  ],
       [41.45 ,  3.54 ],
       [23.695,  2.275],
       [22.83 ,  2.77 ],
       [37.55 ,  3.37 ],
       [21.62 ,  2.88 ],
       [20.82 ,  3.06 ]])

In [22]:
# Mean squared error of the predictions on the held-out rows.
# NOTE(review): y1_test and y1_pred each have two columns (Likes, Time), so this single
# number pools both targets — presumably via scikit-learn's default uniform average over
# outputs; confirm, and consider multioutput="raw_values" to see each target separately,
# since the two targets are on very different scales.
mse1 = mean_squared_error(y1_test, y1_pred)
mse1

1396.8197524999998

In [23]:
rmse1 = np.sqrt(mse1)
rmse1

37.37405186088337

In [24]:
# R^2 of the predictions on the held-out rows.
# NOTE(review): as with the MSE, both targets (Likes, Time) are passed together, so this is
# presumably an average over the two outputs — confirm, and consider per-target scores.
r2score1 = r2_score(y1_test, y1_pred)
r2score1

0.13900848159262064

In [25]:
# Load the obesity dataset; the path is relative to the notebook's working directory.
# NOTE(review): this uses "DataSets/" while the Instagram load uses "Datasets/" — on a
# case-sensitive filesystem only one of the two can exist; confirm the real directory name.
df2 = pd.read_csv("DataSets/ObesityDataSet_raw_and_data_sinthetic.csv")

In [26]:
df2.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [27]:
df2.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [28]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [29]:
df2.sample()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
1438,Female,40.789529,1.549748,80.0,yes,yes,2.0,1.099151,Sometimes,no,1.687611,no,1.874662,0.0,Sometimes,Automobile,Obesity_Type_I


In [30]:
df2["SCC"].unique()

array(['no', 'yes'], dtype=object)

In [31]:
# Binary-encode gender: "Male" -> 1, anything else -> 0.
df2["Gender"] = df2["Gender"].eq("Male").astype("int64")

In [32]:
# Binary-encode the smoking flag: "yes" -> 1, anything else -> 0.
df2["SMOKE"] = df2["SMOKE"].eq("yes").astype("int64")

In [33]:
# Binary-encode the family-history flag: "yes" -> 1, anything else -> 0.
df2["family_history_with_overweight"] = (
    df2["family_history_with_overweight"].eq("yes").astype("int64")
)

In [34]:
# Binary-encode the FAVC flag: "yes" -> 1, anything else -> 0.
df2["FAVC"] = df2["FAVC"].eq("yes").astype("int64")

In [35]:
# Binary-encode the SCC flag: "yes" -> 1, anything else -> 0.
df2["SCC"] = df2["SCC"].eq("yes").astype("int64")

In [36]:
df2

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.000000,1.620000,64.000000,1,0,2.0,3.0,Sometimes,0,2.000000,0,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,0,21.000000,1.520000,56.000000,1,0,3.0,3.0,Sometimes,1,3.000000,1,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,1,23.000000,1.800000,77.000000,1,0,2.0,3.0,Sometimes,0,2.000000,0,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,1,27.000000,1.800000,87.000000,0,0,3.0,3.0,Sometimes,0,2.000000,0,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,1,22.000000,1.780000,89.800000,0,0,2.0,1.0,Sometimes,0,2.000000,0,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,20.976842,1.710730,131.408528,1,1,3.0,3.0,Sometimes,0,1.728139,0,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,0,21.982942,1.748584,133.742943,1,1,3.0,3.0,Sometimes,0,2.005130,0,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,0,22.524036,1.752206,133.689352,1,1,3.0,3.0,Sometimes,0,2.054193,0,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,0,24.361936,1.739450,133.346641,1,1,3.0,3.0,Sometimes,0,2.852339,0,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [42]:
df2["MTRANS"].unique()

array(['Public_Transportation', 'Walking', 'Automobile', 'Motorbike',
       'Bike'], dtype=object)

In [38]:
# One shared encoder instance. The following cells call fit_transform on it once per
# column (CAEC, CALC, MTRANS, NObeyesdad), so after they run it only holds the last fit.
label_encoder = LabelEncoder()

In [39]:
# Refit the shared encoder on CAEC and replace the labels with their integer codes.
df2["CAEC"] = label_encoder.fit(df2["CAEC"]).transform(df2["CAEC"])

In [40]:
# Refit the shared encoder on CALC and replace the labels with their integer codes.
df2["CALC"] = label_encoder.fit(df2["CALC"]).transform(df2["CALC"])

In [43]:
# Refit the shared encoder on MTRANS and replace the labels with their integer codes.
df2["MTRANS"] = label_encoder.fit(df2["MTRANS"]).transform(df2["MTRANS"])

In [45]:
# Refit the shared encoder on NObeyesdad and replace the labels with their integer codes.
# This is the last fit, so label_encoder keeps this column's classes afterwards.
df2["NObeyesdad"] = label_encoder.fit(df2["NObeyesdad"]).transform(df2["NObeyesdad"])

In [46]:
df2.sample(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
1363,1,31.335093,1.665798,89.738596,1,1,2.274164,1.049534,2,0,1.358172,0,1.482411,0.0,2,0,2
1593,1,23.47007,1.842906,121.142535,1,1,3.0,2.701689,2,0,2.738485,0,0.992253,0.0,2,3,3
2004,0,26.0,1.633945,111.9307,1,1,3.0,3.0,2,0,2.682804,0,0.0,0.15171,2,3,4
960,0,17.992717,1.618683,67.193585,1,1,1.952987,1.0,2,0,1.334856,1,0.732276,1.890214,3,3,5
644,1,17.580627,1.770324,55.695253,1,1,2.0,4.0,2,0,2.369627,0,2.0,1.612466,3,0,0


In [48]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   int64  
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   int64  
 5   FAVC                            2111 non-null   int64  
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   int32  
 9   SMOKE                           2111 non-null   int64  
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   int64  
 12  FAF                             21