In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

In [2]:
df1 = pd.read_csv("Datasets/instagram_reach.csv")
df1.sample()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
28,1,7,philosophercoin,Check my profile to find out about more fundam...,279,#Bitcoin #Cryptocurrency#Ethereum #stockmarket...,4 hours,43


In [3]:
df1.reset_index(drop=True)
df1["Time"] = df1["Time since posted"].apply(lambda x: int(x.split(" ")[0]))
df1.drop(columns=["Unnamed: 0","S.No", "Caption", "USERNAME","Time since posted"], inplace=True)

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Followers  100 non-null    int64 
 1   Hashtags   100 non-null    object
 2   Likes      100 non-null    int64 
 3   Time       100 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 3.2+ KB


In [5]:
df1.describe()

Unnamed: 0,Followers,Likes,Time
count,100.0,100.0,100.0
mean,961.96,46.48,3.46
std,1014.62567,55.08698,3.394648
min,11.0,8.0,2.0
25%,252.75,19.0,2.0
50%,612.0,29.0,2.0
75%,1197.0,46.0,3.0
max,4496.0,349.0,24.0


In [6]:
print(df1.loc[:0]["Hashtags"])

0    #MachineLearning #AI #DataAnalytics #DataScien...
Name: Hashtags, dtype: object


In [7]:
def convert_hastag(text):
    ps = PorterStemmer()
    wordnet = WordNetLemmatizer()
    
    tags = nltk.sent_tokenize(text)
    corpus = []
    tag = re.sub("[^a-zA-Z]", " ", text)
    tag = tag.lower()
    tag = tag.split()
#     tag = [wordnet.lemmatize(word) for word in tag if not word in set(stopwords.words("english"))]
    
    return tag

convert_hastag(df1["Hashtags"][0])

['machinelearning', 'ai', 'dataanalytics', 'datascienc', 'datalake']

In [8]:
hastag = pd.DataFrame()
df1["Hashtags"]= df1["Hashtags"].apply(convert_hastag)

In [9]:
print(df1[["Hashtags"]])

                                             Hashtags
0   [machinelearning, ai, dataanalytics, datascien...
1   [deck, mac, macintosh, sayhello, apple, stevej...
2   [whoiswho, aitrading, ai, aitradingteam, insta...
3   [iot, cre, workplace, cdo, bigdata, technology...
4   [instamachinelearning, instabigdata, instamark...
..                                                ...
95  [beverlyhills, realestate, losangelesrealestat...
96  [workspace, work, developer, development, deve...
97  [books, book, motivation, inspiration, life, b...
98  [heavyequipment, underconstruction, dozer, rea...
99  [marketing, programming, development, desarrol...

[100 rows x 1 columns]


In [10]:
unique_hashtag = set(tag for row in df1["Hashtags"] for tag in row)
len(unique_hashtag)

1155

In [11]:
for hashtag in unique_hashtag:
    df1[hashtag] = 0 # initialize 0 value for every hashtag


In [12]:
for index, row in enumerate(df1["Hashtags"]): # enumerate hashtags 
    for tag in row: # Row wise hashtag 
        df1.at[index, tag] = 1 # appply 1 value when index and tag match 


In [13]:
df1.drop(columns=["Hashtags"], inplace=True)

In [14]:
x1 = df1.drop(columns=["Likes", "Time"])

In [15]:
y1 = df1[["Likes", "Time"]]

In [16]:
# x1_train, x1_test, y_like_train, y_like_test, y_time_train, y_time_test = train_test_split(x1, y_like, y_time, test_size=0.2, random_state=11)
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.1)

In [17]:
print(f"X_train shape = {x1_train.shape}")
print(f"X_test shape = {x1_test.shape}")
print(f"y_train shape = {y1_train.shape}")
print(f"y_test shape = {y1_test.shape}")

X_train shape = (90, 1156)
X_test shape = (10, 1156)
y_train shape = (90, 2)
y_test shape = (10, 2)


In [18]:
y1_test

Unnamed: 0,Likes,Time
30,90,2
49,20,4
62,27,2
96,42,3
3,49,3
64,19,2
2,25,2
45,31,2
36,93,2
23,11,2


In [19]:
rf = RandomForestRegressor()

In [20]:
rf.fit(x1_train, y1_train)

RandomForestRegressor()

In [21]:
y1_pred = rf.predict(x1_test)
y1_pred

array([[31.79      ,  2.45333333],
       [54.466     ,  3.75533333],
       [29.32      ,  2.43      ],
       [38.81      ,  3.38      ],
       [26.73      ,  3.7       ],
       [20.84      ,  2.46      ],
       [23.17      ,  2.98666667],
       [16.9       ,  2.47      ],
       [28.376     ,  2.34533333],
       [33.95      ,  3.94      ]])

In [22]:
mse1 = mean_squared_error(y1_test, y1_pred)
mse1

500.1351285555555

In [23]:
rmse1 = np.sqrt(mse1)
rmse1

22.363701137234766

In [24]:
r2score1 = r2_score(y1_test, y1_pred)
r2score1

-0.38661649466063086

In [25]:
df2 = pd.read_csv("DataSets/ObesityDataSet_raw_and_data_sinthetic.csv")

In [26]:
df2.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [27]:
df2.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [28]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [66]:
df2.sample(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
726,0,20,1,41,0,1,2.673638,2.779379,1,0,1.249074,0,0.043412,0.403694,2,3,0
67,1,23,1,95,1,1,2.0,3.0,0,0,2.0,0,0.0,1.0,1,0,2
236,0,21,1,59,0,1,1.0,3.0,0,0,2.0,1,3.0,0.0,3,0,1
1730,1,30,1,129,1,1,2.497548,3.0,2,0,1.362583,0,1.144076,0.173232,2,3,3
1690,1,25,1,120,1,1,3.0,3.0,2,0,3.0,0,1.467863,0.343635,2,3,3


In [30]:
df2["SCC"].unique()

array(['no', 'yes'], dtype=object)

In [62]:
df2[["Age","Height", "Weight"]] = df2[["Age","Height", "Weight"]].astype(int)

In [31]:
df2["Gender"] = df2["Gender"].apply(lambda x : 1 if x == "Male" else 0)

In [32]:
df2["SMOKE"] = df2["SMOKE"].apply(lambda x : 1 if x == "yes" else 0)

In [33]:
df2["family_history_with_overweight"] = df2["family_history_with_overweight"].apply(lambda x : 1 if x == "yes" else 0)

In [34]:
df2["FAVC"] = df2["FAVC"].apply(lambda x : 1 if x == "yes" else 0)

In [35]:
df2["SCC"] = df2["SCC"].apply(lambda x : 1 if x == "yes" else 0)

In [53]:
df2

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21,1,64,1,0,2.0,3.0,2,0,2.000000,0,0.000000,1.000000,3,3,1
1,0,21,1,56,1,0,3.0,3.0,2,1,3.000000,1,3.000000,0.000000,2,3,1
2,1,23,1,77,1,0,2.0,3.0,2,0,2.000000,0,2.000000,1.000000,1,3,1
3,1,27,1,87,0,0,3.0,3.0,2,0,2.000000,0,2.000000,0.000000,1,4,5
4,1,22,1,89,0,0,2.0,1.0,2,0,2.000000,0,0.000000,0.000000,2,3,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,20,1,131,1,1,3.0,3.0,2,0,1.728139,0,1.676269,0.906247,2,3,4
2107,0,21,1,133,1,1,3.0,3.0,2,0,2.005130,0,1.341390,0.599270,2,3,4
2108,0,22,1,133,1,1,3.0,3.0,2,0,2.054193,0,1.414209,0.646288,2,3,4
2109,0,24,1,133,1,1,3.0,3.0,2,0,2.852339,0,1.139107,0.586035,2,3,4


In [37]:
df2["MTRANS"].unique()

array(['Public_Transportation', 'Walking', 'Automobile', 'Motorbike',
       'Bike'], dtype=object)

In [38]:
label_encoder = LabelEncoder()

In [39]:
df2["CAEC"] =label_encoder.fit_transform(df2["CAEC"])

In [40]:
df2["CALC"] = label_encoder.fit_transform(df2["CALC"])

In [41]:
df2["MTRANS"] = label_encoder.fit_transform(df2["MTRANS"])

In [42]:
df2["NObeyesdad"] = label_encoder.fit_transform(df2["NObeyesdad"])

In [43]:
df2.sample(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
97,0,21.0,1.52,42.0,0,0,3.0,1.0,1,0,1.0,0,0.0,0.0,2,3,0
673,0,18.874591,1.533609,41.669346,0,1,2.762325,1.163666,2,0,1.30491,0,0.25289,1.001405,2,3,0
408,1,20.0,1.81,79.0,1,0,3.0,1.0,2,0,2.0,0,0.0,0.0,2,3,1
280,1,21.0,1.75,62.0,0,1,3.0,4.0,1,1,2.0,0,0.0,0.0,2,3,1
598,1,18.0,1.70653,51.121749,1,1,2.0,3.0,2,0,2.0,0,0.0,1.329237,2,3,0


In [44]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   int64  
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   int64  
 5   FAVC                            2111 non-null   int64  
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   int32  
 9   SMOKE                           2111 non-null   int64  
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   int64  
 12  FAF                             21

In [56]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Col1': [1.0, 2.5, 3.2],
    'Col2': [4.7, 5.2, 6.0],
    'Col3': [7.3, 8.1, 9.6]
}
df = pd.DataFrame(data)

# Specify the columns to convert
columns_to_convert = ['Col1', 'Col2', 'Col3']

# Convert the specified columns from float to int
df[columns_to_convert] = df[columns_to_convert].astype(int)

# Print the updated DataFrame with converted data types
print(df.dtypes)
print(df)


Col1    int32
Col2    int32
Col3    int32
dtype: object
   Col1  Col2  Col3
0     1     4     7
1     2     5     8
2     3     6     9
