# Logistic Regression (Regression and Classification)

Connect to Google Drive

In [2]:
# Attach Google Drive to the Colab runtime so the dataset file can be read from it
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Load the dataset

In [3]:
import pandas as pd
# Read the obesity dataset from the mounted Drive folder into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/AI/Project 1/dataset/obesity_data.csv',
)

In [4]:
# Preview the loaded frame (the rendered table is truncated to the first and last rows)
df

Unnamed: 0,Age,Gender,Height,Weight,BMI,PhysicalActivityLevel,ObesityCategory
0,56,Male,173.575262,71.982051,23.891783,4,Normal weight
1,69,Male,164.127306,89.959256,33.395209,2,Obese
2,46,Female,168.072202,72.930629,25.817737,4,Overweight
3,32,Male,168.459633,84.886912,29.912247,3,Overweight
4,60,Male,183.568568,69.038945,20.487903,3,Normal weight
...,...,...,...,...,...,...,...
995,18,Male,155.588674,64.103182,26.480345,4,Overweight
996,35,Female,165.076490,97.639771,35.830783,1,Obese
997,49,Female,156.570956,78.804284,32.146036,1,Obese
998,64,Male,164.192222,57.978115,21.505965,4,Normal weight


Encode categorical variables

In [5]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

Build the Encoding Preprocessor

In [6]:
# Column-wise encoder:
#   * 'Gender'          -> one indicator column per category (one-hot)
#   * 'ObesityCategory' -> a single numeric code per category (ordinal)
#   * every other column is passed through unchanged
preprocessor = ColumnTransformer(
    [
        ('gender_onehot', OneHotEncoder(), ['Gender']),
        ('obesity_ordinal', OrdinalEncoder(), ['ObesityCategory']),
    ],
    remainder='passthrough',
)


Develop the Pipeline

In [7]:
# Single-step pipeline whose only stage is the column encoder defined above
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [8]:
# Fit the encoders on the full frame and keep the encoded result
# (it is wrapped in a labelled DataFrame in a later cell)
df_transformed = pipeline.fit_transform(X=df)

In [9]:
# Output column labels of the fitted one-hot encoder for the 'Gender' input
gender_categories = (
    pipeline.named_steps['preprocessor']
    .named_transformers_['gender_onehot']
    .get_feature_names_out(['Gender'])
)
# The ordinal encoder emits a single column, so it keeps the original column name
obesity_categories = ['ObesityCategory']

In [10]:
# Take the column labels from the fitted ColumnTransformer itself instead of
# re-assembling them by hand: a hand-built list silently depends on the order in
# which the transformers are declared and would mislabel columns if that list
# ever changed.
# get_feature_names_out() prefixes each label with "<transformer name>__"
# (e.g. "gender_onehot__Gender_Female", "remainder__Age"); drop that prefix to
# keep the plain names used by the rest of the notebook.
new_column_names = [
    feature_name.split('__', 1)[-1]
    for feature_name in pipeline.named_steps['preprocessor'].get_feature_names_out()
]

# There must be exactly one label per encoded column.
assert len(new_column_names) == df_transformed.shape[1]

In [11]:
# Wrap the encoded values in a DataFrame labelled with the rebuilt column names, then show it
df_transformed = pd.DataFrame(data=df_transformed, columns=new_column_names)
df_transformed

Unnamed: 0,Gender_Female,Gender_Male,ObesityCategory,Age,Height,Weight,BMI,PhysicalActivityLevel
0,0.0,1.0,0.0,56.0,173.575262,71.982051,23.891783,4.0
1,0.0,1.0,1.0,69.0,164.127306,89.959256,33.395209,2.0
2,1.0,0.0,2.0,46.0,168.072202,72.930629,25.817737,4.0
3,0.0,1.0,2.0,32.0,168.459633,84.886912,29.912247,3.0
4,0.0,1.0,0.0,60.0,183.568568,69.038945,20.487903,3.0
...,...,...,...,...,...,...,...,...
995,0.0,1.0,2.0,18.0,155.588674,64.103182,26.480345,4.0
996,1.0,0.0,1.0,35.0,165.076490,97.639771,35.830783,1.0
997,1.0,0.0,1.0,49.0,156.570956,78.804284,32.146036,1.0
998,0.0,1.0,0.0,64.0,164.192222,57.978115,21.505965,4.0


In [12]:
# Show the original (un-encoded) frame again; its content is the same as in the first preview
df

Unnamed: 0,Age,Gender,Height,Weight,BMI,PhysicalActivityLevel,ObesityCategory
0,56,Male,173.575262,71.982051,23.891783,4,Normal weight
1,69,Male,164.127306,89.959256,33.395209,2,Obese
2,46,Female,168.072202,72.930629,25.817737,4,Overweight
3,32,Male,168.459633,84.886912,29.912247,3,Overweight
4,60,Male,183.568568,69.038945,20.487903,3,Normal weight
...,...,...,...,...,...,...,...
995,18,Male,155.588674,64.103182,26.480345,4,Overweight
996,35,Female,165.076490,97.639771,35.830783,1,Obese
997,49,Female,156.570956,78.804284,32.146036,1,Obese
998,64,Male,164.192222,57.978115,21.505965,4,Normal weight


In [13]:
# --- Gender: raw categories vs. the indicator columns created for them ---
original_gender_values = df['Gender'].unique()
transformed_gender_values = df_transformed.filter(like='Gender_').columns
print("Original 'Gender' values:", original_gender_values)
print("Transformed 'Gender' values:", transformed_gender_values)

# --- ObesityCategory: raw labels vs. the numeric codes that replaced them ---
original_obesity_values = df['ObesityCategory'].unique()
transformed_obesity_values = df_transformed['ObesityCategory'].unique()
print("\nOriginal 'ObesityCategory' values:", original_obesity_values)
print("Transformed 'ObesityCategory' values:", transformed_obesity_values)

Original 'Gender' values: ['Male' 'Female']
Transformed 'Gender' values: Index(['Gender_Female', 'Gender_Male'], dtype='object')

Original 'ObesityCategory' values: ['Normal weight' 'Obese' 'Overweight' 'Underweight']
Transformed 'ObesityCategory' values: [0. 1. 2. 3.]


In [20]:
# Re-display the encoded frame (same content as the earlier preview of df_transformed)
df_transformed

Unnamed: 0,Gender_Female,Gender_Male,ObesityCategory,Age,Height,Weight,BMI,PhysicalActivityLevel
0,0.0,1.0,0.0,56.0,173.575262,71.982051,23.891783,4.0
1,0.0,1.0,1.0,69.0,164.127306,89.959256,33.395209,2.0
2,1.0,0.0,2.0,46.0,168.072202,72.930629,25.817737,4.0
3,0.0,1.0,2.0,32.0,168.459633,84.886912,29.912247,3.0
4,0.0,1.0,0.0,60.0,183.568568,69.038945,20.487903,3.0
...,...,...,...,...,...,...,...,...
995,0.0,1.0,2.0,18.0,155.588674,64.103182,26.480345,4.0
996,1.0,0.0,1.0,35.0,165.076490,97.639771,35.830783,1.0
997,1.0,0.0,1.0,49.0,156.570956,78.804284,32.146036,1.0
998,0.0,1.0,0.0,64.0,164.192222,57.978115,21.505965,4.0


In [14]:
from sklearn.model_selection import train_test_split

# Target is the encoded obesity category; features are every other encoded column
y = df_transformed['ObesityCategory']
X = df_transformed.drop(columns=['ObesityCategory'])

# First hold out 40 % of the rows, then cut that hold-out in half:
# 60 % train / 20 % validation / 20 % test, with a fixed seed for repeatability
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

Implement Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [16]:
# Fit a logistic-regression classifier on the training split.
# NOTE(review): max_iter is raised to 5000, presumably so the solver converges
# on the unscaled features - confirm.
logreg_model = LogisticRegression(max_iter=5000)
logreg_model.fit(X=X_train, y=y_train)

Loss Function => Log Loss

In [17]:
from sklearn.model_selection import cross_val_score

# Class labels in the same order as the columns returned by predict_proba.
# They are passed to log_loss explicitly: without `labels`, log_loss infers the
# classes from y_true alone, so a split that happens to miss one class would no
# longer line up with the probability columns.
class_labels = logreg_model.classes_

# Log loss of the fitted model on the validation split
y_val_pred_proba = logreg_model.predict_proba(X_val)
log_loss_val = log_loss(y_val, y_val_pred_proba, labels=class_labels)
print("Log Loss on Validation Set:", log_loss_val)

# Log loss of the fitted model on the test split
y_test_pred_proba = logreg_model.predict_proba(X_test)
log_loss_test = log_loss(y_test, y_test_pred_proba, labels=class_labels)
print("Log Loss on Test Set:", log_loss_test)

# 5-fold cross-validated log loss. The 'neg_log_loss' scorer returns the negated
# loss, so the sign is flipped back before reporting.
# NOTE(review): this runs on all of X / y, i.e. it also includes the rows held
# out above for validation and test.
cv_scores = cross_val_score(logreg_model, X, y, cv=5, scoring='neg_log_loss')
average_cv_score = -cv_scores.mean()
print("Average Log Loss with Cross-Validation:", average_cv_score)

Log Loss on Validation Set: 0.06622219124687449
Log Loss on Test Set: 0.05947385009644373
Average Log Loss with Cross-Validation: 0.04985626350316963


In [24]:
# One hand-written sample, keyed by the feature column names the model was trained on
values = dict(
    Gender_Female=1.0,
    Gender_Male=0.0,
    Age=37.0,
    Height=168.451241,
    Weight=78.641231,
    BMI=27.516212,
    PhysicalActivityLevel=3.0,
)

# Single-row frame so the sample can be handed to the model's predict method
df_for_test = pd.DataFrame(data=[values])

In [27]:
# Classify the hand-built sample; the printed value is the numeric code of the
# predicted obesity category, not its text label
predictions = logreg_model.predict(X=df_for_test)
print("Obesity Category value: ", predictions)

Obesity Category value:  [2.]
