In [1]:
import pandas as pd

# Load the obesity dataset from its CSV file into a pandas DataFrame.
file_path = 'Obesity_Dataset/Obesity_Dataset.csv'

# Column names are supplied explicitly because the file is parsed with
# header=None (i.e. its first line is treated as data, not as a header).
columns = [
    "Sex", "Age", "Height", "Overweight_Obese_Family", "Consumption_of_Fast_Food",
    "Frequency_of_Consuming_Vegetables", "Number_of_Main_Meals_Daily", "Food_Intake_Between_Meals",
    "Smoking", "Liquid_Intake_Daily", "Calculation_of_Calorie_Intake", "Physical_Exercise",
    "Schedule_Dedicated_to_Technology", "Type_of_Transportation_Used", "Class"
]

# Let pandas open and parse the file itself. This avoids holding a second copy
# of the whole file as a string and avoids relying on `pd.io.common.StringIO`,
# which is an internal import of pandas rather than public API.
df = pd.read_csv(file_path, header=None, names=columns)

In [2]:
# Count how many rows fall into each value of the "Class" column
# (value_counts sorts the result by frequency, most common first).
classes = df['Class'].value_counts()
print("Klassen im Datensatz:", classes, sep="\n")

# Balance heuristic: the dataset counts as balanced only when the rarest
# class has more than half as many rows as the most common class.
rarest_count = classes.min()
most_common_count = classes.max()
is_balanced = rarest_count / most_common_count > 0.5
print("\nIst der Datensatz ausgeglichen?", is_balanced)

Klassen im Datensatz:
2    658
3    592
4    287
1     73
Name: Class, dtype: int64

Ist der Datensatz ausgeglichen? False


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# --- Configuration -----------------------------------------------------------
TEST_SIZE = 0.2   # fraction of rows held out for evaluation
SEED = 42         # fixes both the train/test split and the tree's tie-breaking

# Columns whose values are re-mapped to consecutive integer codes 0..k-1.
# "Age" and "Height" are intentionally not listed: they are used as-is.
ENCODED_COLUMNS = [
    'Sex',
    'Overweight_Obese_Family',
    'Consumption_of_Fast_Food',
    'Frequency_of_Consuming_Vegetables',
    'Number_of_Main_Meals_Daily',
    'Food_Intake_Between_Meals',
    'Smoking',
    'Liquid_Intake_Daily',
    'Calculation_of_Calorie_Intake',
    'Physical_Exercise',
    'Schedule_Dedicated_to_Technology',
    'Type_of_Transportation_Used',
    'Class',
]

# --- Encoding ----------------------------------------------------------------
# Fit one LabelEncoder per column and keep it. Re-fitting a single shared
# encoder would discard every mapping but the last one; keeping them allows
# e.g. `encoders['Class'].inverse_transform(y_pred)` to recover the original
# class values.
# NOTE: this overwrites the columns of `df` in place, so cells that read `df`
# after this one see the encoded codes, not the raw values.
encoders = {}
for column_name in ENCODED_COLUMNS:
    encoders[column_name] = LabelEncoder()
    df[column_name] = encoders[column_name].fit_transform(df[column_name])

# Kept for backward compatibility: `le` is the encoder fitted on "Class".
le = encoders['Class']

# --- Features / target -------------------------------------------------------
X = df.drop(columns='Class')  # every column except the target
y = df['Class']               # encoded target labels

# --- Train / test split ------------------------------------------------------
# NOTE(review): the split is not stratified. Given the class imbalance in this
# dataset, `stratify=y` would be worth considering; it is left unchanged here
# so that previously recorded results stay reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- Model -------------------------------------------------------------------
# Decision tree with default hyper-parameters.
clf = DecisionTreeClassifier(random_state=SEED)
clf.fit(X_train, y_train)

# --- Evaluation on the held-out test set -------------------------------------
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1. The labels shown are the encoded class
# codes, not the original values of the "Class" column.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 78.88%

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        13
           1       0.85      0.86      0.85       125
           2       0.78      0.73      0.76       131
           3       0.67      0.75      0.71        53

    accuracy                           0.79       322
   macro avg       0.79      0.80      0.79       322
weighted avg       0.79      0.79      0.79       322


In [6]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions:
# rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[ 11   2   0   0]
 [  1 107  17   0]
 [  1  14  96  20]
 [  0   3  10  40]]
