In [20]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [21]:
# Load the training split; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r'data/train.csv')

In [22]:
# Preview the first five rows to check the columns and raw value formats.
df.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [23]:
# Age is stored as an integer; astype(int) drops the fractional part.
df['Age'] = df['Age'].astype(int)

# Keep two decimal places for these measurement columns.
for column in ('Height', 'Weight', 'CH2O'):
    df[column] = df[column].round(2)

# Round these columns to the nearest whole number and store them as ints.
for column in ('FCVC', 'NCP', 'TUE', 'FAF'):
    df[column] = df[column].round().astype(int)

In [24]:
# Count each SMOKE category once; .get(..., 0) keeps the cell working even if
# one of the two categories is absent from the data.
smoke_counts = df['SMOKE'].value_counts()
num_non_smoke = smoke_counts.get('no', 0)
num_smoke = smoke_counts.get('yes', 0)

# The first line reports the 'no' category, so it is labelled Non-Smokers
# (it was previously printed under the same "Smokers" label as the line below).
print(f"Num of Non-Smokers: {num_non_smoke}")
print(f"Num of Smokers: {num_smoke}")

Num of Smokers: 20513
Num of Smokers: 245


In [25]:
# Tally the Gender column once and read both categories from the same result.
gender_counts = df['Gender'].value_counts()
num_male = gender_counts['Male']
num_female = gender_counts['Female']

print(f"Num of Male: {num_male}")
print(f"Num of Female: {num_female}")

Num of Male: 10336
Num of Female: 10422


In [26]:
# Histogram of Weight over hand-picked, unequal-width bins, drawn as a bar
# chart so that every bar can carry its share of the rows as a label.
bins = [0, 20, 50, 70, 90, 120, 160]

# NOTE(review): np.histogram does not count values outside [bins[0], bins[-1]],
# while the percentages below divide by the full row count. If any Weight
# exceeds 160 the labels will not sum to 100% - confirm the column's range.
freq, _ = np.histogram(df['Weight'], bins=bins)

bin_widths = np.diff(bins)
bin_centers = bin_widths * 0.5 + bins[:-1]

# Percentage of all rows that fall into each bin, e.g. "12.34%".
total_rows = df.shape[0]
percent_labels = [
    "{}%".format(round(int(count) * 100 / total_rows, 2)) for count in freq
]

fig = go.Figure(
    go.Bar(
        x=bin_centers,
        y=freq,
        width=0.9 * bin_widths,
        marker=dict(color='blue'),
        # Labels live on the trace itself, so textposition actually applies
        # and no manually positioned annotations are needed.
        text=percent_labels,
        textposition='outside',
        textfont=dict(color='black', size=12),
        cliponaxis=False,  # do not clip the label above the tallest bar
    )
)

fig.update_layout(
    title='Weight Distribution',
    # Ticks sit on the bin edges; reuse `bins` so the two cannot drift apart.
    xaxis=dict(title='Weight', tickvals=bins),
    yaxis=dict(title='Frequency'),
    bargap=0.05,
)

fig.show()

- Vegetable consumption versus weight and high-calorie food

In [27]:
# Mean Weight per FCVC level, with FCVC kept as a regular column.
grouped_data = df.groupby('FCVC', as_index=False)['Weight'].mean()
grouped_data

Unnamed: 0,FCVC,Weight
0,1,80.977669
1,2,82.052683
2,3,94.135144


In [31]:
# Average weight for each vegetable-consumption level (FCVC).
grouped_data = df.groupby('FCVC')['Weight'].mean().reset_index()

# Share of rows in each FCVC level. The earlier version divided the *mean
# weight* by the row count, which is not a percentage of anything; here the
# numerator is the number of rows in the group.
total_count = df['FCVC'].count()
group_sizes = df['FCVC'].value_counts()
grouped_data['Percentage'] = (
    grouped_data['FCVC'].map(group_sizes) / total_count * 100
)

# Bar height is the average weight; the label above each bar says how much of
# the data set that bar represents.
fig = go.Figure(data=[go.Bar(
    x=grouped_data['FCVC'],
    y=grouped_data['Weight'],
    text=grouped_data['Percentage'].round(2).astype(str) + '% of rows',
    textposition='outside'
)])

# Add labels and title
fig.update_layout(
    title='Average Weight by Vegetable Consumption Frequency',
    xaxis=dict(title='Frequency of Consuming Vegetables'),
    yaxis=dict(title='Average Weight')
)

# Show plot
fig.show()

- Relationship between physical activity and transportation

In [34]:
# Number of rows for every observed (FAF, MTRANS) pair, as a flat table.
pair_sizes = df.groupby(['FAF', 'MTRANS']).size()
grouped_data = pair_sizes.rename('count').reset_index()
grouped_data

Unnamed: 0,FAF,MTRANS,count
0,0,Automobile,1150
1,0,Bike,6
2,0,Motorbike,11
3,0,Public_Transportation,5682
4,0,Walking,72
5,1,Automobile,1439
6,1,Bike,13
7,1,Motorbike,16
8,1,Public_Transportation,6660
9,1,Walking,136


In [36]:
# Cross-tabulate row counts: one row per FAF level, one column per MTRANS mode.
grouped_data = (
    df.groupby(['FAF', 'MTRANS'])
    .size()
    .unstack(fill_value=0)
)

# One bar trace per transport mode (the columns); barmode='stack' piles them
# on top of each other within each FAF level.
fig = go.Figure(
    data=[
        go.Bar(x=grouped_data.index, y=counts, name=mode)
        for mode, counts in grouped_data.items()
    ]
)

fig.update_layout(
    barmode='stack',
    title='Physical Activity vs Transportation (Stacked)',
    xaxis=dict(title='Physical Activity'),
    yaxis=dict(title='Count'),
)

fig.show()

In [39]:
import plotly.figure_factory as ff

# Correlation matrix over the numeric columns only. df also holds string
# columns (Gender, CAEC, MTRANS, ...), and since pandas 2.0 DataFrame.corr()
# raises on those instead of silently dropping them.
correlation_matrix = df.select_dtypes(include='number').corr()

# Axis labels for the heatmap (identical lists, since the matrix is square).
column_names = correlation_matrix.columns.tolist()
index_names = correlation_matrix.index.tolist()

# Plain 2-D array of coefficients for plotly.
correlation_matrix_array = correlation_matrix.to_numpy()

# Cell labels formatted to two decimals. Built with a comprehension because
# DataFrame.applymap is deprecated since pandas 2.1.
annotation_text = [
    [f'{value:.2f}' for value in row] for row in correlation_matrix_array
]

# Annotated heatmap on a fixed [-1, 1] colour scale.
fig = ff.create_annotated_heatmap(
    z=correlation_matrix_array,
    x=column_names,
    y=index_names,
    colorscale='RdBu',
    showscale=True,
    zmin=-1,
    zmax=1,
    annotation_text=annotation_text
)

# Update layout
fig.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis=dict(title='Features'),
    yaxis=dict(title='Features'),
    width=800,  # Set the width of the figure
    height=700  # Set the height of the figure
)

# Show plot
fig.show()
