In [None]:
#importing necessary modules

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Creating dataframe from given dataset (path is relative to the notebook's working directory)
df = pd.read_csv("dailyActivity_merged.csv")    # df is the frame the following cells work on

In [None]:
#Shape of the dataframe as (rows, columns)
print(df.shape)
#First 5 rows of the dataframe
df.head(n=5)

In [None]:
#Displaying 10 random rows of the dataframe.
#random_state is fixed so the same rows are shown on every Restart & Run All.
df.sample(n=10, random_state=42)

In [None]:
#Checking for null values: number of missing entries per column.
#A column without missing data shows 0 (the original run reported no nulls).
df.isna().sum()

In [None]:
#Column names, dtypes and non-null counts of the dataframe
df.info()

In [None]:
#Converting the 'ActivityDate' column to datetime format
#(no explicit format is passed, so pandas infers it from the values)
df['ActivityDate'] = pd.to_datetime(df['ActivityDate'])

In [None]:
#Summary statistics of the dataframe's columns
df.describe()

• The data is recorded from 2016-03-12 to 2016-04-12 <br>
• There are multiple users

In [None]:
#Pairwise correlation matrix of the dataframe's columns;
#read the 'Calories' row/column for each feature's correlation with the target
df.corr()


• TrackerDistance and TotalDistance have almost the same values and distribution, i.e. they behave the same and are approximately duplicates <br>
=> The tracker is highly accurate in accounting for the distance

• Calories has a high correlation with:
    TotalDistance > TrackerDistance > TotalSteps > VeryActiveMinutes > LightActiveDistance > VeryActiveDistance

• Calories has a low to negligible correlation with:
    ActivityDate < SedentaryMinutes < SedentaryActiveDistance < LoggedActivityDistance 

• There is a huge difference between the values of Total/Tracker distance and LoggedActivityDistance
=> Manual entries by users are highly inaccurate, or users do not regularly log their activity (many zero values in the dataset)





In [None]:
#Plotting a scatterplot of every numeric feature against the target column (Calories)

#Columns that are not features: the user id, the date and the target itself
skip_cols = ['Id','ActivityDate','Calories']
num_cols = [col for col in df.select_dtypes(include='number').columns if col not in skip_cols]

for col in num_cols :
    fig, ax = plt.subplots(figsize=(4,2))
    sns.scatterplot(data=df, x='Calories', y=col, ax=ax)
    #The plot is a scatterplot, so the title says so (it previously read "BoxPlot")
    ax.set_title(f"Scatterplot of {col} vs Calories")
    fig.tight_layout()
    plt.show()
    #Release the figure so the loop does not accumulate open figures
    plt.close(fig)



**TAKEAWAYS** <br>
• Arguably linear relation with TotalSteps, with some outliers <br>
• Somewhat linear relation with TotalDistance, but a large cluster forms in (1000,3000) with noticeable outliers <br>
• Arguably linear relation with TrackerDistance, with some outliers <br>
• The majority of values are 0 for LoggedActivityDistance <br>
• VeryActiveDistance has many 0 values <br>
• ModeratelyActiveDistance has 0s but more scattered values than VeryActiveDistance <br>
• LightActiveDistance is highly scattered with arguable linearity <br>
• SedentaryActiveDistance is mostly 0 <br>
• VeryActiveMinutes has many 0 values along with a significant number of non-zero values throughout the data <br>
• FairlyActiveMinutes is more scattered than VeryActiveMinutes <br>
• LightlyActiveMinutes is well scattered <br>
• SedentaryMinutes is well scattered with noticeable outliers <br>

In [None]:
#Distribution of VeryActiveMinutes as a horizontal boxplot
sns.boxplot(data=df, x='VeryActiveMinutes')

• VeryActiveMinutes values beyond 75 minutes are outliers 

In [None]:
#TotalSteps (y) against TotalDistance (x) for every daily record
sns.scatterplot(data=df, x='TotalDistance', y='TotalSteps')

• TotalSteps and TotalDistance have an almost linear relation (similar to y = x) with some outliers
=> The two can be combined into a single feature

In [None]:
#Lineplot of Calories over time, across all records
plt.figure(figsize=(12, 4))
sns.lineplot(data=df, x='ActivityDate', y='Calories').set(
    title='Calories Burned Over Time',
    xlabel='Date',
    ylabel='Calories',
)
plt.show()

#Lineplot shows that multiple users exist

In [None]:
#Number of distinct users (the original run reported 35)
df['Id'].nunique(dropna=True)

In [None]:
#Per-user summary: mean and total of calories, steps and very-active minutes,
#plus the first date, last date and number of records of each user
user_summary = (
    df.groupby('Id')
    .agg(dict(
        Calories=['mean', 'sum'],
        TotalSteps=['mean', 'sum'],
        VeryActiveMinutes=['mean', 'sum'],
        ActivityDate=['min', 'max', 'count'],
    ))
    .reset_index()
)

In [None]:
#Example user: the Id of the first record in the dataframe
some_user = df['Id'].iloc[0]

#That user's records in chronological order
user_df = df.loc[df['Id'].eq(some_user)].sort_values(by='ActivityDate')

plt.figure(figsize=(12, 4))
sns.lineplot(x='ActivityDate', y='Calories', data=user_df)
plt.title(f'Calories Over Time for User {some_user}')
plt.show()

In [None]:
#Lineplot of all users' calories over time (one line per user Id)

#The figure has to be created BEFORE plotting: calling plt.figure() after the plot
#leaves the lines on a default-sized figure and opens a second, empty one.
plt.figure(figsize=(12, 6))
sns.lineplot(data=df, x='ActivityDate', y='Calories', hue='Id', linewidth=0.5)

plt.title("Calories Burned Over Time (Per User)")
plt.xlabel("ActivityDate")
plt.ylabel("Calories")
plt.tight_layout()
plt.show()

In [None]:
#Scatterplot of Total Steps vs Calories per User (both summed over each user's records)
user_totals = (
    df.groupby('Id')[['Calories', 'TotalSteps']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='TotalSteps', y='Calories', data=user_totals)
plt.title("Total Steps vs Calories per User")
plt.show()

• There is no linearity in Steps vs Calories for different users
=> Significant activity difference between two users

• Calories burned usually increases for users in range 0 - 100000 steps and remains almost stagnant in range 100000 - 200000 steps. Rest could be considered outliers
=> More steps != More calories burned

In [None]:
#Scatterplot of Total Distance vs Calories per User (both summed over each user's records)
user_totals = (
    df.groupby('Id')[['Calories', 'TotalDistance']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='TotalDistance', y='Calories', data=user_totals)
plt.title("Total Distance vs Calories per User")
plt.show()

In [None]:
#Scatterplot of Total VeryActiveDistance vs Calories per User (sums per user)
user_totals = (
    df.groupby('Id')[['Calories', 'VeryActiveDistance']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='VeryActiveDistance', y='Calories', data=user_totals)
plt.title("Total Very-Active-Distance vs Calories per User")
plt.show()

• Too scattered to make any proper judgement, but usually more distance --> more calories burned

In [None]:
#Scatterplot of Total VeryActiveMinutes vs Calories per User (sums per user)
user_totals = (
    df.groupby('Id')[['Calories', 'VeryActiveMinutes']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='VeryActiveMinutes', y='Calories', data=user_totals)
plt.title("Total Very-Active-Minutes vs Calories per User")
plt.show()

• A more linear relation compared to VeryActiveDistance
=> Consistency > Intensity

In [None]:
#Distance covered per very-active minute, for each daily record.
#Records with 0 VeryActiveMinutes get a NaN denominator, so the ratio is NaN there
#instead of inf (an inf would make a per-user sum of this column infinite).
df['VeryActivityIntensity'] = df['VeryActiveDistance'] / df['VeryActiveMinutes'].where(df['VeryActiveMinutes'] != 0)

In [None]:
#Scatterplot of summed VeryActivityIntensity vs summed Calories per User
user_totals = (
    df.groupby('Id')[['Calories', 'VeryActivityIntensity']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='VeryActivityIntensity', y='Calories', data=user_totals)
plt.title("Total Very-Activity-Intensity vs Calories per User")
plt.show()

In [None]:
#Scatterplot of Total LightActiveDistance vs Calories per User (sums per user)
user_totals = (
    df.groupby('Id')[['Calories', 'LightActiveDistance']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='LightActiveDistance', y='Calories', data=user_totals)
plt.title("Total Light-Active-Distance vs Calories per User")
plt.show()

• Fairly linear relation with some outliers --> Polynomial would be a better fit

In [None]:
#Scatterplot of Total LightlyActiveMinutes vs Calories per User (sums per user)
user_totals = (
    df.groupby('Id')[['Calories', 'LightlyActiveMinutes']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='LightlyActiveMinutes', y='Calories', data=user_totals)
plt.title("Total Lightly-Active-Minutes vs Calories per User")
plt.show()

• Calories burned do increase with LightlyActiveMinutes, but the relation cannot be termed linear; grouping or clustering seems more viable than a linear / polynomial fit

In [None]:
#Distance covered per step, for each daily record.
#Records with 0 TotalSteps get a NaN denominator, so the ratio is NaN there
#instead of inf (an inf would make a per-user sum of this column infinite).
df['StepsIntensity'] = df['TotalDistance'] / df['TotalSteps'].where(df['TotalSteps'] != 0)

In [None]:
#Scatterplot of summed StepsIntensity vs summed Calories per User
user_totals = (
    df.groupby('Id')[['Calories', 'StepsIntensity']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='StepsIntensity', y='Calories', data=user_totals)
plt.title("Steps-Intensity vs Calories per User")
plt.show()

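• **Strongly linear** relation between calories burned and StepsIntensity <br>
• Caveat: both axes are per-user sums over days, so users with more recorded days score higher on both; part of this linearity may reflect the number of days logged rather than step intensity itself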

In [None]:
#Sedentary distance per sedentary minute, for each daily record.
#Records with 0 SedentaryMinutes get a NaN denominator, so the ratio is NaN there
#instead of inf (an inf would make a per-user sum of this column infinite).
df['SedentaryIntensity'] = df['SedentaryActiveDistance'] / df['SedentaryMinutes'].where(df['SedentaryMinutes'] != 0)

In [None]:
#Scatterplot of summed SedentaryIntensity vs summed Calories per User
user_totals = (
    df.groupby('Id')[['Calories', 'SedentaryIntensity']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='SedentaryIntensity', y='Calories', data=user_totals)
plt.title("Total Sedentary-Intensity vs Calories per User")
plt.show()

In [None]:
#Light-activity distance per lightly-active minute, for each daily record.
#Records with 0 LightlyActiveMinutes get a NaN denominator, so the ratio is NaN there
#instead of inf (an inf would make a per-user sum of this column infinite).
df['LightActivityIntensity'] = df['LightActiveDistance'] / df['LightlyActiveMinutes'].where(df['LightlyActiveMinutes'] != 0)

In [None]:
#Scatterplot of summed LightActivityIntensity vs summed Calories per User
user_totals = (
    df.groupby('Id')[['Calories', 'LightActivityIntensity']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='LightActivityIntensity', y='Calories', data=user_totals)
plt.title("Total Light-Activity-Intensity vs Calories per User")
plt.show()

In [None]:
#Moderate-activity distance per fairly-active minute, for each daily record.
#Records with 0 FairlyActiveMinutes get a NaN denominator, so the ratio is NaN there
#instead of inf (an inf would make a per-user sum of this column infinite).
df['ModerateActivityIntensity'] = df['ModeratelyActiveDistance'] / df['FairlyActiveMinutes'].where(df['FairlyActiveMinutes'] != 0)

In [None]:
#Scatterplot of summed ModerateActivityIntensity vs summed Calories per User
user_totals = (
    df.groupby('Id')[['Calories', 'ModerateActivityIntensity']]
    .agg('sum')
    .reset_index()
)

sns.scatterplot(x='ModerateActivityIntensity', y='Calories', data=user_totals)
plt.title("Total Moderate-Activity-Intensity vs Calories per User")
plt.show()

In [None]:
#Display the per-user summary table (one row per Id)
user_summary

In [None]:
#First 5 rows of df, which by now also holds the derived intensity columns
df.head()