In [2]:
# Import the libraries
import pandas as pd # This library helps us to work with data in tables
import numpy as np # This library helps us to do some math calculations
import matplotlib.pyplot as plt # This library helps us to make graphs
import seaborn as sns # This library helps us to make graphs look nicer
import re # This library helps us to find patterns in text

# Define the log file path
log_file = "HealthApp/HealthApp_2k.log" # This is the name of the file that has the data we want to analyze

# Define the regex pattern to parse the log file
pattern = r"(?P<user_id>\d+), (?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}), (?P<activity>\w+), (?P<step_count>\d+), (?P<heart_rate>\d+)" # This is a special way of writing what the data looks like in the file

# Create an empty list to store the parsed data
data = [] # This is where we will put the data after we read it from the file

# Open the log file and read it line by line
with open(log_file, "r") as f: # This is how we open the file and give it a name f
    for line in f: # This is how we loop through each line in the file
        # Match the regex pattern with the line
        match = re.search(pattern, line) # This is how we check if the line matches the pattern we defined
        if match: # This is how we do something only if the line matches the pattern
            # Extract the data from the match object
            user_id = match.group("user_id") # This is how we get the user ID from the line
            timestamp = match.group("timestamp") # This is how we get the timestamp from the line
            activity = match.group("activity") # This is how we get the activity from the line
            step_count = match.group("step_count") # This is how we get the step count from the line
            heart_rate = match.group("heart_rate") # This is how we get the heart rate from the line
            # Append the data as a dictionary to the list
            data.append({"user_id": user_id, "timestamp": timestamp, "activity": activity, "step_count": step_count, "heart_rate": heart_rate}) # This is how we add the data to the list as a dictionary

# Convert the list to a pandas dataframe
df =  pd.read_csv(log_file, sep='|', dtype={"user_id": int, "username": "string"}) # This is how we turn the list into a table using pandas

# Convert the timestamp column to datetime format
df["times"] = pd.to_datetime(df["times"], format='%Y%m%d-%H:%M:%S:%f') # This is how we change the timestamp column to a date and time format using pandas

# Perform some data analysis and statistics
# For example, you can calculate the average step count and heart rate for each user and activity
avg_df = df.groupby(["user_id", "activity"]).agg({"step_count": "mean", "heart_rate": "mean"}).reset_index() # This is how we group the data by user and activity and calculate the average step count and heart rate using pandas

# Create some graphs and charts to visualize the data and the analysis results
# For example, you can create a bar chart to compare the average step count and heart rate for each user and activity
sns.barplot(x="user_id", y="step_count", hue="activity", data=avg_df) # This is how we make a bar chart using seaborn
plt.title("Average Step Count by User and Activity") # This is how we add a title to the graph using matplotlib
plt.xlabel("User ID") # This is how we add a label to the x-axis using matplotlib
plt.ylabel("Step Count") # This is how we add a label to the y-axis using matplotlib
plt.show() # This is how we show the graph using matplotlib

sns.barplot(x="user_id", y="heart_rate", hue="activity", data=avg_df) # This is how we make another bar chart using seaborn
plt.title("Average Heart Rate by User and Activity") # This is how we add a title to the graph using matplotlib
plt.xlabel("User ID") # This is how we add a label to the x-axis using matplotlib
plt.ylabel("Heart Rate") # This is how we add a label to the y-axis using matplotlib
plt.show() # This is how we show the graph using matplotlib


ParserError: Error tokenizing data. C error: Expected 4 fields in line 1794, saw 7
