# Importing Dependencies

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import requests
import seaborn as sns
from plotly.subplots import make_subplots
from wordcloud import WordCloud

%matplotlib inline

# Data Gathering

In [None]:
# Download the structured HealthApp log sample (CSV) from the loghub GitHub repository.

url="https://raw.githubusercontent.com/logpai/loghub/master/HealthApp/HealthApp_2k.log_structured.csv"

# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error (e.g. 404) instead of letting
# pandas try to parse an error page as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

# Decode the downloaded bytes as UTF-8 and parse them into a pandas DataFrame.

df = pd.read_csv(io.StringIO(download.decode('utf-8')))

df.head()

# EDA

In [None]:
df.info() # Data type of Time is Object 

In [None]:
df.describe()

In [None]:
# Number of distinct values in every column.
print(df.nunique())

# Bar chart of the distinct-value counts, leaving out the LineId, Time and Pid columns.
distinct_counts = df.drop(columns=['LineId', 'Time', 'Pid']).nunique()
distinct_counts.sort_values().plot(kind='bar')

In [None]:
# How often each component appears in the log, printed and drawn as a bar chart.
component_counts = df['Component'].value_counts()
print(component_counts)
component_counts.plot(kind='bar')
plt.ylabel("Frequency")

In [None]:
# How often each event template appears in the log.
template_counts = df['EventTemplate'].value_counts()
print(template_counts)

# Plot only the 15 most frequent event templates.
template_counts.head(15).plot(kind='bar')
plt.ylabel("Frequency")

# Data Preprocessing and Cleaning

In [None]:
df.head(5)

<b>Convert the Time column to a proper datetime format</b>

In [None]:
# Parse the Time strings into datetime values. The format is: year, month and day run
# together, a dash, then hour:minute:second:fraction-of-a-second.
df['Time'] = pd.to_datetime(df['Time'],format='%Y%m%d-%H:%M:%S:%f')

In [None]:
# Split the parsed Time column into separate date, hour, minute and second columns.
# The fractional-second part of Time is not copied into any of these columns.
df['Date'] = df['Time'].dt.date
df['Hour'] = df['Time'].dt.hour
df['Minute'] = df['Time'].dt.minute
df['Second'] = df['Time'].dt.second

In [None]:
df.columns

In [None]:
# Rearrange the columns so the time-related columns sit next to each other after LineId.
df = df[['LineId', 'Time','Date', 'Hour', 'Minute', 'Second','Component', 'Pid', 'Content', 'EventTemplate','EventId']]

In [None]:
df.head()

In [None]:
# If this returns 1, every row has the same Pid, so the column carries no information and can simply be removed.
df['Pid'].nunique()

The Content column and the Event Template column are almost the same, because the Event Template is the template used for the Content column.

<b>Clean the Event Template column and create a new column of standardized events</b>


In [None]:

# Keep only the leading part of each template: split once (n=1) at the first delimiter
# character and store the remainder in a helper column 'Other1'.
# NOTE(review): inside the character class the commas are literal, so ',' is a delimiter
# too, alongside ':', '=' and '<' — confirm that is intended.
# NOTE(review): this overwrites EventTemplate, so the cell is not safe to re-run without
# reloading the data first.
df[['EventTemplate', 'Other1']] = df['EventTemplate'].str.split('[:,=,<]', expand=True,n=1)

In [None]:
df.sample(5)

The Content column tells which action or task was performed in the HealthApp.

In [None]:
#Frequency of Event Template use
# Bar chart of the 25 most frequent values of the (now shortened) EventTemplate column.
df['EventTemplate'].value_counts().head(25).plot(kind='bar')

In [None]:
# Transforming the Event Column into the separate column standardized Event Tell which event is used 

# Exact-match lookup from a whitespace-stripped event template to its standardized name.
# It is built once here instead of being rebuilt on every call of map_events.
# NOTE: the keyword rules below are checked first, so a key that contains one of the
# keywords never reaches this lookup. For the two privacy keys the keyword rule gives the
# same result; 'getTodayBasicStandardSteps' contains 'StandardSteps' and is therefore
# reported as 'Standard Step Status', not as the value listed here.
_TEMPLATE_EVENT_MAPPING = {
    'setTodayTotalDetailSteps': 'Total Detail Steps',
    'calculateCaloriesWithCache totalCalories': 'Total Calories',
    'calculateAltitudeWithCache totalAltitude': 'Total Altitude',
    'onStandStepChanged': 'Total Stand Steps',
    'getTodayTotalDetailSteps': 'Total Detail Steps',
    'processHandleBroadcastAction action': 'Brodcast Action',
    'onReceive action': 'Receive Action',
    'onExtend': 'Extend steps change',  # assumption: onExtend is also some kind of step change
    'REPORT': 'Report',
    'flush sensor data': 'Sensor Data',
    'isScreenOn true': 'Screen Status',
    'screen status unknown': 'Screen Status',
    'getBinderPackageName packageName': 'Binder Packages',
    'upLoadOneMinuteDataToEngine time': 'EngineDataUploadTime',
    'initDataPrivacy the dataPrivacy switch is open': 'Data Privacy',
    'initUserPrivacy the userPrivacy switch is open': 'User Privacy',
    'initUserPrivacy the userPrivacy is true': 'User Privacy',
    'timeStamp back': 'Back TimeStamp',
    'new date': 'New Date Event',
    'InsertCallBack() onSuccess type': 'Insert Callback Success',
    'InsertEvent success begin': 'Insert Event Success',
    'getTodaySportData mStepsRecordManager': 'Get Today Sport Data',
    'theDayChanged': 'Day Changed',
    'initEnviroument': 'Initialize Environment',
    'getStepCounterStatus': 'Get Step Counter Status',
    'reStartStepCounter': 'Restart Step Counter',
    'registersensorsuccess': 'Register Sensor Success',
    'clear()': 'Clear Operation',
    'getTodayBasicStandardSteps': 'Get Today Basic Standard Steps',
    'getDiffTotalSteps': 'Get Different Total Steps',
    'setDiffTotalSteps': 'Get Different Total Steps',
}

# Keyword rules: (substring, standardized name). They are tried in this order on the
# unstripped template and the first substring found wins.
_KEYWORD_EVENT_RULES = (
    ('Sync', 'Sync Status'),
    ('DB', 'DB Status'),
    ('Health', 'Health Data Status'),
    ('DataPrivacy', 'Data Privacy'),
    ('UserPrivacy', 'User Privacy'),
    ('BasicStep', 'Basic Step Status'),
    ('StandardSteps', 'Standard Step Status'),
)


def map_events(event_template):
    """Map a raw event template to a standardized event name.

    Parameters
    ----------
    event_template : str
        The (shortened) event template text. A value that is not a string,
        such as NaN or None for a missing entry, is returned unchanged
        instead of raising a TypeError.

    Returns
    -------
    The standardized event name. The keyword rules are applied first; if none
    matches, the whitespace-stripped template is looked up in the exact-match
    table; if it is not found there either, the original template is returned
    as it was passed in.
    """
    # Missing values cannot be searched for substrings; pass them through untouched.
    if not isinstance(event_template, str):
        return event_template

    for keyword, standardized_name in _KEYWORD_EVENT_RULES:
        if keyword in event_template:
            return standardized_name

    return _TEMPLATE_EVENT_MAPPING.get(event_template.strip(), event_template)

# Apply the mapping function
df['StandardizedEvent'] = df['EventTemplate'].apply(map_events)


In [None]:
print(len(df['StandardizedEvent'].unique()))
df['StandardizedEvent'].value_counts()

<b>Working with the Content Column</b>

In [None]:
df.sample(4)

In [None]:
# Define a function to remove content from Content that is present in EventTemplate
def remove_content(row):
    """Return the row's Content with the EventTemplate text removed from it.

    The template is removed as one whole substring. Iterating over the
    template string would walk it character by character and delete every one
    of those characters wherever it occurs in Content, which mangles the
    remaining text.

    Parameters
    ----------
    row : mapping with 'Content' and 'EventTemplate' entries
        Typically a DataFrame row passed in by ``df.apply(..., axis=1)``.
        The row itself is not modified.

    Returns
    -------
    The Content value without the template text. Content is returned unchanged
    when either value is not a string (for example a missing value) or when
    the template is empty.
    """
    content = row['Content']
    template = row['EventTemplate']

    # Nothing sensible to remove from, or with, a missing or empty value.
    if not isinstance(content, str) or not isinstance(template, str) or not template:
        return content

    return content.replace(template, '')

df['Content'] = df.apply(remove_content,axis=1)

In [None]:
df['Content'].value_counts().head(30).plot(kind='bar')

In [None]:
df.sample(7)

In [None]:
df.isnull().sum()

In [None]:
# Clean up the Content column: remove every character that is not a letter, a digit or
# whitespace. The pattern is a raw string so that "\s" reaches the regex engine as written
# instead of being read as an invalid Python string escape, which raises a warning on
# newer Python versions.
df['Content'] = df['Content'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

df.head()

<b>Cleaning the Component Column</b>

In [None]:
# Split Component once (n=1) at the first '_': the part before it goes into a new 'Step'
# column and the part after it replaces Component.
# NOTE(review): this overwrites Component, so the cell is not safe to re-run without
# reloading the data first.
df[['Step','Component']] = df['Component'].str.split('_',expand=True,n=1)

In [None]:
df.sample(10)

Dropping some columns that are not necessary to keep:

- The Pid column contains only one unique value, which is the same for every row.
- The Time column is removed because we split it into separate columns.
- The Step column is of no use now that I have found which components are present.
- The Event Template column is removed because we created a Standardized Event column from it.

In [None]:

# Remove the helper columns and the columns that have been replaced by derived ones.
columns_to_drop = ['Other1','EventId','Time','Pid','Step','EventTemplate']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
df

# Data Analysis

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
# Frequency of every standardized event, most common first.
# (The plotly imports live in the import cell at the top of the notebook.)
value_counts = df['StandardizedEvent'].value_counts()

# Single-panel figure with one bar per standardized event.
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Bar(x=value_counts.index, y=value_counts.values), row=1, col=1)

# Axis labels and a centred title, set in one layout call.
fig.update_layout(
    xaxis_title='Standardized Event',
    yaxis_title='Frequency of Event Template',
    title_text='Distribution of Standardized Events',
    title_x=0.5,
    title_y=0.95,
)

# Show the plot in Jupyter Notebook
pyo.iplot(fig, validate=False)


Among all the templates, Total Detail Steps is the most used.

In [None]:
# Frequency of every component, most common first.
Frequency_of_component = df['Component'].value_counts()

# Single-panel figure with one bar per component.
fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Bar(x=Frequency_of_component.index, y=Frequency_of_component.values),
    row=1,
    col=1,
)

# Axis labels and a centred title, set in one layout call.
fig.update_layout(
    xaxis_title='Component Used',
    yaxis_title='Frequency',
    title_text='Distribution of Component',
    title_x=0.5,
    title_y=0.95,
)

# Show the plot explicitly, like the other plotly cells, rather than relying on the
# notebook displaying the value of the cell's last expression.
fig.show()



Distribution of components: among all the components, LSC is used the most.

In [None]:
# Rebuild one timestamp per row from the Date, Hour, Minute and Second columns.
# Vectorised datetime arithmetic replaces joining the four values into a string row by
# row and parsing that string again; the resulting timestamps are the same.
df['DateTime'] = (
    pd.to_datetime(df['Date'])
    + pd.to_timedelta(df['Hour'], unit='h')
    + pd.to_timedelta(df['Minute'], unit='min')
    + pd.to_timedelta(df['Second'], unit='s')
)

# Create a scatter plot: time on the x-axis, component on the y-axis, one colour per
# standardized event (legend ordered by first appearance in the data).
fig = px.scatter(df, x='DateTime', y='Component', color='StandardizedEvent',
                 title='Scatter Plot of Component vs. DateTime with StandardizedEvent',
                 labels={'DateTime': 'Date and Time', 'Component': 'Component'},
                 category_orders={'StandardizedEvent': df['StandardizedEvent'].unique()},
                 width=1200, height=600)

# Show the plot
fig.show()


In [None]:
# Line chart with time on the x-axis, component on the y-axis and one coloured line per
# standardized event (legend ordered by first appearance in the data).
line_options = {
    'x': 'DateTime',
    'y': 'Component',
    'color': 'StandardizedEvent',
    'title': 'Line Plot of Component vs. DateTime with StandardizedEvent',
    'labels': {'DateTime': 'Date and Time', 'Component': 'Component'},
    'category_orders': {'StandardizedEvent': df['StandardizedEvent'].unique()},
    'width': 1200,
    'height': 600,
}
fig = px.line(df, **line_options)

# Show the plot
fig.show()

The graph above shows which component and which template were used at a given date and time.
We use the component because every content category falls under one of the components.

i.e. the graph above tells us which component was used at which timestamp and which templates were used.

In [None]:
# Count how often each standardized event (action) occurs, most frequent first.
action_counts = df['StandardizedEvent'].value_counts()

# Bar chart of the top N actions; the top-N slice is taken once and reused.
top_n = 20
top_actions = action_counts.head(top_n)
fig = px.bar(top_actions, x=top_actions.index, y=top_actions.values, color=top_actions.index)
fig.update_layout(xaxis_title='Action', yaxis_title='Frequency',title_text='Top Actions Performed',title_x=0.5,title_y=0.95)
fig.show()


Which action was performed most across all the content.

In [None]:
df.nunique()

We can make a hierarchy of Component, Standardized Event and Content.

In [None]:
# Sunburst chart with three nested rings: Component, then StandardizedEvent, then Content.
fig = px.sunburst(df, path=['Component', 'StandardizedEvent','Content'])
# The title previously ended in the truncated word "Hierarch".
fig.update_layout(title_text='Sunburst Chart of Component, StandardizedEvent, and Content Hierarchy',title_x=0.5,title_y=0.95)
fig.show()


Using a sunburst chart to show hierarchical data: standardized events are nested within components, and the Content column tells which task was performed.

In [None]:
df.head()

In [None]:
df.nunique()

In [None]:
# Join every Content entry into one text for the word cloud. Missing values are skipped
# and the rest are cast to str, because str.join raises a TypeError on any non-string item.
text_data = ' '.join(df['Content'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate(text_data)

# Draw the generated image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()


The word cloud above shows which content is used most; the data is present in the form of strings.

In [None]:
 #Create a line plot
# One red line tracing the component over time; hovering also shows the standardized event.
component_trace = go.Scatter(
    x=df['DateTime'],
    y=df['Component'],
    mode='lines',
    line={'color': 'Red', 'width': 3},
    hoverinfo='x+y+text',
    text=df['StandardizedEvent'],
)
fig = go.Figure([component_trace])

# Minor ticks inside the x-axis, with their grid lines shown.
fig.update_xaxes(minor={'ticks': "inside", 'showgrid': True})

# Titles, no legend, and a centred figure title.
component_layout = {
    'title': 'Line Plot of Component Over Time',
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'Component',
    'showlegend': False,
    'title_x': 0.5,
    'title_y': 0.95,
}
fig.update_layout(**component_layout)

# Show the plot
fig.show()

In [None]:
 #Create a line plot
# One green line tracing the standardized event over time; the event name is also the hover text.
event_trace = go.Scatter(
    x=df['DateTime'],
    y=df['StandardizedEvent'],
    mode='lines',
    line={'color': 'Green', 'width': 4},
    hoverinfo='x+y+text',
    text=df['StandardizedEvent'],
)
fig = go.Figure([event_trace])

# Minor ticks inside the x-axis, with their grid lines shown.
fig.update_xaxes(minor={'ticks': "inside", 'showgrid': True})

# Titles, no legend, and a centred figure title.
event_layout = {
    'title': 'Line Plot of StandardizedEvent Over Time',
    'xaxis_title': 'Date and Time',
    'yaxis_title': 'StandardizedEvent',
    'showlegend': False,
    'title_x': 0.5,
    'title_y': 0.95,
}
fig.update_layout(**event_layout)

# Show the plot
fig.show()

## Conclusion

In conclusion, we have analysed a lot of information from the log data and found insights, patterns and trends in the loghub HealthApp log data. Some of the insights are:

1. **Peak Activity Times:**
   - Most activity occurs around midnight on 24 Dec 2017; this is when most of the tasks are performed by the user.

2. **Most Performed Tasks:**
   - The most frequently performed task is Total Detail Steps, which uses the SPUtils component; the most used HealthApp component overall is LSC.

3. **Template Utilization:**
   - Most of the time, the template used in the HealthApp is Total Detail Steps.


The above are some points from the analysis, but to find better insights from any data we need domain-specific knowledge.