[Reference](https://devqueue.medium.com/analyzing-personal-web-activity-5df9cf1bdef5)

# Getting History Data


[Google Takeout](https://takeout.google.com/?pli=1)

In [None]:
import json

import pandas as pd

# Read the browser-history export; an explicit UTF-8 encoding keeps the read
# independent of the platform's default text encoding.
with open("BrowserHistory.json", encoding="utf-8") as f:
    data = json.load(f)

# Every entry under the "Browser History" key becomes one row of the frame.
df = pd.DataFrame(data["Browser History"])

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Discard the two columns that the rest of this notebook never reads.
df.drop(columns=['client_id', 'favicon_url'], inplace=True)

In [None]:
# Show the distinct page_transition values that occur in the history.
df['page_transition'].unique()

In [None]:
# Keep only the visits whose transition type is one of these four values.
kept_transitions = ["LINK", "TYPED", "RELOAD", "GENERATED"]
# .copy() makes the filtered result an independent frame, so the columns added
# in later cells do not raise SettingWithCopyWarning.
df = df[df['page_transition'].isin(kept_transitions)].copy()

# Extracting new features:


In [None]:
import datetime

def time_converter(x):
    """Convert a timestamp in microseconds into a ``datetime`` object.

    ``x`` is divided by one million to get seconds, and the result is passed
    to ``datetime.datetime.fromtimestamp``; no timezone is supplied, so the
    returned value is a naive datetime in the machine's local time.
    """
    seconds_since_epoch = x / 1_000_000
    return datetime.datetime.fromtimestamp(seconds_since_epoch)
    
# Turn every raw microsecond timestamp into a datetime and store it as a new column.
usec_column = df['time_usec']
df['date_time'] = usec_column.apply(time_converter)

In [None]:
# Preview the first four rows, including the new date_time column.
df.head(4)

In [None]:
# Split the timestamp into calendar components, one column per component.
date_index = pd.DatetimeIndex(df.date_time)
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(date_index, part)

## Function 1: Returns the plain-text domain name extracted from a URL



In [None]:
import tldextract

def return_domain(x):
    """Return a short site label for the URL ``x``.

    The label is normally the registered domain name without its suffix.
    Two cases are special:

    * a ``mail`` subdomain is kept, giving ``"mail.<domain>"``;
    * ``www.google.*`` is reported as ``"google_search"``.
    """
    # Parse the URL once and read the parts by attribute name, so the code
    # does not depend on the result object supporting positional indexing.
    parts = tldextract.extract(x)
    domain = parts.domain
    sub_domain = parts.subdomain
    if sub_domain == "mail":
        return sub_domain + "." + domain
    # To differentiate between drive.google.com and google.com
    if domain == "google" and sub_domain == "www":
        return "google_search"
    return domain

## Function 2: Returns the category of a particular domain. I’ve separated the domains into four categories: Learning, Newsreads, social media, and Other

In [None]:
# Lookup table from a domain label (as produced by return_domain) to its category.
_DOMAIN_CATEGORIES = {
    **dict.fromkeys(
        [
            "pluralsight", "w3schools",
            # "geeksforgeeks" is the label the real site yields; the original
            # spelling "geeks4geeks" is kept so existing inputs still match.
            "geeksforgeeks", "geeks4geeks",
            "freecodecamp", "jovian", "stackoverflow", "kodekloud",
            "teachable", "pynative", "realpython",
        ],
        "Learning",
    ),
    **dict.fromkeys(
        [
            "9to5google", "theverge", "sciencedaily", "digitaltrends",
            "towardsdatascience", "geekblooging",
        ],
        "Newsreads",
    ),
    **dict.fromkeys(
        [
            "youtube", "instagram", "facebook", "twitter", "pinterest",
            "discord", "whatsapp", "snapchat",
        ],
        "social media",
    ),
}


def return_category(x):
    """Return the category name for the domain label ``x``.

    The result is one of ``"Learning"``, ``"Newsreads"``, ``"social media"``
    or, for any label not in the lookup table, ``"Other"``.
    """
    return _DOMAIN_CATEGORIES.get(x, "Other")
# Label every URL with its domain, then group the domains into categories.
domains = df['url'].apply(return_domain)
df['domain'] = domains
df['category'] = domains.apply(return_category)

# Exploratory Analysis and Visualization


In [None]:
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

## 1. Most visited page transition:


In [None]:
# Histogram of how many visits fall under each page_transition value.
plt.hist(df['page_transition'], color='darkorange')
plt.title("Distribution of pages");

## 2. Most active time:


In [None]:
# Count the visits for every (hour, day) combination.
df_heat = df.groupby(["hour", "day"])["url"].size().reset_index()
# Reshape to an hour-by-day grid. The arguments are passed by keyword because
# pandas >= 2.0 no longer accepts them positionally.
df_heat2 = df_heat.pivot(index="hour", columns="day", values="url")
g = sns.heatmap(df_heat2, cmap='Blues')
# Flip the y axis so the hours increase from bottom to top.
g.invert_yaxis()
plt.show()

## 3. Category Pie chart


In [None]:
# Pie chart showing each category's share of all visits.
category_counts = df['category'].value_counts()
plt.figure(figsize=(5, 5))
category_counts.plot(kind='pie', autopct='%1.1f%%', shadow=True)
plt.show()

## 4. Total youtube links visited


In [None]:
# Select the YouTube visits made in 2020. Each comparison is evaluated on its
# own first, because | and & bind more tightly than == and would otherwise
# combine the wrong operands; & is used so that both conditions must hold.
is_youtube = df['domain'] == "youtube"
in_2020 = df['year'] == 2020
df_youtube = df[is_youtube & in_2020]
# Plots
plt.figure(figsize=(14,8))
plt.title("Youtube link visits")
sns.countplot(x='month',data=df_youtube, palette=['#432371',"#FAAE7B"]);