In [15]:
import numpy as np
import pandas as pd
from datetime import datetime

In [16]:
# Read blog data and analytics data
blogs = pd.read_csv('../data/blog_text.csv', 
    sep=',', 
    engine='python', 
    parse_dates=['time'], 
    date_parser=lambda col: pd.to_datetime(col, utc=True)
)
analytics = pd.read_csv('../data/google_analytics.csv')

# Clean up category titles
blogs['category'] = blogs['category'].str.replace('&amp;', 'and')

# Convert time strings to datetime format
blogs['time'] = blogs['time'].dt.date

blogs['url'] = 'blog' + blogs['url'].str.split('blog', expand=True)[1].str.rstrip('/')

# Combine the two tables
combined = pd.merge(blogs, analytics, how='inner', on='url')

# Drop entires that do not have urls OR pageviews
combined = combined.dropna(subset=['url', 'pageviews'])

# Infer titles from urls for entries that are missing a title
#   1. Select entries that do not have a title
#   2. For each selected entry take the URL
#   3. Drop the first 5 character ('blog/')
#   4. Replace dashes ('-') with whitespaces
#   5. Use the altered URL as a new title for the entry
mask = combined['title'].isna()
combined.loc[mask, 'title'] = combined['url'][mask].str[5:].str.replace('-', ' ').str.capitalize()

# Test
combined.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 785 entries, 0 to 784
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        785 non-null    int64  
 1   url               785 non-null    object 
 2   title             785 non-null    object 
 3   time              785 non-null    object 
 4   category          785 non-null    object 
 5   description       778 non-null    object 
 6   text              785 non-null    object 
 7   introduction      785 non-null    object 
 8   author            785 non-null    object 
 9   author_job_title  785 non-null    object 
 10  pageviews         785 non-null    int64  
 11  unique_pageviews  785 non-null    int64  
 12  avg_time          785 non-null    float64
 13  bounce_rate       785 non-null    float64
 14  exit%             785 non-null    float64
dtypes: float64(3), int64(3), object(9)
memory usage: 98.1+ KB


In [17]:
# Exploration of causes of NA values

# First, let's take a look at rows with no text data
combined[combined['text'].isna()]

Unnamed: 0.1,Unnamed: 0,url,title,time,category,description,text,introduction,author,author_job_title,pageviews,unique_pageviews,avg_time,bounce_rate,exit%


Each of these URLs redirect to another page, so we can safely drop them without losing any valuable data. 

In [18]:
combined.dropna(axis=0, subset=['text'], inplace=True)
combined.reset_index(inplace=True)
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785 entries, 0 to 784
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             785 non-null    int64  
 1   Unnamed: 0        785 non-null    int64  
 2   url               785 non-null    object 
 3   title             785 non-null    object 
 4   time              785 non-null    object 
 5   category          785 non-null    object 
 6   description       778 non-null    object 
 7   text              785 non-null    object 
 8   introduction      785 non-null    object 
 9   author            785 non-null    object 
 10  author_job_title  785 non-null    object 
 11  pageviews         785 non-null    int64  
 12  unique_pageviews  785 non-null    int64  
 13  avg_time          785 non-null    float64
 14  bounce_rate       785 non-null    float64
 15  exit%             785 non-null    float64
dtypes: float64(3), int64(4), object(9)
memory us

In [19]:
# Next, let's look at rows with no category

combined[combined['category'].isna()]

Unnamed: 0.1,index,Unnamed: 0,url,title,time,category,description,text,introduction,author,author_job_title,pageviews,unique_pageviews,avg_time,bounce_rate,exit%


Some of these articles once again redirect to different pages, but others seem like just normal blog posts. Perhaps the categories for those can be scraped when we move fully to BeautifulSoup.

In [20]:
# Write the resulting merged table to file
mask = combined['text'].str.contains("\t")
combined.to_csv("../data/blogs_with_analytics.csv", sep='\t', index=False)

# Test
new_combined = pd.read_csv("../data/blogs_with_analytics.csv", sep='\t')
combined.compare(new_combined)


Unnamed: 0_level_0,time,time,avg_time,avg_time
Unnamed: 0_level_1,self,other,self,other
0,2022-09-16,2022-09-16,,
1,2022-09-13,2022-09-13,,
2,2022-09-12,2022-09-12,,
3,2022-09-01,2022-09-01,,
4,2022-08-26,2022-08-26,,
...,...,...,...,...
780,2010-05-10,2010-05-10,,
781,2010-03-19,2010-03-19,,
782,2010-03-12,2010-03-12,,
783,2010-02-26,2010-02-26,,
