In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
# Load the scraped blog posts and the Google Analytics export.
# The 'time' column is read as plain text and converted explicitly below,
# because read_csv's `date_parser` argument is deprecated since pandas 2.0.
blogs = pd.read_csv('../data/blog_text.csv',
    sep=',',
    engine='python',
    dtype={'time': str}
)
analytics = pd.read_csv('../data/google_analytics.csv')

# Clean up category titles: spell out the HTML-escaped ampersand.
# regex=False because the pattern is a literal string, not a regular expression.
blogs['category'] = blogs['category'].str.replace('&amp;', 'and', regex=False)

# Parse the time strings as UTC timestamps, then keep only the calendar date
# (the column ends up holding datetime.date objects, not datetimes).
blogs['time'] = pd.to_datetime(blogs['time'], utc=True).dt.date

# Reduce each URL to 'blog<rest of path>' without a trailing slash so it can be
# joined with the analytics table on 'url'.
# n=1 splits only at the FIRST occurrence of 'blog'; without it, a slug that
# itself contains 'blog' (e.g. '.../blog/my-blog-post/') would be cut short.
blogs['url'] = 'blog' + blogs['url'].str.split('blog', n=1, expand=True)[1].str.rstrip('/')

# Combine the two tables, keeping only URLs present in both.
combined = pd.merge(blogs, analytics, how='inner', on='url')

# Drop entries that are missing a url OR a pageviews value.
combined = combined.dropna(subset=['url', 'pageviews'])

# Infer titles from urls for entries that are missing a title
#   1. Select entries that do not have a title
#   2. For each selected entry take the URL
#   3. Drop the first 5 characters ('blog/')
#   4. Replace dashes ('-') with whitespaces
#   5. Capitalize the result (first letter upper-case, the rest lower-case)
#   6. Use the altered URL as a new title for the entry
mask = combined['title'].isna()
combined.loc[mask, 'title'] = (
    combined.loc[mask, 'url']
    .str[5:]
    .str.replace('-', ' ', regex=False)
    .str.capitalize()
)

# Test
combined.info()


In [None]:
# Investigate where the NA values come from.

# Start with the rows whose 'text' field is empty.
combined.loc[combined['text'].isna(), :]

Each of these URLs redirects to another page, so we can safely drop these rows without losing any valuable data.

In [None]:
# Remove the rows that have no text and renumber the remaining rows from 0.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column (which would otherwise be written to the exported file), and building
# a new frame rather than mutating in place keeps this cell safe to re-run.
combined = combined.dropna(axis=0, subset=['text']).reset_index(drop=True)
combined.info()

In [None]:
# Now inspect the rows whose 'category' field is empty.

combined.loc[combined['category'].isna(), :]

Some of these articles once again redirect to different pages, but others appear to be ordinary blog posts. Perhaps the categories for those can be scraped once we move fully to BeautifulSoup.

In [None]:
# Write the resulting merged table to a tab-separated file.
# NOTE(review): `mask` flags rows whose text contains a tab character but is
# not used anywhere in this cell. With to_csv's default quoting, fields that
# contain the separator are quoted, so embedded tabs should survive the round
# trip - confirm whether this check is still needed.
mask = combined['text'].str.contains("\t")
combined.to_csv("../data/blogs_with_analytics.csv", sep='\t', index=False)

# Test: reload the file and diff it against the in-memory table.
new_combined = pd.read_csv("../data/blogs_with_analytics.csv", sep='\t')

# 'time' holds datetime.date objects in memory but is read back as plain
# strings; convert it so compare() does not report every row as different.
if 'time' in new_combined.columns:
    new_combined['time'] = pd.to_datetime(new_combined['time']).dt.date

combined.compare(new_combined)
