In [1]:
import csv
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# load the data from Google Ad Words into a list of dicts
with open('bbqguys_31oct2015_31oct2016.csv') as f:
    ads = list(csv.DictReader(f))

In [3]:
# extract the relevant colummns into a new dict
data = []
for line in ads:
    row = {'Ad ID' : line['Ad ID'], 
           'Ad group' : line['Ad group'],
           'Ad state' : line['Ad state'],
           'Bounce rate' : line['Bounce rate'], 
           'CTR' : line['CTR'],
           'Conv. rate' : line['Conv. rate'],
           'Description' : line['Description'],
           'Description line 1' : line['Description line 1'],
           'Description line 2' : line['Description line 2'],
           }
    data.append(row)

In [4]:
# convert the percent values to floats
fields = ["Bounce rate", "CTR", "Conv. rate"]
for line in data:
    for item in fields:
        line[item] = float(line[item].replace("%", ""))

In [5]:
# combine all descriptions into one
for line in data:
    line['Full description'] = line['Description'] + " " + line['Description line 1'] + " " + line['Description line 2']
    line['Description Length'] = len(line['Full description'])

In [6]:
data[0]

{'Ad ID': '85878522126',
 'Ad group': 'Blaze 25 Inch Grills',
 'Ad state': 'paused',
 'Bounce rate': 0.0,
 'CTR': 0.0,
 'Conv. rate': 0.0,
 'Description': '',
 'Description Length': 64,
 'Description line 1': 'Save 20% On All Blaze Grills.',
 'Description line 2': 'Free & Fast Shipping. Shop Today!',
 'Full description': ' Save 20% On All Blaze Grills. Free & Fast Shipping. Shop Today!'}

In [7]:
# remove any non-letter from the full description and replace them with spaces, and convert words into lowercase and split
for line in data:
    letters = re.sub("[^a-zA-Z]", " ", line['Full description'])
    line['Description words'] = set(letters.lower().split(' '))

In [8]:
data[0]

{'Ad ID': '85878522126',
 'Ad group': 'Blaze 25 Inch Grills',
 'Ad state': 'paused',
 'Bounce rate': 0.0,
 'CTR': 0.0,
 'Conv. rate': 0.0,
 'Description': '',
 'Description Length': 64,
 'Description line 1': 'Save 20% On All Blaze Grills.',
 'Description line 2': 'Free & Fast Shipping. Shop Today!',
 'Description words': {'',
  'all',
  'blaze',
  'fast',
  'free',
  'grills',
  'on',
  'save',
  'shipping',
  'shop',
  'today'},
 'Full description': ' Save 20% On All Blaze Grills. Free & Fast Shipping. Shop Today!'}

In [9]:
# extract non-stopwords from the words in the description
stopwrds = set(stopwords.words("english"))
for line in data:
    line['Clean words'] = line['Description words'].difference(stopwrds)

In [10]:
data[0]

{'Ad ID': '85878522126',
 'Ad group': 'Blaze 25 Inch Grills',
 'Ad state': 'paused',
 'Bounce rate': 0.0,
 'CTR': 0.0,
 'Clean words': {'',
  'blaze',
  'fast',
  'free',
  'grills',
  'save',
  'shipping',
  'shop',
  'today'},
 'Conv. rate': 0.0,
 'Description': '',
 'Description Length': 64,
 'Description line 1': 'Save 20% On All Blaze Grills.',
 'Description line 2': 'Free & Fast Shipping. Shop Today!',
 'Description words': {'',
  'all',
  'blaze',
  'fast',
  'free',
  'grills',
  'on',
  'save',
  'shipping',
  'shop',
  'today'},
 'Full description': ' Save 20% On All Blaze Grills. Free & Fast Shipping. Shop Today!'}

In [11]:
# get a list of all clean words in all ads
all_words = []
for line in data:
    for word in line['Clean words']:
        if word != '' and len(word) > 3:
            all_words.append(word)

In [12]:
# get a count of the most common words used
top25_words = Counter(all_words).most_common(25)
top25_words = set(line[0] for line in top25_words)
top25_words

{'accessories',
 'deals',
 'expert',
 'fast',
 'free',
 'grill',
 'grilling',
 'grills',
 'kitchens',
 'lifetime',
 'lynx',
 'online',
 'orders',
 'outdoor',
 'prices',
 'reviews',
 'save',
 'selection',
 'ship',
 'shipping',
 'shop',
 'today',
 'videos',
 'warranty',
 'weber'}

In [13]:
# create an indicator variable for each word to tell if it's present in the clean ad words
for line in data:
    for word in top25_words:
        if word in line['Clean words']:
            line[word] = 1
        else:
            line[word] = 0

In [14]:
with open('AdWords_Clean.csv', 'w') as f:
    writer = csv.DictWriter(f, data[0].keys())
    writer.writeheader()
    writer.writerows(data)