In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
# Load the raw ads export from a relative path and preview the first two rows.
ads_df = pd.read_csv('../raw_data/raw.csv')
ads_df.head(2)

Unnamed: 0.1,Unnamed: 0,file_name,ad_id,ad_text,ad_landing_page,ad_targeting_custom_audience,ad_targeting_location,ad_targeting_interests,ad_targeting_excluded_connections,ad_targeting_age,ad_targeting_language,ad_targeting_placements,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
0,0,P(1)0003283.txt,1349,Close the borders until it's too late.\nLike t...,https://www.facebook.com/StopAllInvaders,,United States,,Exclude people who like Stop A.1.,35 - 65+,English (UK)or English (US),News Feed on desktop computers or News Feed on...,Interests: Immigration or Politics,1.0,0.0,0.66 RUB,04/13/16 07:48:33 AM PDT,
1,1,P(1)0002823.txt,986,repost @[1029022833877757:274:Blackslayingit]\n,https://www.facebook.com/blacktivists/,,United States,,,16 - 65+,English (UK) or English (US),News Feed on desktop computers or News Feed on...,"Interests: Pan-Africanism, African-American Ci...",10.496,1.823,200.00 RUB,04/21/17 07:42:45 AM PDT,04/22/17 11:00:00 AM PDT


In [123]:
# Running tally of dataset sizes, later interpolated into the report strings.
# NOTE: despite the key name, len(ads_df) is the number of ROWS, not columns.
cleaning_summary_format = {'columns_before_cleaning': len(ads_df)}

In [124]:
def remove_nulls(ads_df, column_name):
    """Return ``ads_df`` without the rows whose ``column_name`` is null.

    The number of rows left after the removal is stored in the module-level
    ``cleaning_summary_format`` dict under ``'null_<column_name>_count'``.

    Rows are selected by null-ness directly rather than by position: the
    positions returned by ``np.where`` only coincide with the index labels
    expected by ``DataFrame.drop`` while the index is still an untouched
    0..n-1 range, which stops being true after the first filtering step.

    Parameters
    ----------
    ads_df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Column whose null entries mark the rows to discard.

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the original index labels of surviving rows.
    """
    ads_df = ads_df.dropna(subset=[column_name])
    cleaning_summary_format['null_' + column_name + '_count'] = len(ads_df)
    return ads_df

In [125]:
# Remove rows with null ad_creation_date
ads_df = remove_nulls(ads_df, 'ad_creation_date')

# Remove rows with null ad_spend
ads_df = remove_nulls(ads_df, 'ad_spend')

# Remove rows with null ad_targeting_age
ads_df = remove_nulls(ads_df, 'ad_targeting_age')

# Remove rows with null ad_impressions
ads_df = remove_nulls(ads_df, 'ad_impressions')

# Remove rows with null ad_clicks
ads_df = remove_nulls(ads_df, 'ad_clicks')

# Columns we will not be using
columns_to_remove = ['Unnamed: 0', 'ad_id', 'ad_text', 'ad_landing_page', 'ad_targeting_location', 'ad_targeting_custom_audience', 'ad_targeting_excluded_connections', 'ad_targeting_language', 'ad_targeting_placements']
ads_df = ads_df.drop(columns=columns_to_remove)

# Report the size of the dataset after each removal step.
# NOTE: every figure below comes from len(ads_df), i.e. it is a ROW count even
# though the message (and the summary keys) say "columns".
print('''Before cleaning our dataset had {columns_before_cleaning} columns.
After removing rows with null creation dates: {null_ad_creation_date_count} columns.
After removing rows with null ad spending: {null_ad_spend_count} columns.
After removing rows with null ad targeting age: {null_ad_targeting_age_count} columns.
After removing rows with null ad impressions: {null_ad_impressions_count} columns.
After removing rows with null ad clicks: {null_ad_clicks_count} columns.'''.format(**cleaning_summary_format))

Before cleaning our dataset had 3517 columns.
After removing rows with null creation dates: 3497 columns.
After removing rows with null ad spending: 3497 columns.
After removing rows with null ad targeting age: 3497 columns.
After removing rows with null ad impressions: 3497 columns.
After removing rows with null ad clicks: 3497 columns.


In [126]:
ads_df.count()

file_name                        3497
ad_targeting_interests            866
ad_targeting_age                 3497
ad_targeting_people_who_match    2290
ad_impressions                   3497
ad_clicks                        3497
ad_spend                         3497
ad_creation_date                 3497
ad_end_date                      2359
dtype: int64

### Cleaning ad_age

First let's check what are the values for the field:

In [127]:
ads_df.ad_targeting_age.value_counts().index

Index(['18 - 65+', '16 - 65+', '18 - 54', '18 - 45', '18 - 65+ Gender: Male',
       '18 - 51', '14 - 40', '13 - 65+', '18 - 50', '21 - 65+', '35 - 65+',
       '14 - 65+', '13 - 30 Gender: Male', '15 - 25', '15 - 25 Gender: Female',
       '18 - 59', '16 - 45', '19 - 30', '13 - 40', '13 - 45', '15 - 30',
       '16 - 53', '14 - 54', '25 - 65+', '13 - 35', '14 - 40 Gender: Female',
       '45 - 64 Gender: Male', '14 - 17 Gender: Female',
       '17 - 65+ Gender: Male', '16 - 25', '25 - 65+ Gender: Male', '24 - 45',
       '18 - 60', '18 65+', '15 - 40', '16 - 40', '15 - 54', '30 - 65+',
       '16 - 55', '13 - 50', '18 - 48', '13 - 44', '20 - 45', '45 - 65+',
       '18 - 65+ Gender: Female', '18 - 39', '13 - 60', '24 - 65+', '18 - 55',
       '16 - 60', '15 - 65+', '20 - 65+', '18 - 61', '18- 51', '16 - 54',
       '18 - 24', '15 - 26', '17 - 61', '18 - 43', '16 - 50', '13 - 42',
       '13 - 27', '40 - 65+', '18 - 53', '15 - 25 Gender: Male', '18 - 40',
       '14 - 36', '14 - 50', '

Let's simplify this bucketing by removing gender information. To do so let's crop the string at 8 characters.

In [128]:
# Keep only the age range: the first 8 characters cover the longest form
# ('18 - 65+'). Shorter ranges such as '18 - 54 Gender: Male' would otherwise
# keep a trailing space ('18 - 54 ') and be counted as a separate bucket from
# '18 - 54', so the cropped value is stripped as well.
ads_df.ad_targeting_age = ads_df.ad_targeting_age.apply(lambda s: s[:8].strip())

# Number of ads per age range, most frequent first.
count_table = ads_df.ad_targeting_age.value_counts().to_frame()
count_table.columns = ['Ad count']
count_table.head(5)

Unnamed: 0,Ad count
18 - 65+,2347
16 - 65+,356
18 - 54,123
18 - 45,71
18 - 51,48


As per this table, almost all ads targeted voting-age Facebook users (18+). Bucketing the ads by age group would therefore not yield significant differences, so we drop the column.

In [129]:
ads_df = ads_df.drop(columns=['ad_targeting_age'])

### Cleaning ad_impressions and ad_clicks

Both of these columns are numerical and do not contain None or NaN values. From the Oxford study and the multi-university studies, we know that ads without impressions or clicks were unlikely to have been shown to users.

* We will first be parsing these fields that sometimes use . or , to separate thousands.
* We will then remove 0 values.
* We will report the number of removed rows.

In [130]:
def format_string_to_integer(string):
    """Convert a loosely formatted count such as ``'10.496'`` to an int.

    Every comma, dot, semicolon and space is deleted (they are all treated as
    thousands separators or stray characters) and the letters ``o``/``O`` are
    read as the digit zero before the remaining text is handed to ``int``.

    NOTE(review): because dots are always deleted, a value written as '1.0'
    becomes 10 -- confirm the raw data never uses a decimal point here.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid integer literal.
    """
    cleanup_table = str.maketrans({
        ',': None,
        '.': None,
        ';': None,
        ' ': None,
        'o': '0',
        'O': '0',
    })
    return int(string.translate(cleanup_table))

def string_to_integer(ads_df, column_name):
    """Parse every value of ``column_name`` with ``format_string_to_integer``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    parsed_values = ads_df[column_name].apply(format_string_to_integer)
    ads_df[column_name] = parsed_values
    return ads_df

def remove_zeros(ads_df, column_name):
    """Return the rows of ``ads_df`` whose ``column_name`` differs from zero.

    The number of rows kept is stored in the module-level
    ``cleaning_summary_format`` dict under ``'zeros_<column_name>_count'``.
    The frame passed in is left untouched.
    """
    is_non_zero = ads_df[column_name].ne(0)
    kept_rows = ads_df.loc[is_non_zero]
    summary_key = 'zeros_' + column_name + '_count'
    cleaning_summary_format[summary_key] = len(kept_rows)
    return kept_rows

In [131]:
# How many rows do we have before removing zeros
# (the key says "columns", but len(ads_df) counts rows)
cleaning_summary_format['columns_before_zeros'] = len(ads_df)

# Conversion to integers
ads_df = string_to_integer(ads_df, 'ad_clicks')
ads_df = string_to_integer(ads_df, 'ad_impressions')

# Removing zeros values
ads_df = remove_zeros(ads_df, 'ad_impressions')
ads_df = remove_zeros(ads_df, 'ad_clicks')

# Reporting (the figures are row counts, although the message says "columns")
print('''Before cleaning zeros our dataset had {columns_before_zeros} columns.
After removing rows with zero ad impressions: {zeros_ad_impressions_count} columns.
After removing rows with zero ad clicks: {zeros_ad_clicks_count} columns.'''.format(**cleaning_summary_format))

Before cleaning zeros our dataset had 3497 columns.
After removing rows with zero ad impressions: 2588 columns.
After removing rows with zero ad clicks: 2450 columns.


### Parsing creation date and end date

Creation date and end date are written in a complex format: 04/13/16 07:48:33 AM PDT. Our analysis only requires a subset of this information, the date. In this section we will extract the first 8 characters xx/xx/xx and convert them to a datetime object. Let's take a look at the entries:

In [132]:
ads_df['ad_creation_date']

1        04/21/17 07:42:45 AM PDT
2        04/13/17 04:40:03 AM PDT
5        05/29/17 12:29:07 AM PDT
6       08/18/16 08:31 :16 AM PDT
7        05/21/17 07:27:26 AM PDT
                  ...            
3511      04/13/17 0&14:34 AM PDT
3512    11 /18/16 08:22:22 AM PST
3513     01/24/17 06:08:31 AM PST
3515     02/07/17 04:30:54 AM PST
3516     02/22/17 06:49:48 AM PST
Name: ad_creation_date, Length: 2450, dtype: object

We find that the first few characters sometimes contain spaces. We write a regular expression that tolerates them and remove the whitespace inside a parsing function. We also need to expand the year to 4 digits for later date parsing.

In [133]:
# Two-digit month / day / year, tolerating whitespace between any characters.
date_regex = re.compile(r'(?P<date>\d\s*\d\s*\/\s*\d\s*\d\s*\/\s*\d\s*\d)')

def extract_date_from_string(string):
    """Extract the ``mm/dd/yy`` part of a raw timestamp as ``'mm/dd/20yy'``.

    Parameters
    ----------
    string : str or null
        Raw value such as ``'04/13/16 07:48:33 AM PDT'``; stray whitespace
        inside the date part (``'11 /18/16 ...'``) is tolerated.

    Returns
    -------
    str or float
        * ``np.nan`` when ``string`` is null,
        * ``'parse_error'`` when no date can be found,
        * otherwise the date with a four-digit year, e.g. ``'04/13/2016'``.
    """
    if pd.isnull(string):
        # Null values stay null so they can be told apart from parse errors.
        return np.nan

    match = date_regex.search(string)
    if match is None:
        return 'parse_error'

    # The pattern accepts any whitespace (\s), so all of it must be removed --
    # not only plain spaces -- otherwise the fixed offset below is wrong.
    compact = re.sub(r'\s+', '', match.group('date'))

    # 'mm/dd/' is 6 characters long; prefix the century: 01/01/17 -> 01/01/2017
    # (assumes every year belongs to the 2000s -- TODO confirm for new data).
    return compact[:6] + '20' + compact[6:]

We apply the function to every row and create a new column: 'ad_creation_date_parsed'

In [134]:
ads_df['ad_creation_date_parsed'] = ads_df.ad_creation_date.apply(extract_date_from_string)

We check how many dates could not be parsed:

In [135]:
(ads_df['ad_creation_date_parsed'] == 'parse_error').sum()

1

Since only one date could not be parsed we validate its value:

In [136]:
row = ads_df[ads_df['ad_creation_date_parsed'] == 'parse_error']
row

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date,ad_creation_date_parsed
1340,P(1)0005330.txt,"Veterans, United States Department of Veterans...",,15,2,50.06 RUB,02/21117 01:13:46 AM PST,,parse_error


In this case the date should be 02/21/2017, we will replace it manually.

In [137]:
ads_df.loc[row.index, 'ad_creation_date_parsed'] = '02/21/2017'
ads_df.loc[row.index]

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date,ad_creation_date_parsed
1340,P(1)0005330.txt,"Veterans, United States Department of Veterans...",,15,2,50.06 RUB,02/21117 01:13:46 AM PST,,02/21/2017


In [138]:
ads_df['ad_creation_date'] = ads_df['ad_creation_date_parsed']
ads_df = ads_df.drop(columns=['ad_creation_date_parsed'])
ads_df.head(5)

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
1,P(1)0002823.txt,,"Interests: Pan-Africanism, African-American Ci...",10496,1823,200.00 RUB,04/21/2017,04/22/17 11:00:00 AM PDT
2,P(1)0002837.txt,,"Interests: Pan-Africanism, African-American Ci...",16305,1337,499.49 RUB,04/13/2017,04/14/17 12:00:00 PM PDT
5,P(1)0006304.txt,,"Interests: Martin Luther King, Jr., Stop Racis...",8210,1788,"1,570.03 RUB",05/29/2017,05/29/17 04:00:07 AM PDT
6,P(1)0000013.txt,,"Interests: Confederate Flag, Flags of the Conf...",12727,375,"3, 106.28 RUB",08/18/2016,
7,P(1)0006462.txt,,"Interests: Black nationalism, Pan-Africanism, ...",116540,15153,600.00 RUB,05/21/2017,05/22/17 07:27:26 AM PDT


We now execute the same steps for the end date.

In [139]:
ads_df['ad_end_date']

1        04/22/17 11:00:00 AM PDT
2        04/14/17 12:00:00 PM PDT
5        05/29/17 04:00:07 AM PDT
6                             NaN
7        05/22/17 07:27:26 AM PDT
                  ...            
3511     04/14/17 08-00-00 AM PDT
3512    11 /19/16 08:22:22 AM PST
3513     01/25/17 06:08:31 AM PST
3515     02/08/17 04:30:54 AM PST
3516     02/23/17 06:49:48 AM PST
Name: ad_end_date, Length: 2450, dtype: object

In [140]:
ads_df['ad_end_date_parsed'] = ads_df.ad_end_date.apply(extract_date_from_string)

We check how many dates could not be parsed:

In [141]:
(ads_df['ad_end_date_parsed'] == 'parse_error').sum()

0

In [142]:
ads_df['ad_end_date'] = ads_df['ad_end_date_parsed']
ads_df = ads_df.drop(columns=['ad_end_date_parsed'])
ads_df.head(5)

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
1,P(1)0002823.txt,,"Interests: Pan-Africanism, African-American Ci...",10496,1823,200.00 RUB,04/21/2017,04/22/2017
2,P(1)0002837.txt,,"Interests: Pan-Africanism, African-American Ci...",16305,1337,499.49 RUB,04/13/2017,04/14/2017
5,P(1)0006304.txt,,"Interests: Martin Luther King, Jr., Stop Racis...",8210,1788,"1,570.03 RUB",05/29/2017,05/29/2017
6,P(1)0000013.txt,,"Interests: Confederate Flag, Flags of the Conf...",12727,375,"3, 106.28 RUB",08/18/2016,
7,P(1)0006462.txt,,"Interests: Black nationalism, Pan-Africanism, ...",116540,15153,600.00 RUB,05/21/2017,05/22/2017


### Parsing ad_spend

Sometimes the ad_spend field contains spaces, dots instead of commas to separate thousands, and the 'RUB' currency shorthand. We use a regular expression to extract the amount from the ad_spend field. We then convert the string to a float. 

We first write a function to do the parsing.

In [143]:
ads_df['ad_spend']

1          200.00 RUB
2          499.49 RUB
5        1,570.03 RUB
6       3, 106.28 RUB
7          600.00 RUB
            ...      
3511       199.96 RUB
3512       500.00 RUB
3513       401.61 RUB
3515       300.00 RUB
3516       500.00 RUB
Name: ad_spend, Length: 2450, dtype: object

In [144]:
# Groups of 1-3 digits optionally followed by '.' or ',', ending in two digits.
amount_regex = re.compile(r'(?P<amount>([0-9]{1,3}(\.|,)?)+(\.|,)?[0-9]{2})')

def extract_amount_from_string(string):
    """Extract the amount of an ad_spend value as a ``'1234.56'`` string.

    Whitespace is removed *before* matching: the pattern itself accepts no
    whitespace, so a value such as ``'3, 106.28 RUB'`` would otherwise match
    only its tail and be silently read as ``106.28``.

    Parameters
    ----------
    string : str or null
        Raw value such as ``'1,570.03 RUB'``. The literal text ``'None'`` is
        treated like a null value.

    Returns
    -------
    str or float
        * ``np.nan`` when ``string`` is null or ``'None'``,
        * ``'parse_error'`` when no amount can be found,
        * otherwise the digits of the amount with a single dot placed before
          the last two digits, ready for ``float()``.
    """
    if pd.isnull(string) or string == 'None':
        # Null values stay null so they can be told apart from parse errors.
        return np.nan

    match = amount_regex.search(re.sub(r'\s+', '', string))
    if match is None:
        return 'parse_error'

    # Dots and commas are used interchangeably as separators in the raw data;
    # drop them all and re-insert the decimal point two digits from the end.
    digits = match.group('amount').replace('.', '').replace(',', '')
    return digits[:-2] + '.' + digits[-2:]

We run the function over our dataset and output the number of parsing errors we've encountered.

In [145]:
ads_df['ad_spend_parsed'] = ads_df.ad_spend.apply(extract_amount_from_string)
(ads_df['ad_spend_parsed'] == 'parse_error').sum()

0

We validate nan values and remove them from the dataset.

In [146]:
print('There are a total of ' + str(pd.isnull(ads_df['ad_spend_parsed']).sum()) + ' nan values.')

There are a total of 8 nan values.


In [147]:
ads_df[pd.isnull(ads_df['ad_spend_parsed'])]

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date,ad_spend_parsed
174,P(1)0006107.txt,,Interests: Music or Rock music And Must Also M...,281,1,,05/10/2016,,
604,P(1)0006115.txt,,Interests: Music or Rock music And Must Also M...,327,1,,05/10/2016,,
934,P(1)0006011.txt,,Behaviors: Facebook access (browser): Chrome A...,404,1,,05/12/2016,,
1137,P(1)0003195.txt,,Interests: Immigration or Conservatism,100,2,,01/14/2016,,
1887,P(1)0003175.txt,,,277,5,,03/10/2016,,
2772,P(1)0005995.txt,,Behaviors: Facebook access (browser): Chrome A...,277,1,,05/12/2016,,
3013,P(1)0006045.txt,,Behaviors: Facebook access (browser): Chrome A...,251,1,,05/12/2016,,
3313,P(1)0001473.txt,,"Interests: BlackNews.com, Black (Color) or Huf...",52,1,,01/14/2016,,


In [148]:
# Keep only the rows whose spend could be parsed, then let the parsed value
# replace the raw one. Written as a single chain so that no column is assigned
# on a filtered frame (which triggers pandas' SettingWithCopyWarning).
ads_df = (
    ads_df[~pd.isnull(ads_df['ad_spend_parsed'])]
    .assign(ad_spend=lambda frame: frame['ad_spend_parsed'])
    .drop(columns=['ad_spend_parsed'])
)

We transform the ad_spend field into a float.

In [149]:
ads_df['ad_spend'] = ads_df['ad_spend'].astype(float)

We validate that all values are positive and remove other values after validation.

In [150]:
print('There are ' + str((ads_df['ad_spend'] > 0).sum()) + ' positive values and a total of ' + str(len(ads_df)) + ' entries.')

There are 2440 positive values and a total of 2442 entries.


In [151]:
ads_df[ads_df['ad_spend'] <= 0]

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
843,P(1)0001447.txt,,"Interests: Black Tea Patriots, Black Knowledge...",39,1,0.0,01/14/2016,
2109,P(1)0001588.txt,,Interests: National Museum of American History...,47,1,0.0,01/14/2016,


We remove the two entries with values equal to zero.

In [152]:
ads_df = ads_df[ads_df['ad_spend'] > 0]
len(ads_df)

2440

### Parsing ad_targeting_interests & ad_targeting_people_who_match

The interest information is split between the ad_targeting_interests column and the ad_targeting_people_who_match column. To make the treatment of this information simpler, our first step will be to extract the 'interest' portion of ad_targeting_people_who_match. We will then parse the ad_targeting_interests column.

In [153]:
ads_df.count()

file_name                        2440
ad_targeting_interests            637
ad_targeting_people_who_match    1618
ad_impressions                   2440
ad_clicks                        2440
ad_spend                         2440
ad_creation_date                 2440
ad_end_date                      1863
dtype: int64

First we take a look at ad_targeting_people_who_match for entries with and without 'Interests'. We will investigate those without 'Interests' first.

In [154]:
# Classify every ad_targeting_people_who_match value as null, containing the
# word 'Interests', or anything else; the "other" values are printed so they
# can be inspected by eye.
count_null = 0
count_interests = 0
count_other = 0
for s in ads_df['ad_targeting_people_who_match']:
    if pd.isnull(s):
        count_null += 1
    elif 'Interests' in s:
        count_interests += 1
    else:
        count_other +=1
        print(s)
        
# The three categories are exclusive, so their sum is the number of rows.
print('Null:' + str(count_null) +
      ' Interests: ' + str(count_interests) +
      ' Other: ' + str(count_other) +
      ' Total: ' + str(count_null + count_interests + count_other))

People who like Don't Shoot, Friends of connections: Friends of people who are connected to Don't Shoot
People who like Being Patriotic, Friends of connections: Friends of people who are connected to Being Patriotic
People who like United Muslims of America, Friends of connections: Friends of people who are connected to United Muslims of America
People who like Black Matters, Friends of connections: Friends of people who are connected to Black Matters
People who like Black Matters. Friends of connections: Friends of people who are connected to Black Matters
Politics: Likely to engage with political content (liberal)
People who like LGBT United, Friends of connections: Friends of people who are connected to LGBT United
People who like Black Matters, Friends of connections: Friends of people who are connected to Black Matters
People who like LGBT United, Friends of connections: Friends of people who are connected to LGBT United
Behaviors: African American (US)
People who like LGBT United

From this printout, we see that we can grab the value of the like correctly by taking the string after 'Friends of people who are connected to'. We will do so while cropping the strings to interests.

For the rows with 'Interests', after looking at a few of the raw files, we see that ad_targeting_people_who_match sometimes contains other fields. To identify rows which had additional fields we counted the ':' characters; we then identified patterns in those strings that didn't belong to the interests field. These patterns are used in the crop_to_interest function below.

In [155]:
def crop_everything_after(string, contains):
    """Return the part of ``string`` located before the first ``contains``.

    The marker itself is removed as well. When ``contains`` does not occur,
    ``string`` is returned unchanged.
    """
    cut_at = string.find(contains)
    if cut_at == -1:
        return string
    return string[:cut_at]


def treat_string_with_friends(string):
    """Return the text that follows 'Friends of people who are connected to '.

    Raises
    ------
    ValueError
        If that phrase does not occur in ``string``.
    """
    marker = 'Friends of people who are connected to '
    return string[string.index(marker) + len(marker):]

    
def treat_string_with_interest(string):
    """Reduce a targeting description to its 'Interests' field.

    Everything before the first 'Interests' is discarded, then the text is cut
    at each of the known markers that introduce another field, and finally one
    known typo is corrected.

    Raises
    ------
    ValueError
        If ``string`` does not contain 'Interests'.
    """
    # Markers of the fields that may follow the interests; they were
    # identified by visual inspection of the entries.
    following_field_markers = (
        'And Must Also Match',
        'School:',
        'Behaviors:',
        'expansion:',
        'Job title:',
        'Multicultural Affinity:',
        'Politics:',
        'Employers:',
        'Field of study:',
    )

    interests = string[string.index('Interests'):]
    for marker in following_field_markers:
        interests = crop_everything_after(interests, marker)

    # Known typo in the raw data; a no-op when the substring is absent.
    return interests.replace('Stop Racism!:.', 'Stop Racism!!,')

def crop_to_interest(string):
    """Keep only the interest information of a people_who_match value.

    * null values are returned as they are;
    * values containing 'Interests' go through ``treat_string_with_interest``;
    * values containing 'Friends of people who are connected to ' go through
      ``treat_string_with_friends``;
    * any other value carries no interest and becomes ``np.nan``.
    """
    if pd.isnull(string):
        return string
    if 'Interests' in string:
        return treat_string_with_interest(string)
    if 'Friends of people who are connected to ' in string:
        return treat_string_with_friends(string)
    # pd.isnull value for strings
    return np.nan

In [156]:
ads_df['ad_targeting_people_who_match'] = ads_df['ad_targeting_people_who_match'].apply(crop_to_interest)

During this operation we lost the values of a few rows that could not be parsed because they did not contain interests.

In [157]:
print(str(pd.isnull(ads_df['ad_targeting_people_who_match']).sum() - count_null) + ' rows where lost.')

29 rows where lost.


The last cleaning step for this field is to remove the 'Interests' keyword which is sometimes followed by a colon. We use a regular expression to replace this string.

In [168]:
# The 'Interests' keyword, optionally followed by whitespace and a colon.
interests_regex = re.compile(r'Interests\s*:?')

def remove_interests_marker(string):
    """Remove the 'Interests' keyword (and its colon) from ``string``.

    The result is stripped: removing 'Interests:' from
    'Interests: Pan-Africanism' would otherwise leave a leading space, making
    the value differ from the same interest coming from the other column.
    Null values are returned unchanged.
    """
    if pd.isnull(string):
        return string
    return interests_regex.sub('', string).strip()

In [169]:
ads_df['ad_targeting_people_who_match'] = ads_df['ad_targeting_people_who_match'].apply(remove_interests_marker)

In [170]:
ads_df.head(3)

Unnamed: 0,file_name,ad_targeting_interests,ad_targeting_people_who_match,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
1,P(1)0002823.txt,,"Pan-Africanism, African-American Civil Rights...",10496,1823,200.0,04/21/2017,04/22/2017
2,P(1)0002837.txt,,"Pan-Africanism, African-American Civil Rights...",16305,1337,499.49,04/13/2017,04/14/2017
5,P(1)0006304.txt,,"Martin Luther King, Jr., Stop Racism!!, Afric...",8210,1788,1570.03,05/29/2017,05/29/2017


We now do the same exercise with ad_targeting_interests. We first identify non-null rows that may contain an extra field. We do so by looking for the ':' character and printing out these rows.

In [171]:
non_null_interests = ads_df[~pd.isnull(ads_df['ad_targeting_interests'])]['ad_targeting_interests']

for row_with_colon in non_null_interests[non_null_interests.str.contains(':')]:
    print(row_with_colon)

BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
Humanitarianism, Human rights or Humanitarian aid Behaviors: African American (US)
Black Power Behaviors: Multicultural Affinity: African American (US)
Human rights or Malcolm X Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
BlackNews.com or HuffPost Black Voices Behaviors: African American (US)
Humanitarianism, Human rights or Humanitarian aid Behaviors: African American (US)
Muslims Are Not Terrorists. Islamism or Muslim Brotherhood Connections: People who like United Muslims of America
History Politics

Most of these rows seem to contain an additional field, "Behaviors"; we will remove it from the rows.

In [172]:
def treat_interest(string):
    """Strip the extra fields that follow the interests in ``string``.

    The text is cut at each of the known markers that introduce another
    field. The result is stripped, because cutting 'Black Power Behaviors: ...'
    at the marker would otherwise leave a trailing space ('Black Power ').

    Returns
    -------
    str or float
        The cleaned interests, or ``np.nan`` when ``string`` is null.
    """
    if pd.isnull(string):
        # pd.isnull value for strings
        return np.nan

    # Strings identified by visual inspections of entries
    crop_after = [
        'And Must Also Match',
        'School:',
        'Behaviors:',
        'expansion:',
        'Job title:',
        'Multicultural Affinity:',
        'Politics:',
        'Employers:',
        'Field of study:',
        'Connections:',
        'Home Composition:'
    ]

    for to_crop in crop_after:
        string = crop_everything_after(string, to_crop)

    return string.strip()

ads_df['ad_targeting_interests'] = ads_df['ad_targeting_interests'].apply(treat_interest)

In [173]:
non_null_interests = ads_df[~pd.isnull(ads_df['ad_targeting_interests'])]['ad_targeting_interests']
print('After treatment, there are ' + str(non_null_interests[non_null_interests.str.contains(':')].count()) +' rows with more than one field.')

After treatment, there are 0 rows with more than one field.


We can now merge the two columns into one. First let's verify that there are no rows where both columns are non-nan or both nan.

In [174]:
def interests_both_nan(row):
    """Tell whether neither interest column of ``row`` holds a value."""
    has_any_value = (
        pd.notnull(row.ad_targeting_interests)
        or pd.notnull(row.ad_targeting_people_who_match)
    )
    return not has_any_value

def interests_both_non_nan(row):
    """Tell whether both interest columns of ``row`` hold a value."""
    return (
        pd.notnull(row.ad_targeting_interests)
        and pd.notnull(row.ad_targeting_people_who_match)
    )
    
# How many rows have both columns as nan        
both_nan = ads_df.apply(interests_both_nan, axis=1).sum()

# How many rows have both columns populated
non_nan_count = ads_df.apply(interests_both_non_nan, axis=1).sum()

print('We have a total of ' + str(both_nan) + ' rows with both columns nan and a total of ' + str(non_nan_count) + ' rows which have both values set.')

We have a total of 214 rows with both columns nan and a total of 0 rows which have both values set.


Once again we drop rows, this time as they do not contain interests information. We will merge the other rows by replacing the values of ad_targeting_interests with ad_targeting_people_who_match.

In [175]:
def merge_interests(row):
    """Pick the interests of ``row`` from whichever column holds them.

    ``ad_targeting_interests`` wins when it is not null; otherwise the value
    of ``ad_targeting_people_who_match`` is returned (which may be null too).
    """
    own_interests = row.ad_targeting_interests
    if pd.isnull(own_interests):
        return row.ad_targeting_people_who_match
    return own_interests

# Merge interests: keep ad_targeting_interests where set, otherwise fall back
# to the interests extracted from ad_targeting_people_who_match
ads_df['ad_targeting_interests'] = ads_df.apply(merge_interests, axis=1)

# Drop 'ad_targeting_people_who_match'
ads_df = ads_df.drop(columns=['ad_targeting_people_who_match'])

# Drop the rows that still have no interests after the merge
ads_df = ads_df[(~pd.isnull(ads_df['ad_targeting_interests']))]

In [176]:
ads_df.head(3)

Unnamed: 0,file_name,ad_targeting_interests,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
1,P(1)0002823.txt,"Pan-Africanism, African-American Civil Rights...",10496,1823,200.0,04/21/2017,04/22/2017
2,P(1)0002837.txt,"Pan-Africanism, African-American Civil Rights...",16305,1337,499.49,04/13/2017,04/14/2017
5,P(1)0006304.txt,"Martin Luther King, Jr., Stop Racism!!, Afric...",8210,1788,1570.03,05/29/2017,05/29/2017


### Last cleanup & writing to file

In this final section, we will parse the date columns into date objects so that pandas can read them into date objects directly from the csv file.

In [190]:
# pd.to_datetime is vectorized: convert the whole column in one call instead
# of calling it once per row through apply.
ads_df.ad_creation_date = pd.to_datetime(ads_df.ad_creation_date, format='%m/%d/%Y')

In [191]:
# pd.to_datetime is vectorized: convert the whole column in one call instead
# of calling it once per row through apply. Missing end dates become NaT.
ads_df.ad_end_date = pd.to_datetime(ads_df.ad_end_date, format='%m/%d/%Y')

In [192]:
ads_df.head(3)

Unnamed: 0,file_name,ad_targeting_interests,ad_impressions,ad_clicks,ad_spend,ad_creation_date,ad_end_date
1,P(1)0002823.txt,"Pan-Africanism, African-American Civil Rights...",10496,1823,200.0,2017-04-21,2017-04-22
2,P(1)0002837.txt,"Pan-Africanism, African-American Civil Rights...",16305,1337,499.49,2017-04-13,2017-04-14
5,P(1)0006304.txt,"Martin Luther King, Jr., Stop Racism!!, Afric...",8210,1788,1570.03,2017-05-29,2017-05-29


In [193]:
ads_df.to_csv('../clean_data/clean_data.csv', index=None, header=True)

## Data cleanup summary

We have lost rows for various reasons during the cleaning:

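* 20 rows had no creation date (3517 → 3497 rows).
* 909 rows had zero impressions and a further 138 rows had zero clicks (3497 → 2450 rows).
* 8 rows had no ad spend and 2 rows had an ad spend of zero (2450 → 2440 rows).
* 214 rows had no interest information in either targeting column.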

Finally, we have decided to keep the following fields:

| Field name             | Type     | Description                         |
|------------------------|----------|-------------------------------------|
| ad_targeting_interests | string   | Interests used to target users      |
| ad_impressions         | int      | Number of users who saw the ads     |
| ad_clicks              | int      | Number of times the ads was clicked |
| ad_spend               | float    | Money spent on the ad in RUB        |
| ad_creation_date       | datetime | Creation date of the ad             |
| ad_end_date            | datetime | Date at which the ad stopped        |