# Import data 

In [1]:
# Imports
import numpy as np
import pandas as pd

In [2]:
# Load the excel file(s) as GA_data.
GA_data = pd.read_excel("Change 2022_GA writeback_091122.xlsx", sheet_name=2)

# View the DataFrame.
GA_data.head()

Unnamed: 0,"City, Country",Audience,Campaign,Date,Platform,Ad Format,Creative - Family,Creative - Version,Total Sessions,Days away from max date,Latest report?
0,"Ashburn, United States",General Targetting,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1.0,84.0,0
1,"Mebane, United States",4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1.0,137.0,0
2,"Chicago, United States",2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2.0,148.0,0
3,"South Jordan, United States",1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1.0,59.0,0
4,"Potsdam, United States",4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1.0,1.0,1


# Understaning data

Information about the data 

Country: country of advertisement

Audience: audience type as specified in the meta data see here for more info:

- Audience 1: Registered Loan Officers from Registered Brokerage, active in the last 120 days.
- Audience 2: Registered Loan Officers from Registered Brokerage, not active in the last 120 days.
- Audience 3: Registered Loan Officers from Registered Brokerage, who have never registered a loan.
- Audience 4: Non-Registered Loan Officers from Registered Brokerages.
- Audience 5: Retargeted audience. Non-Registered Loan Officers who recently visited the website (last 7 days).
- Audience 6: General Targeting. Brokers on the targeting list who are not registered and not from registered brokers.

Campaign: campaign type 

Date: Date

Platform: Platform used

Ad format: format of ad 

Creative - Family: Ad design 

Creative - Version: Ad content 

Total sessions - Number of times seen

Days away from max date - how early in the campaign was this seen? 

Latest report - Feedback?



# Cleaning

## Renamming columns

In [3]:
# Rename the column headers.
GA_data_rename = GA_data.rename(
    columns={
        "City, Country": "Location",
        "Ad Format": "Ad_Format",
        "Creative - Family": "Creative_Family",
        "Creative - Version": "Creative_Version",
        "Total Sessions": "Total_Sessions",
        "Days away from max date": "Days_Max_Date",
        "Latest report?": "Latest_Report"})

# Sense check
GA_data_rename.head()

Unnamed: 0,Location,Audience,Campaign,Date,Platform,Ad_Format,Creative_Family,Creative_Version,Total_Sessions,Days_Max_Date,Latest_Report
0,"Ashburn, United States",General Targetting,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1.0,84.0,0
1,"Mebane, United States",4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1.0,137.0,0
2,"Chicago, United States",2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2.0,148.0,0
3,"South Jordan, United States",1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1.0,59.0,0
4,"Potsdam, United States",4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1.0,1.0,1


## Sense Check Data

In [4]:
# Explore the data.
GA_data_rename.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47529 entries, 0 to 47528
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Location          47529 non-null  object        
 1   Audience          47140 non-null  object        
 2   Campaign          47529 non-null  object        
 3   Date              47529 non-null  datetime64[ns]
 4   Platform          47528 non-null  object        
 5   Ad_Format         29519 non-null  object        
 6   Creative_Family   47529 non-null  object        
 7   Creative_Version  47529 non-null  object        
 8   Total_Sessions    47529 non-null  float64       
 9   Days_Max_Date     47529 non-null  float64       
 10  Latest_Report     47529 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 4.0+ MB


In [5]:
# Descriptive statistics.
GA_data_rename.describe()

Unnamed: 0,Total_Sessions,Days_Max_Date,Latest_Report
count,47529.0,47529.0,47529.0
mean,1.259968,76.467104,0.186434
std,0.969314,60.046171,0.38946
min,1.0,0.0,0.0
25%,1.0,20.0,0.0
50%,1.0,62.0,0.0
75%,1.0,129.0,0.0
max,21.0,213.0,1.0


## Replace the missing values with NaN

In [6]:
# Any missing values?
GA_data_rename.isna().sum()

Location                0
Audience              389
Campaign                0
Date                    0
Platform                1
Ad_Format           18010
Creative_Family         0
Creative_Version        0
Total_Sessions          0
Days_Max_Date           0
Latest_Report           0
dtype: int64

In [7]:
# Replacing blank values with NaN
GA_data_replace_na = GA_data_rename.replace(r'^\s*$', np.nan, regex=True)
GA_data_replace_na.head()

Unnamed: 0,Location,Audience,Campaign,Date,Platform,Ad_Format,Creative_Family,Creative_Version,Total_Sessions,Days_Max_Date,Latest_Report
0,"Ashburn, United States",General Targetting,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1.0,84.0,0
1,"Mebane, United States",4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1.0,137.0,0
2,"Chicago, United States",2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2.0,148.0,0
3,"South Jordan, United States",1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1.0,59.0,0
4,"Potsdam, United States",4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1.0,1.0,1


I wanted to do this because I think even though some values are blank in a certain row, other in other in that row could still be useful in the analysis.

## Correting Format

In [8]:
# Change the date to date
GA_data_replace_na['Date'] = pd.to_datetime(GA_data_replace_na['Date'])

In [9]:
# Necessary to change these columns to integers
GA_data_replace_na['Total_Sessions'] = GA_data_replace_na['Total_Sessions'].fillna(0)
GA_data_replace_na['Days_Max_Date'] = GA_data_replace_na['Days_Max_Date'].fillna(0)
GA_data_replace_na['Latest_Report'] = GA_data_replace_na['Latest_Report'].fillna(0)

# Changing type to integer
GA_data_replace_na['Total_Sessions'] = GA_data_replace_na['Total_Sessions'].astype('int')
GA_data_replace_na['Days_Max_Date'] = GA_data_replace_na['Days_Max_Date'].astype('int')
GA_data_replace_na['Latest_Report'] = GA_data_replace_na['Latest_Report'].astype('int')

In [10]:
# Ensure changes have been completed
GA_data_replace_na.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47529 entries, 0 to 47528
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Location          47529 non-null  object        
 1   Audience          47140 non-null  object        
 2   Campaign          47529 non-null  object        
 3   Date              47529 non-null  datetime64[ns]
 4   Platform          47528 non-null  object        
 5   Ad_Format         29519 non-null  object        
 6   Creative_Family   47529 non-null  object        
 7   Creative_Version  47529 non-null  object        
 8   Total_Sessions    47529 non-null  int32         
 9   Days_Max_Date     47529 non-null  int32         
 10  Latest_Report     47529 non-null  int32         
dtypes: datetime64[ns](1), int32(3), object(7)
memory usage: 3.4+ MB


## Remove United States from Location Column

In [11]:
# Remove United states from the Location column
GA_data_replace_na['Location'] = GA_data_replace_na['Location'].str.replace('United States', '')
GA_data_replace_na['Location'] = GA_data_replace_na['Location'].str.replace(',', '')

GA_data_replace_na


Unnamed: 0,Location,Audience,Campaign,Date,Platform,Ad_Format,Creative_Family,Creative_Version,Total_Sessions,Days_Max_Date,Latest_Report
0,Ashburn,General Targetting,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1,84,0
1,Mebane,4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1,137,0
2,Chicago,2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2,148,0
3,South Jordan,1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1,59,0
4,Potsdam,4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
47524,Pearland,4,FY23_broker_campaign,2022-10-18,Domain Display,,UnfairAdvantage,OnePage,1,13,1
47525,Springfield,4,FY23_broker_campaign,2022-06-10,Domain Display,,UnfairAdvantage,OnePage,1,143,0
47526,Boston,5,FY23_broker_campaign,2022-10-21,Domain Display,Single image,UnfairAdvantage,1099.0,1,10,1
47527,Cleveland,5,FY22_broker_campaign_ph2,2022-05-20,LinkedIn,Single image,UnfairAdvantage,1page,1,164,0


## Change general setting to 6 as specified in the meta data set

In [12]:
# Change general setting to 6 
GA_data_replace_na['Audience'] = GA_data_replace_na['Audience'].replace(['General Targetting'], '6')
GA_data_replace_na

Unnamed: 0,Location,Audience,Campaign,Date,Platform,Ad_Format,Creative_Family,Creative_Version,Total_Sessions,Days_Max_Date,Latest_Report
0,Ashburn,6,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1,84,0
1,Mebane,4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1,137,0
2,Chicago,2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2,148,0
3,South Jordan,1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1,59,0
4,Potsdam,4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
47524,Pearland,4,FY23_broker_campaign,2022-10-18,Domain Display,,UnfairAdvantage,OnePage,1,13,1
47525,Springfield,4,FY23_broker_campaign,2022-06-10,Domain Display,,UnfairAdvantage,OnePage,1,143,0
47526,Boston,5,FY23_broker_campaign,2022-10-21,Domain Display,Single image,UnfairAdvantage,1099.0,1,10,1
47527,Cleveland,5,FY22_broker_campaign_ph2,2022-05-20,LinkedIn,Single image,UnfairAdvantage,1page,1,164,0


 ## Checking each column for unique values

In [13]:
# Create function to check unique values in 
def col_list(df, para):
    print("List of values in <", para ,"> : \n",df[para].unique(),"\n")
    return

First checking the audience types

In [14]:
# Checking unique values in audience column 
col_list(GA_data_replace_na, 'Audience')

List of values in < Audience > : 
 ['6' '4' '2' '1' '5' '3' nan 'fivedisparate'] 



In [15]:
# I had to remove this row as it had corrupted information 
#GA_data_replace_na.drop(GA_data_replace_na[GA_data_replace_na["Audience"] == 22270].index, inplace = True)

# trying this 
new_GA = GA_data_replace_na[GA_data_replace_na.Audience != 'fivedisparate']

In [16]:
# Changing NA values in audience column to 0
new_GA[["Audience"]] = new_GA[["Audience"]].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [17]:
# Checking unique values in audience column 
col_list(new_GA, 'Audience')

List of values in < Audience > : 
 ['6' '4' '2' '1' '5' '3' 0] 



Now checking other columns for unique values

In [18]:
col_list(new_GA, 'Location')

List of values in < Location > : 
 ['Ashburn ' 'Mebane ' 'Chicago ' ... 'Feasterville-Trevose ' 'Glenshaw '
 'Kerhonkson '] 



In [19]:
col_list(new_GA, 'Campaign')

List of values in < Campaign > : 
 ['FY23_change_digital_phase3' 'FY23_broker_campaign' 'Brand_Exact'
 'FY22_broker_campaign_ph2' 'NB_Wholesale_Phrase' 'Brand_Phrase'
 'FY22_broker_campaign' 'NB_Wholesale_Exact' 'FY22_broker_campaign_ph6'
 'FY22_broker_campaign_ph3'] 



In [20]:
col_list(new_GA, 'Platform')

List of values in < Platform > : 
 ['Trade Media' 'User ID Display' 'Domain Display' 'Google SEM' 'LinkedIn'
 'Facebook' nan] 



In [21]:
col_list(new_GA, 'Ad_Format')

List of values in < Ad_Format > : 
 ['Inside_mortgage_finance_newsletter' nan 'Single image' 'Video'
 'Animated' 'CPC' 'Carousel' 'Housingwire' 'Scotsman'
 'National_mortgage_news' 'Nmn'] 



In [22]:
col_list(new_GA, 'Creative_Family')

List of values in < Creative_Family > : 
 ['CloserTwins' 'UnfairAdvantage' 'SEM Ads' 'CloseFaster' 'Trade Media Ads'
 'CompetitiveOpportunity' 'newsletter'] 



In [23]:
col_list(new_GA, 'Creative_Version')

List of values in < Creative_Version > : 
 ['RTB' 'OnePage' 'NoDTI' 'MoreLoansAll' 1099.0 'Cutdown1A' 'Animated'
 'Change Wholesale' 'All' '3steps' 'ROS1' 'Faceoff' 'CloseMore' 'MoreAll'
 'FasterAll' 'FasterReg' 'MoreNoReg' 'Competitors' 'Rate Lock' 'Cutdown1B'
 '1page' 'Faceoff1' 'Theycancelweclose' "We Are America's CDFI" 'Namaste'
 'Faceoff2' 'Cancelingyourlock' 'Paperwork' 'ShapeUp' 'EarlyBird'
 'Close More. Close Faster.' 'ROS5' 'Faster' 'interactive' '300x250'
 'MoreLoans' '728x90' 'wallpaper' 'ROS2'] 



In [24]:
col_list(new_GA, 'Total_Sessions')

List of values in < Total_Sessions > : 
 [ 1  2  9  4  3 14  5  8  6  7 13 15 11 17 16 12 10 19 18 21 20] 



In [25]:
new_GA

Unnamed: 0,Location,Audience,Campaign,Date,Platform,Ad_Format,Creative_Family,Creative_Version,Total_Sessions,Days_Max_Date,Latest_Report
0,Ashburn,6,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1,84,0
1,Mebane,4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1,137,0
2,Chicago,2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2,148,0
3,South Jordan,1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1,59,0
4,Potsdam,4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
47524,Pearland,4,FY23_broker_campaign,2022-10-18,Domain Display,,UnfairAdvantage,OnePage,1,13,1
47525,Springfield,4,FY23_broker_campaign,2022-06-10,Domain Display,,UnfairAdvantage,OnePage,1,143,0
47526,Boston,5,FY23_broker_campaign,2022-10-21,Domain Display,Single image,UnfairAdvantage,1099.0,1,10,1
47527,Cleveland,5,FY22_broker_campaign_ph2,2022-05-20,LinkedIn,Single image,UnfairAdvantage,1page,1,164,0


## Create Target Group Column

In [26]:
new_GA['Target_Group'] = new_GA.loc[:, 'Audience']

new_GA.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_GA['Target_Group'] = new_GA.loc[:, 'Audience']


Unnamed: 0,Location,Audience,Campaign,Date,Platform,Ad_Format,Creative_Family,Creative_Version,Total_Sessions,Days_Max_Date,Latest_Report,Target_Group
0,Ashburn,6,FY23_change_digital_phase3,2022-08-08,Trade Media,Inside_mortgage_finance_newsletter,CloserTwins,RTB,1,84,0,6
1,Mebane,4,FY23_broker_campaign,2022-06-16,User ID Display,,UnfairAdvantage,OnePage,1,137,0,4
2,Chicago,2,FY23_broker_campaign,2022-06-05,Domain Display,Single image,UnfairAdvantage,NoDTI,2,148,0,2
3,South Jordan,1,FY23_broker_campaign,2022-09-02,User ID Display,,CloserTwins,MoreLoansAll,1,59,0,1
4,Potsdam,4,FY23_broker_campaign,2022-10-30,Domain Display,,UnfairAdvantage,1099.0,1,1,1,4


In [27]:
# Replace Audience '1,2,3' as '1' in Target_Group
new_GA['Target_Group'] = new_GA['Target_Group'].replace(['1', '2', '3'], '1')

# Replace Audience '4,5' as '2' in Target_Group
new_GA['Target_Group'] = new_GA['Target_Group'].replace(['4', '5'], '2')

# Replace Audience '6' as '3' in Target_Group
new_GA['Target_Group'] = new_GA['Target_Group'].replace(['6'], '3')

# Check Target Group values
new_GA['Target_Group'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_GA['Target_Group'] = new_GA['Target_Group'].replace(['1', '2', '3'], '1')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_GA['Target_Group'] = new_GA['Target_Group'].replace(['4', '5'], '2')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_GA['Target_Group'] = new_GA['Target_Group'].replac

array(['3', '2', '1', 0], dtype=object)

## Save cleaned dataframe as csv

In [28]:
new_GA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47528 entries, 0 to 47528
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Location          47528 non-null  object        
 1   Audience          47528 non-null  object        
 2   Campaign          47528 non-null  object        
 3   Date              47528 non-null  datetime64[ns]
 4   Platform          47527 non-null  object        
 5   Ad_Format         29518 non-null  object        
 6   Creative_Family   47528 non-null  object        
 7   Creative_Version  47528 non-null  object        
 8   Total_Sessions    47528 non-null  int32         
 9   Days_Max_Date     47528 non-null  int32         
 10  Latest_Report     47528 non-null  int32         
 11  Target_Group      47528 non-null  object        
dtypes: datetime64[ns](1), int32(3), object(8)
memory usage: 4.2+ MB


In [29]:
# Create a CSV file as output.
new_GA.to_csv(r'ga-ma_cleaned.csv', index=False)