In [4]:
import itertools
import os
import re
from datetime import datetime
from urllib.parse import parse_qs, urlparse

import boto3
import networkx as nx
import pandas as pd
from networkx.algorithms.traversal.depth_first_search import dfs_tree

In [7]:
# Settings for an interactive test run in Jupyter.
# Show up to 100 characters per cell when a DataFrame is displayed.
pd.set_option("max_colwidth", 100)
# Point the INPUTPATH / OUTPUTPATH environment variables at local files.
os.environ.update(
    {
        "INPUTPATH": r"C:\Python\adobe\data\data.tsv",
        "OUTPUTPATH": r"C:\Python\adobe\data\output",
    }
)

In [8]:
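# for reading from s3
# NOTE(review): the `path=` / `boto3_session=` arguments below do not belong to
# pandas.read_csv; they look like awswrangler's `wr.s3.read_csv` - confirm before enabling.
# sess = boto3.session.Session(region_name='us-east-1')
# os.environ["INPUTPATH"] = r"s3://bs-workspace/misc/data.tsv"
# os.environ["OUTPUTPATH"] = r"s3://bs-workspace/misc"
# df = read_csv(path='s3://bs-workspace/misc/data.tsv',
#               sep="\t",
#               boto3_session=sess)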

In [9]:
# Current local date formatted as YYYY-MM-DD.
todays_date = datetime.now().strftime("%Y-%m-%d")

In [24]:
# I/O locations are read from the environment; each is None when its variable is unset.
input_path = os.getenv("INPUTPATH")
output_path = os.getenv("OUTPUTPATH")

# Retailer host name and the retailer's checkout URL.
retailer_domain = "www.esshopzilla.com"
checkout_page = "https://www.esshopzilla.com/checkout/?a=complete"

In [15]:
# The report file name is prefixed with the run date and placed under the output location.
report_filename = f"{todays_date}_SearchKeywordPerformance.tab"
output_file_qualified_path = f"{output_path}/{report_filename}"

In [19]:
# Load the tab-delimited input file into a DataFrame.
df = pd.read_csv(input_path, delimiter="\t")

In [25]:
def _parse_product_list(data):
    revenue = 0.00
    try:
        products = data.split(",")
        for each_product in products:
            product_attrs = each_product.split(";")
            revenue += float(product_attrs[3]) if product_attrs[3] else 0.00
    except AttributeError:
        pass
    return revenue

def _parse_referrer(data):
    pattern = r"^http\S*//\w+.(\w+.\w+)/"
    matches = re.match(pattern, data.strip())
    return matches.group(1)

def _parse_search_str(data):
    # google search string pattern
    pattern = r"q=(\w+)"
    matches = re.search(pattern, data.strip())
    if matches:
        pass
    else:
        # yahoo search string pattern
        pattern = r"p=(\w+)"
        matches = re.search(pattern, data.strip())
    return matches.group(1)

In [27]:
# Attribute checkout revenue to the external referrers seen for each visitor IP.
#
# For every IP a directed graph of referrer -> page_url hops is built, each edge
# carrying the revenue parsed from that row's product_list. For every row whose
# referrer is not on the retailer's own domain, the graph is walked depth-first
# from that referrer and the revenue of every edge that lands on the checkout
# page is summed.
#
# referrer_list is created unconditionally, so the cells that follow also work
# when the input has zero or one row (previously a NameError).
referrer_list = []

# groupby(sort=False) yields the IPs in order of first appearance, and each
# group keeps its rows in file order.
for _ip, df_per_ip in df.groupby("ip", sort=False):
    row_revenue = df_per_ip["product_list"].apply(_parse_product_list)

    g = nx.DiGraph()
    # NOTE: a DiGraph keeps one edge per (referrer, page_url) pair, so a repeated
    # hop overwrites the revenue stored by the earlier one.
    for referrer, page_url, amount in zip(
        df_per_ip["referrer"], df_per_ip["page_url"], row_revenue
    ):
        g.add_edge(referrer, page_url, revenue=amount)

    revenue = nx.get_edge_attributes(g, "revenue")

    for referrer in df_per_ip["referrer"]:
        # Skip missing referrers (NaN) and hops that originate on the retailer's site.
        if not isinstance(referrer, str) or retailer_domain in referrer:
            continue
        revenue_amt = sum(
            (
                revenue[(u, v)]
                for u, v in dfs_tree(g, referrer).edges()
                if isinstance(v, str) and checkout_page in v
            ),
            0.00,
        )
        referrer_list.append({"Referrer": referrer, "Revenue": revenue_amt})

In [29]:
# Naming the columns up front keeps "Referrer" and "Revenue" present even when
# referrer_list is empty, so the column inserts that follow do not fail on
# input without any external referrer.
revenue_df = pd.DataFrame(referrer_list, columns=["Referrer", "Revenue"])

In [30]:
# Add the search engine domain parsed from each referrer URL as the first column.
# insert() modifies revenue_df in place and raises if the column already exists.
revenue_df.insert(
    loc=0,
    column="Search Engine Domain",
    value=revenue_df["Referrer"].apply(_parse_referrer),
)

In [31]:
# Add the search keyword parsed from each referrer URL as the second column.
# insert() modifies revenue_df in place and raises if the column already exists.
revenue_df.insert(
    loc=1,
    column="Search Keyword",
    value=revenue_df["Referrer"].apply(_parse_search_str),
)

In [32]:
# Drop the raw referrer URL, then order the rows from highest to lowest revenue.
without_referrer = revenue_df.drop(columns="Referrer")
revenue_df = without_referrer.sort_values("Revenue", ascending=False)

In [33]:
# Preview the first five rows of the report.
revenue_df.head(5)

Unnamed: 0,Search Engine Domain,Search Keyword,Revenue
0,google.com,Ipod,290.0
1,bing.com,Zune,250.0
2,yahoo.com,cd,0.0
3,google.com,ipod,0.0


In [34]:
# Write the report as a tab-separated file with a header row and without the index column.
revenue_df.to_csv(output_file_qualified_path, sep="\t", header=True, index=False)