## Re-Working Clavis 

In [1]:
## This is Clavis
## Clavis is a Keyword Expansion and Categorization Tool for Digital Marketing & URL Generation

## For example if you have a keyword like "best shoes" and you want to expand it to "best shoes for cycling" and "best shoes for running"
## as well as categorize whether it is "Male", "Female", "Unisex" or "Kids" you can do that with Clavis
## Also identify whether a keyword is Generic or Branded (Generic = "best shoes", Branded = "Nike shoes")
## also it identifies the Topic of the keyword ("Accessories", "Cycling Jackets", "Mountain Biking Jackets", etc..)
## lastly it will also identify which ones should be "Deleted" and which ones should be "Kept"

## The inputs to this system are two different CSV files:

## Input 1: Keywords.csv
## Keywords File: This is a CSV file with the following columns: Keywords
## Keywords File Example:
## Keywords: Best shoes, running shoes, etc..

## Input 2: Categories.csv
## Categories File: This is a CSV with the following columns: Category, Search For, Return
## Categories File Example:
## Category | Search For | Return
## Gender | | Unisex --> If Search For is empty, return Unisex
## Gender | man | Men
## Gender | woman | Women
## Gender | lady | Women
## Gender | boy | Kids
## Branded_generic | | Generic --> If Search For is empty, return Generic
## Branded_generic | nike | Branded
## Branded_generic | adidas | Branded
## Branded_generic | puma | Branded
## Branded_generic | reebok | Branded
## Topic | | None --> If Search For is empty, return None
## Topic | cleats | accessories
## topic | vest | Gilet
## Topic | jacket*cycling*mountain*jacket | mountain biking jackets --> asterisk is a wildcard character, it can be used to match multiple words

## Output:
## Output is a CSV file with the following columns: Keywords, Gender, Branded_Generic, Topic, Delete_Keep
## Output Example:
## Keywords | Gender | Branded_Generic | Topic | Delete_Keep
## Best shoes | Unisex | Generic | None | Keep
## Best shoes for cycling | Unisex | Generic | cycling | Keep
## Best shoes for running | Unisex | Generic | running | Keep
## Nike shoes | Unisex | Branded | None | Keep
## Nike shoes for cycling | Unisex | Branded | cycling | Keep
## Nike shoes for running | Unisex | Branded | running | Keep
## Adidas shoes | Unisex | Branded | None | Keep
## women's cycling suit | Women | Generic | cycling | Keep


## PART 3: URL Generation
"""

Product - URL	Model - URL	Style - URL	Gender - URL	Sport - URL	Best for - URL	Colour - URL	Features - URL	Collection - URL	Brand - URL	Size - URL	Rise - URL	Sustainable - URL	Material - URL	Teams - URL	Kit Teams - URL	Winter - URL	Outlet - URL	Support - URL	Length - URL	Fit - URL	Surface - URL	Techologies - URL
"""
## those are all the mappings that are available for the URL Generation
## the matching will be done for each of the categories and then concatenated with a custom separator (default is "-")

## There are three parts to this:
## Part 1. Keyword Search Volume & Expansion - This is done with the custom scripts we have under the KeywordSearchVolume module [DONE]
## Part 2. Keyword Categorization - This is done in the aforementioned fashion from the two CSV files [DONE]
## Part 3. URL Generation - This is done with an extra mapping file that maps the keywords to the URLs

'\n\nProduct - URL\tModel - URL\tStyle - URL\tGender - URL\tSport - URL\tBest for - URL\tColour - URL\tFeatures - URL\tCollection - URL\tBrand - URL\tSize - URL\tRise - URL\tSustainable - URL\tMaterial - URL\tTeams - URL\tKit Teams - URL\tWinter - URL\tOutlet - URL\tSupport - URL\tLength - URL\tFit - URL\tSurface - URL\tTechologies - URL\n'

In [2]:
## (re)load IPython's autoreload extension and switch it to mode 2, so that edits to
## imported modules are picked up before each cell runs without restarting the kernel
%reload_ext autoreload
%autoreload 2

In [3]:
## importing the libraries
import os
import streamlit as st
import pandas as pd
import numpy as np

## custom modules
from KeywordSearchVolume.search_volume_extractor import run_search_volume
from clavis_helpers import (
    does_file_exist,
    load_excel_and_clean,
    get_payload,
    clean_categories_df,
    categorize_keywords,
    clean_dataframe_for_categorization,
    apply_clean_dataframe_for_categorization
)

In [4]:
## folder that holds the input, config and intermediate files used by this notebook
SAVE_DIR = "./sample_data"

## Part 1. Keyword Search Volume & Expansion
## (hard-coded for now; each of these is meant to come from a streamlit widget)
GEOLOCATION = "United States"  ## this has to be a streamlit input
LANGUAGE = "English"  ## this has to be a streamlit input
EXPAND_KEYWORDS = True  ## this has to be a streamlit radio button

## CLAVIS CONFIG FILES: sheet to read, workbook name and the workbook's path inside SAVE_DIR
CLAVIS_CONFIG_SHEET_NAME = "Config - Categorisation"
CLAVIS_CONFIG_NAME = "Keyword-Categorization-Mapping-Config.xlsx"
CLAVIS_CONFIG_FILE_PATH = os.path.join(SAVE_DIR, CLAVIS_CONFIG_NAME)

## KEYWORDS FILE: file name and its path inside SAVE_DIR
KEYWORDS_FILE_NAME = "Keywords.csv"
KEYWORDS_FILE_PATH = os.path.join(SAVE_DIR, KEYWORDS_FILE_NAME)

In [5]:
## Part 0. Load the files
## categories.csv is read from SAVE_DIR using ";" as the field separator
## NOTE(review): `catz` is not referenced again in the cells visible here - confirm it is still needed
catz = pd.read_csv(
    f"{SAVE_DIR}/categories.csv", sep=";"
)
## the keywords CSV is read with pandas' default separator
KEYWORDS_FILE = pd.read_csv(KEYWORDS_FILE_PATH)

## get the payload
## PAYLOAD is later unpacked as keyword arguments into run_search_volume and indexed with "keywords"
PAYLOAD = get_payload(KEYWORDS_FILE, LANGUAGE, EXPAND_KEYWORDS, GEOLOCATION)

In [6]:
## get the results from the search volume extractor
## check if the intermediate file exists to save time: when it does, it is loaded instead of
## calling the extractor again; when it does not, the extractor runs and its output is cached
## NOTE: a cache file written before the "Original"/"Expanded" labels were corrected still
## carries the swapped labels - delete it once so it gets regenerated
intermediate_results_path = f"{SAVE_DIR}/intermediate_results.csv"

if does_file_exist(intermediate_results_path):
    results_df = pd.read_csv(intermediate_results_path)
else:
    results = run_search_volume(**PAYLOAD)

    ## `results` is a dictionary: its keys become the "keywords" column and its values "search_volume"
    results_df = (
        pd.DataFrame.from_dict(results, orient="index", columns=["search_volume"])
        .reset_index()
        .rename(columns={"index": "keywords"})
    )

    ## label every keyword: "Original" when it was part of the submitted keyword list,
    ## "Expanded" when it is a new idea that only appears in the extractor output
    results_df["idea"] = np.where(
        results_df["keywords"].isin(PAYLOAD["keywords"]), "Original", "Expanded"
    )

    ## expansion factor - is the number of keywords that were generated by the search volume extractor
    ## divided by the number of keywords that were in the original list
    expansion_factor = len(results_df) / len(PAYLOAD["keywords"])
    print(f"The expansion factor is {expansion_factor:.2f}x")

    ## save the intermediate file to avoid re-running the above code
    ## (only reached when the file did not exist, so an existing cache is never overwritten)
    results_df.to_csv(intermediate_results_path, index=False)

In [8]:
## read the Clavis config sheet from the config workbook
clavis_config = load_excel_and_clean(CLAVIS_CONFIG_FILE_PATH, CLAVIS_CONFIG_SHEET_NAME)

## run the two cleaning helpers back to back: config -> separated dict -> cleaned dataframe
separated_categories_dict = clean_categories_df(clavis_config)
cleaned_categories_df = apply_clean_dataframe_for_categorization(separated_categories_dict)

## rows whose Category contains "URL" belong to URL generation,
## every remaining row belongs to keyword categorization
is_url_category = cleaned_categories_df['Category'].str.contains('URL')
url_generation_df = cleaned_categories_df[is_url_category]
categorization_df = cleaned_categories_df[~is_url_category]

## categorize the search-volume results using only the non-URL rows
categorized_keywords = categorize_keywords(results_df, categorization_df)