In [None]:
import pandas as pd
import re

## Set-up


To categorize the “How did you hear about us” responses automatically, I use a keyword dictionary built from terms that appeared in the data we collected over the last few years. Each category in the dictionary is associated with a list of keywords.


In [None]:
### Convert excel into dictionary ###
# Each spreadsheet row describes one category: the first cell is used as the
# category name and the remaining non-empty cells are used as its keywords.
df = pd.read_excel("SRA_categories.xlsx")

# Build {category name: [keywords]}.
# Positional access goes through .iloc: plain integer indexing such as row[0]
# on a label-indexed Series is deprecated in pandas (it raises a FutureWarning
# and is slated to be interpreted as a label lookup instead).
result_dict = {
    row.iloc[0]: row.iloc[1:].dropna().tolist()
    for _, row in df.iterrows()
}

# Print the resulting dictionary
print(result_dict)

{'InternetResearch': ['Internetrecherche', 'Online', 'recherche internet', 'Internet', 'webseit', 'Site web', 'Internetrecheche', 'Internet', 'Web', 'Eigenrecherche', 'www', 'Websuche', 'safari', 'Suchmaschine', 'Net', 'Browsing'], 'Google': ['Google', 'Google search', 'Online Google', 'Google Recherche', 'Google Suche', 'google.com', 'Googlesuche'], 'SocialMedia': ['Social Media', 'Facebook', 'In Social Media', 'fb', 'socialmedia', 'LinkedIn'], 'Search': ['Search Engine', 'Onlinesuche', 'searching', 'search', 'Web Search', 'Internet Suche', 'screening', 'Websuche', 'Buscando', 'suche', 'engine'], 'Colleague': ['Kollegen', 'team', 'travail', 'Collegue', ' Arbeitskollegin', 'colleague', 'kollege', 'meinem'], 'Gartner': ['Gartner', 'Gartner Magic Quadrant', 'Gardner Report', 'Gartner-Matrix', 'Gartner research on AI-enabled KM', 'Internet Recherche / Gartner Consulting', 'Quadrant'], 'Research': ['Recherche', 'Im Rahmen eines Projekt', 'DMS ', 'Market research', 'DMS Branchenerfahrung', 

  key = row[0]


In [None]:
# Defining categories
# `categories` is the keyword lookup handed to the classifier. It refers to the
# same dictionary object as `result_dict` (no copy is made).
categories = result_dict

Adjusting date format

In [None]:
# Load the analyst-form export that is going to be classified.
df = pd.read_excel("analyst-forms-01.04.2025.xlsx")

# Parse "Date created" using the day.month.year hour:minute layout and keep
# only the calendar date (the time of day is discarded).
created_timestamps = pd.to_datetime(df["Date created"], format="%d.%m.%Y %H:%M")
df["Date created"] = created_timestamps.dt.date

# Write the adjusted table to a separate workbook, without the index column.
# Note that the output name differs from the input name (no hyphen after "analyst").
df.to_excel("analystforms-01.04.2025.xlsx", index=False)

## Categorization


This section defines two functions:
1. `clean_text`: converts the text to lowercase and removes punctuation.
2. `classify_response`: checks each response against all keywords of every category and assigns the category with the highest match count ("Other" is assigned when no keyword matches).

In [None]:


### Clean text Function ###
def clean_text(text):
  """Normalise a value so it can be compared against keywords.

  Values that are not already strings are converted with ``str()``. The
  result is lower-cased, and every character that is neither a word
  character (letters, digits, underscore) nor whitespace is removed.

  Returns the normalised string.
  """
  as_string = text if isinstance(text, str) else str(text)
  lowered = as_string.lower()
  # Drop punctuation; whitespace is left untouched.
  return re.sub(r'[^\w\s]', "", lowered)

### Classify a single response function ###
def classify_response(response, keyword_dict):
  """Assign a response to the category whose keywords match it most often.

  Args:
    response: The free-text answer to classify. Non-string values are
      converted by ``clean_text``; missing values (``None`` or a float NaN)
      are classified as "Other" without any matching.
    keyword_dict: Mapping of category name -> iterable of keywords.

  Returns:
    The name of the category with the highest number of distinct keywords
    found in the response (substring match on the cleaned text). Ties are
    resolved in favour of the category that comes first in ``keyword_dict``.
    "Other" is returned when no keyword matches.
  """
  # A missing answer would otherwise be stringified to "none"/"nan" and could
  # match short keywords by accident. NaN is the only float unequal to itself.
  if response is None or (isinstance(response, float) and response != response):
    return "Other"

  response_lower = clean_text(response)

  # Initialize as "Other"
  assigned_category = "Other"
  highest_match_count = 0

  # for each category, check how many keywords are found
  for category, keywords in keyword_dict.items():
    # Clean the keywords exactly like the response, and additionally:
    #  - strip surrounding whitespace, so a padded entry such as " colleague"
    #    still matches a response that starts or ends with that word;
    #  - collect them in a set, so a keyword listed twice is counted once.
    cleaned_keywords = {clean_text(kw).strip() for kw in keywords}
    # An empty string is a substring of every response and would count as a
    # match for any input, so keywords that clean down to nothing are ignored.
    cleaned_keywords.discard("")

    match_count = sum(1 for kw in cleaned_keywords if kw in response_lower)

    # if we have more matches in this category than a previous one
    if match_count > highest_match_count:
      highest_match_count = match_count
      assigned_category = category

  return assigned_category

In [None]:
# MAIN SCRIPT

def main():
  """Classify every response in the global ``df`` and export the result.

  Adds (or overwrites) the "SRA" column of ``df`` with the category chosen by
  ``classify_response`` for each "How did you hear about us" value, using the
  global ``categories`` dictionary, then writes the frame to an Excel workbook
  and prints a confirmation message.
  """
  def _categorize(answer):
    # answer: a single response taken from the data
    return classify_response(answer, categories)

  # Create the new "SRA" column holding the classification
  responses = df["How did you hear about us"]
  df["SRA"] = responses.apply(_categorize)

  # Save the classified data to a new Excel workbook
  output_file = "SRA_classified.xlsx"
  df.to_excel(output_file, index=False)
  print(f"Classified data saved to {output_file}")


Classified data saved to SRA_classified.xlsx


In [None]:
# Entry point: run the classification only when this code executes as the
# main module, not when it is imported from elsewhere.
if __name__ == "__main__":
  main()