In [9]:
from pytrends.request import TrendReq 
from datetime import date, timedelta
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
from math import ceil

This script is for counting "relevance scores". The current score calculation is rather vague. Scores are comparable only within one geo location.

TODO: 
- Check the normalization words (especially the one used for the global search)
- Check the month span to be used 
- Build a pipeline that calculates the score for all the blogs (using preferably some extracted keywords etc.) 
- For final version, remove all the unnecessary print commands



In [2]:
def count_relevance_score_single(input_kw, search_year, search_month, comp_word, geo, return_score = True):
    """Helper that counts the "relevance score" for one batch of at most 4 keywords.

    The keywords are queried together with ``comp_word`` for the period from
    2010-01-01 until today, restricted to ``geo``. The score is taken from a
    window made of the searched month and the two months before it.

    Args:
        input_kw: A list containing max 4 keywords.
        search_year: Year (int) of the month to score.
        search_month: Month (int) to score.
        comp_word: Comparison word (str) that is queried alongside the keywords
            to get Google to somewhat normalize the values to the same standard.
        geo: Two-letter abbreviation (str) for the geo location.
        return_score: If True return the score, otherwise return the data the
            score would be computed from.

    Returns:
        With ``return_score=True``: the sum of all keyword values inside the
        window divided by (rows in the window * number of keywords).
        Otherwise: the keyword columns of the window as a DataFrame.

    Raises:
        ValueError: If the returned data contains no row for the requested
            year and month.
    """
    # "Constants" that could be attuned
    months_before = 2       # months before the searched month to take into account
    months_after  = 0       # months after the searched month to take into account

    comp_time = "2010-01-01 " + date.today().isoformat()
    kw_list = [comp_word] + input_kw

    # Querying the data from Google.
    # If Google returns code 429, it means that the code is ok, but Google didn't want to co-operate for some reason.
    # Solved by retrying. If that doesn't work wait 60s and try again.
    # Code 400 probably means that something was wrong with the input parameters.
    pytrends = TrendReq(hl='en-US', tz=-120, timeout=(10,25), retries = 4, backoff_factor=0.1)
    pytrends.build_payload(kw_list, timeframe=comp_time, geo = geo)
    loc_df = pytrends.interest_over_time()

    not_found_msg = "ERROR: comparison data not found for specified year and month"

    # A frame without rows has no datetime index to search in.
    if loc_df.empty:
        raise ValueError(not_found_msg)

    # np.where returns a tuple holding one array of matching row positions, so
    # the emptiness check must look at that array and not at the tuple itself.
    matches = np.where((loc_df.index.year == search_year) & (loc_df.index.month == search_month))[0]
    if len(matches) < 1:
        raise ValueError(not_found_msg)

    if not (loc_df[comp_word] > 95).any():
        print("WARNING: Score is too small and won't give good explanation!")
        print("For words:" + str(kw_list) + " the found maximum is " + str(loc_df[comp_word].max()) + " , when it should be 100.")

    # First matching row is the searched month.
    index = int(matches[0])

    # Clamp the start: a negative start would make iloc count from the end of
    # the frame and silently produce an empty window.
    window_start = max(index - months_before, 0)

    # Column 0 is the comparison word and the last column is not a keyword
    # either, so only the columns in between are scored.
    window = loc_df.iloc[window_start : index + months_after + 1, 1:-1]

    if return_score:
        # count sum and normalize it by dividing it with the number of months
        # actually present in the window and the number of keywords
        return window.sum(axis=0).sum() / (len(window) * len(input_kw))

    return window

In [3]:
def count_relevance_score_multi(input_kw, search_year, search_month, comp_word, geo, return_score = True):
    """Helper that splits the keyword list into chunks of max 4 words and scores each chunk.

    Args:
        input_kw: List of keywords (any length).
        search_year: Year (int) forwarded to count_relevance_score_single.
        search_month: Month (int) forwarded to count_relevance_score_single.
        comp_word: Comparison word (str) forwarded to count_relevance_score_single.
        geo: Geo location (str) forwarded to count_relevance_score_single.
        return_score: If True return a score, otherwise return the per-chunk
            data frames joined side by side.

    Returns:
        With ``return_score=True``: the mean of the chunk scores (every chunk
        counts equally, regardless of how many keywords it holds).
        Otherwise: a DataFrame with the columns of all chunks concatenated
        along axis 1; an empty DataFrame when ``input_kw`` is empty.
    """
    chunk_size = 4
    chunks = [input_kw[start : start + chunk_size] for start in range(0, len(input_kw), chunk_size)]

    if return_score:
        score = sum(
            count_relevance_score_single(chunk, search_year, search_month, comp_word, geo, return_score=True)
            for chunk in chunks
        )
        return score/len(chunks)

    # The flag has to be forwarded: without it the helper returns a float for
    # every chunk and the concatenation below cannot work.
    frames = [
        count_relevance_score_single(chunk, search_year, search_month, comp_word, geo, return_score=False)
        for chunk in chunks
    ]

    if not frames:
        return pd.DataFrame([])

    # Concatenate once instead of growing the frame chunk by chunk.
    return pd.concat(frames, axis=1)


In [4]:
"""
Function that counts the "relevance score" based on google trends data
inputs:     Input_kw:                          - List of keywords
            search_year                        - Int
            search_month                       - Int
            Geos options: all                  - Will use all countries Futurice has currently office
                        {FI, SE, NO, DE, GB}   - Will use only the country the abbrevation is refering to
                        ""                     - Will use the default, which is whole world. WARNING: Might not give any meaningfull results 
            return_score = True                - if True, returns a float, else returns a dataframe with specified months and keywords. Prefer True, as False is mainly for debuging and more error prone"""           

def count_relevance_score(input_kw, search_year, search_month, geos, return_score = True):
    """Count the "relevance score" based on Google Trends data.

    Args:
        input_kw: List of keywords.
        search_year: Int.
        search_month: Int.
        geos: Which location(s) to use:
            "all"                  - every country in the built-in table
                                     (FI, SE, NO, DE, GB); the result is the
                                     average of the per-country scores.
            one of FI/SE/NO/DE/GB  - only the country the abbreviation refers to.
            ""                     - the whole world, with "Finland" as the
                                     comparison word. WARNING: might not give
                                     any meaningful results.
        return_score: If True a float is returned, otherwise a DataFrame with
            the relevant months and keywords. Prefer True; False is mainly for
            debugging and more error prone. The flag is not forwarded when
            ``geos`` is "all": that mode always returns the averaged score.

    Returns:
        The score (or DataFrame, see ``return_score``) for the selected
        location, or None after printing a message when ``geos`` is not a
        recognised value.
    """
    # Location abbreviations and their corresponding default "normalization" words
    geos_dict = {"FI" : "Finland", "SE" : "Sweden", "NO": "Norway", "DE": "Germany", "GB" : "English"}

    if geos == "":
        return count_relevance_score_multi(input_kw, search_year, search_month, "Finland", "", return_score=return_score)

    if geos == "all":
        score = 0
        for geo, comp_word in geos_dict.items():
            score += count_relevance_score_multi(input_kw, search_year, search_month, comp_word, geo)
        return score/len(geos_dict)

    if geos not in geos_dict:
        print("Given country abbrevation was incorrect! \nReturning no results.")
        return None

    return count_relevance_score_multi(input_kw, search_year, search_month, geos_dict[geos], geos, return_score=return_score)

    




Some crude tests.

Remove the rest of this notebook to turn it into a regular Python script.

In [5]:
# Smoke test: score for two keywords in June 2022, averaged over every country in the built-in table ("all").
# The value comes from a live query, so it may differ between runs.
count_relevance_score(["Passion", "AI" ], 2022, 6, "all", return_score = True)

4.033333333333333

In [6]:
# Smoke test: the same two keywords scored separately for each supported country.
for item in ("FI", "SE", "NO", "DE", "GB"):
    print(item, count_relevance_score(["Passion", "AI"], 2022, 6, item, return_score=True), sep=": ")

FI: 3.3333333333333335
SE: 3.5
NO: 2.3333333333333335
DE: 6.833333333333333
GB: 4.166666666666667


In [7]:
# Smoke test with nine keywords: more than fit into one batch of four, so the list is scored in several batches.
count_relevance_score(["Passion", "AI", "Gaming", "Tech", "Management", "Python", "Ruby", "C#", "Nobody"], 2022, 6, "all", return_score = True)

5.911111111111111

In [8]:
# Smoke test for the worldwide search (geos = ""), which uses "Finland" as the comparison word.
# NOTE(review): the printed warnings report the comparison word peaking well below 100 here,
# so this score is presumably not on the same scale as the per-country ones — confirm.
count_relevance_score(["Passion", "AI", "Gaming", "Tech", "Management", "Python", "Ruby", "C#", "Nobody"], 2022, 6, "", return_score = True)

For words:['Finland', 'Passion', 'AI', 'Gaming', 'Tech'] the found maximum is 18 , when it should be 100.
For words:['Finland', 'Management', 'Python', 'Ruby', 'C#'] the found maximum is 12 , when it should be 100.


47.02777777777777