In [200]:
import pandas as pd
import numpy as np
from pytrends.request import TrendReq 
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity 

In [201]:
# Fetches the missing keywords from google trends
def get_google_trends_to_csv(keyword, df, df_index, df_name = "../data/trend_single_score.csv"):
    comp_time = "2010-01-01 2022-11-19"
    pytrends = TrendReq(hl='en-US', tz=-120, timeout=(10,25), retries = 4, backoff_factor=10)
    pytrends.build_payload([keyword], timeframe=comp_time, geo = "")
    loc_df = pytrends.interest_over_time()

    if(len(loc_df) < 1):
        loc_df = pytrends.interest_over_time()
        if(len(loc_df) < 1):
            with open("../data/lift_score_unpopular_google_searches", "a") as file:
                file.write(str(df_index) + ": " + keyword +"\n")
                return df, False

    loc_df = loc_df.drop(columns="isPartial")
    df = pd.concat([df, loc_df], axis=1)

    df.to_csv(df_name)

    return df, True

#Calculates the lift score = has the word been more or less trendy this month than on average within the last year
def get_lift(keyword, df, year, month, df_index):
    if keyword not in df.columns:
        print("    adding " + keyword + " to data base.")
        df, succeeded = get_google_trends_to_csv(keyword, df, df_index)
        if not succeeded:
            print("    adding failed.")
            return df, 1

    end_index = int(np.where((df.index.year ==  year) & (df.index.month == month))[0])
    start_index = int(np.where((df.index.year ==  year-1) & (df.index.month == month))[0])

    month_score = df.iloc[end_index][keyword]
    mean = df.iloc[start_index:end_index][keyword].mean()

    return df, month_score/(max(mean, 1))


    

In [202]:
# Maximal Marginal Relevance
# Returns top_n best keywords
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphras
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [203]:
# Preprocesses text and call mmr
# Returns top_n keywords
def get_mmr_keywords(doc, top_n=5):

    n_gram_range = (1,1)
    count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([doc])
    candidates = count.get_feature_names_out()

    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    doc_embedding = model.encode([doc])
    candidate_embeddings = model.encode(candidates)

    #top_n = 10
    #distances = cosine_similarity(doc_embedding, candidate_embeddings)
    #keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    #keywords
    if len(candidates) <= top_n:
        return candidates
     
    return mmr(doc_embedding, candidate_embeddings, candidates, top_n=top_n, diversity=0.2)


# The main loop

In [204]:
df = pd.read_csv("../data/blogs_with_analytics.csv", sep="\t", parse_dates=["time"], infer_datetime_format=True, index_col=["index"])
keyword_df = pd.read_csv("../data/trend_single_score.csv", parse_dates=["date"], infer_datetime_format=True, index_col=["date"])

for index in range(772, len(df)):
    keywords = get_mmr_keywords(df.iloc[index]["text"])
    print(str(index) + ": " + str(keywords))
    
    lift_sum = 0

    for word in keywords: 
        keyword_df, lift = get_lift(word, keyword_df, int(df.iloc[index]["time"].year), int(df.iloc[index]["time"].month), index)
        lift_sum += lift
        df.at[index, "mmr_lift"] = lift_sum

    df.to_csv("../data/blogs_with_analytics.csv", sep="\t")



712: ['tuesday', 'finnish', 'female', 'economist', 'teenagers']
713: ['google', 'javascript', 'spaghetti', 'superpowers', 'apps']
714: ['startups', 'startup', 'mvps', 'innovators', 'innovation']
715: ['coaching', 'riskier', 'compete', 'customers', 'help']
    adding riskier to data base.
    adding compete to data base.
    adding help to data base.
716: ['festival', 'iphone', 'apps', 'android', 'gig']
    adding gig to data base.
    adding failed.
717: ['champion', 'august', 'workplace', 'competitive', 'daily']
718: ['spring', 'graduated', 'magazine', 'assistant', 'studies']
    adding magazine to data base.
    adding failed.
    adding assistant to data base.
    adding failed.
    adding studies to data base.
    adding failed.
719: ['campers', 'magic', 'download', 'caring', 'reward']
    adding campers to data base.
    adding caring to data base.
    adding reward to data base.
    adding failed.
720: ['june', 'colleagues', '2008', 'corporate', '2013']
    adding 2008 to data ba

TypeError: only size-1 arrays can be converted to Python scalars

# Rest of the file is for debuging

In [205]:
get_mmr_keywords(df.iloc[772]["text"])

['server', 'employees', 'employee', 'software', 'servers']

In [206]:
df.iloc[772]

url                                        blog/why-personal-virtual-servers
title                                          Why personal virtual servers?
time                                                     2010-12-27 00:00:00
category                                                             Culture
description                As Olli mentioned before the holidays, our IT ...
text                       As Olli mentioned before the holidays, our IT ...
introduction               ['As', 'Olli', 'mentioned', 'before', 'the', '...
author                                                         Arttu Tolonen
author_job_title                                         Communications Lead
pageviews                                                                  5
unique_pageviews                                                           3
avg_time                                                               173.0
bounce_rate                                                              0.0

In [149]:
df["mmr_lift"][0:639].info()

<class 'pandas.core.series.Series'>
RangeIndex: 639 entries, 0 to 638
Series name: mmr_lift
Non-Null Count  Dtype  
--------------  -----  
639 non-null    float64
dtypes: float64(1)
memory usage: 5.1 KB


In [122]:
comp_time = "2010-01-01 2022-11-19"
pytrends = TrendReq(hl='en-US', tz=-120, timeout=(10,25), retries = 4, backoff_factor=10)
pytrends.build_payload(["mobprogramming"], timeframe=comp_time, geo = "")
loc_df = pytrends.interest_over_time()
loc_df

In [198]:
temp_df = pd.read_csv("../data/blogs_with_analytics.csv", sep="\t", parse_dates=["time"], infer_datetime_format=True, index_col=["index"])
#temp_df = temp_df.drop(columns=temp_df.columns[0:1])
#temp_df.to_csv("../data/blogs_with_analytics.csv", sep="\t")
temp_df

Unnamed: 0_level_0,url,title,time,category,description,text,introduction,author,author_job_title,pageviews,...,exit%,semantic neg score,semantic neu score,semantic pos score,semantic compound score,average_sentence_length,dale_chall,flesch,average_stopword,mmr_lift
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,blog/futustories-six-reasons-pasi-left-and-cam...,FutuStories - Six reasons Pasi left – and came...,2022-09-16,Culture,"For Senior Cloud Consultant Pasi, a change can...",1. I need awesome people around me…\r\nI’d say...,"For Cloud Archtitect Pasi, a change can be as ...",Pia Hämäri,"Marketing Lead, Finland",91,...,0.527473,0.053,0.760,0.187,0.9990,20.400000,6.88,75.84,8.707317,2.898231
1,blog/foresight-methods-and-strategic-planning,Foresight methods and strategic planning in bu...,2022-09-13,Strategy,Foresight methods and strategic planning lead ...,This is where foresight methods and strategic ...,"If the past few years have taught us anything,...",Annina Antinranta,Principal Designer - Emerging Business,33,...,0.272727,0.020,0.849,0.131,0.9985,17.888889,7.99,45.05,7.166667,6.906866
2,blog/uncertainty-in-business-volatile-market,Uncertainty in business and how to deal with it,2022-09-12,Opinion,"Future uncertainty, how to deal with uncertain...",The silver lining to all this doom and gloom i...,"Looming global threats like war, recession and...",Andreas Lindqvist,"Business Director, Futurice",28,...,0.571429,0.193,0.704,0.103,-0.7525,35.500000,11.40,43.90,17.000000,5.223087
3,blog/futustories-emma-leena-heikkinens-story,FutuStories – Emma-Leena Heikkinen’s story,2022-09-01,Culture,To be leader is not naturally given. Emma-Leen...,What does your role involve?\r\nI’m a client l...,"Human connections, honesty and trust are impor...",Pia Hämäri,"Marketing Lead, Finland",180,...,0.672222,0.031,0.789,0.180,0.9993,19.195652,7.52,68.50,9.847826,3.941215
4,blog/safe-route-uncertain-times,The Safe Route project and how it relates to d...,2022-08-26,Opinion,Good quality data used in the right way is at ...,Safe Route uses data from STRADA - a database ...,Safe Route was conceived as a new way to think...,Sonja Lakner,"Managing Director, Sweden",105,...,0.609524,0.065,0.711,0.224,0.9995,31.666667,9.07,39.20,14.259259,5.028579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,blog/make-up-simple-solutions-that-fit-your-or...,Make up simple solutions that fit your organiz...,2010-05-10,Culture,Hello Mikko Viikari! You made a presentation a...,Hello Mikko Viikari! You made a presentation a...,"['Hello', 'Mikko', 'Viikari!', 'You', 'made', ...",Anni Tölli,"Account Manager, Consultant",7,...,0.285714,0.004,0.750,0.245,0.9978,12.076923,8.53,57.57,4.384615,-1.000000
781,blog/pekka-tarjanne-1937-2010,Pekka Tarjanne - 1937-2010,2010-03-19,News,With deep sorrow we announce the loss of our f...,With deep sorrow we announce the loss of our f...,"['With', 'deep', 'sorrow', 'we', 'announce', '...",Tuomas Syrjänen,"Co-founder, AI Renewal",10,...,0.500000,0.075,0.669,0.256,0.9300,14.750000,10.53,56.45,4.250000,-1.000000
782,blog/user-testing-the-ultimate-reality-check,"User testing - the ultimate ""reality check""",2010-03-12,Ways of Working,"Last week, after a couple of months of design ...","Last week, after a couple of months of design ...","['Last', 'week,', 'after', 'a', 'couple', 'of'...",Matti Parviainen,User interface & Concept Designer,6,...,0.000000,0.036,0.813,0.151,0.9793,16.333333,8.39,63.39,7.416667,-1.000000
783,blog/quality-time-session-based-testing-faq,Quality Time: Session-based testing FAQ,2010-02-26,Learning,What is session based testing? Session based t...,SESSION BASED TESTING F.A.Q.\r\nWhat is sessio...,"['SESSION', 'BASED', 'TESTING', 'F.A.Q.', 'Wha...",Arttu Tolonen,Communications Lead,4,...,0.000000,0.000,0.895,0.105,0.9765,12.090909,8.78,59.09,4.181818,-1.000000


In [199]:
temp_df["mmr_lift"] = temp_df["mmr_lift"].fillna(-1.0)
temp_df.info()
temp_df.to_csv("../data/blogs_with_analytics_backup.csv", sep="\t")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 785 entries, 0 to 784
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   url                      785 non-null    object        
 1   title                    785 non-null    object        
 2   time                     785 non-null    datetime64[ns]
 3   category                 785 non-null    object        
 4   description              778 non-null    object        
 5   text                     785 non-null    object        
 6   introduction             785 non-null    object        
 7   author                   785 non-null    object        
 8   author_job_title         785 non-null    object        
 9   pageviews                785 non-null    int64         
 10  unique_pageviews         785 non-null    int64         
 11  avg_time                 785 non-null    float64       
 12  bounce_rate              785 non-nul