In [None]:
import pandas as pd
import numpy as np
from pytrends.request import TrendReq 
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity 

In [None]:
# Fetches the missing keywords from google trends
def get_google_trends_to_csv(keyword, df, df_index, df_name = "../../data/interim/trend_single_score.csv"):
    """Query Google Trends for one keyword and add the result to ``df``.

    The interest-over-time series for ``keyword`` (timeframe 2010-01-01 to
    2022-11-19, empty ``geo``) is concatenated to ``df`` as additional
    column(s) and the widened frame is written to ``df_name``.

    An empty response is retried once. If the retry is empty as well, a line
    ``"<df_index>: <keyword>"`` is appended to the unpopular-searches log and
    ``df`` is handed back unchanged.

    Args:
        keyword: search term to look up.
        df: frame the new trend column is concatenated to (axis=1).
        df_index: identifier written to the log when nothing is found.
        df_name: CSV path the widened frame is saved to.

    Returns:
        Tuple ``(df, succeeded)``: the (possibly widened) frame and a bool
        telling whether trend data was added.
    """
    client = TrendReq(hl='en-US', tz=-120, timeout=(10,25), retries = 4, backoff_factor=10)
    client.build_payload([keyword], timeframe="2010-01-01 2022-11-19", geo = "")

    trend = client.interest_over_time()
    if len(trend) == 0:
        # Give the request one more chance before giving up on the keyword.
        trend = client.interest_over_time()

    if len(trend) == 0:
        # Still nothing: remember the keyword in the log and report failure.
        with open("../../data/interim/lift_score_unpopular_google_searches.txt", "a") as log_file:
            log_file.write("".join([str(df_index), ": ", keyword, "\n"]))
        return df, False

    # "isPartial" is bookkeeping from pytrends, not a keyword score.
    widened = pd.concat([df, trend.drop(columns="isPartial")], axis=1)
    widened.to_csv(df_name)

    return widened, True

#Calculates the lift score = has the word been more or less trendy this month than on average within the last year
def get_lift(keyword, df, year, month, df_index):
    """Compute the lift score of ``keyword`` for the given month.

    The lift is the keyword's score in ``(year, month)`` divided by the mean
    of its scores over the rows from the same month one year earlier up to
    (but excluding) the target row. The divisor is floored at 1 so that
    near-zero baselines do not blow the ratio up.

    If ``keyword`` has no column in ``df`` yet, it is fetched through
    ``get_google_trends_to_csv`` first.

    Args:
        keyword: column name / search term to score.
        df: trend frame; the code reads ``df.index.year`` and
            ``df.index.month``, so the index must be datetime-like.
        year: year of the month to score.
        month: month (1-12) to score.
        df_index: identifier forwarded to the fetch helper for its log.

    Returns:
        Tuple ``(df, lift)``. ``df`` is the (possibly extended) trend frame.
        ``lift`` is the neutral value 1 when the keyword could not be fetched
        or when the trend index has no row for the requested month or for the
        same month one year earlier.
    """
    if keyword not in df.columns:
        print("    adding " + keyword + " to data base.")
        df, succeeded = get_google_trends_to_csv(keyword, df, df_index)
        if not succeeded:
            print("    adding failed.")
            return df, 1

    # Row positions of the target month and of the same month a year earlier.
    target_rows = np.flatnonzero((df.index.year == year) & (df.index.month == month))
    baseline_rows = np.flatnonzero((df.index.year == year - 1) & (df.index.month == month))

    # Dates outside the downloaded trend window used to crash the int()
    # conversion of an empty array; fall back to the same neutral lift that
    # a failed fetch produces instead.
    if target_rows.size == 0 or baseline_rows.size == 0:
        print("    no trend data for " + keyword + " around " + str(year) + "-" + str(month) + ".")
        return df, 1

    # If several rows fall into the month, the first one is used.
    end_index = int(target_rows[0])
    start_index = int(baseline_rows[0])

    scores = df[keyword]
    month_score = scores.iloc[end_index]
    mean = scores.iloc[start_index:end_index].mean()

    return df, month_score/(max(mean, 1))


    

In [None]:
# Maximal Marginal Relevance
# Returns top_n best keywords
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Pick keywords with Maximal Marginal Relevance (MMR).

    The word most similar to the document is chosen first. Every following
    pick maximises
    ``(1 - diversity) * sim(word, doc) - diversity * max sim(word, chosen)``,
    trading relevance to the document against redundancy with the keywords
    already selected.

    Args:
        doc_embedding: embedding of the document, passed to
            ``cosine_similarity`` as the second argument.
            NOTE(review): the first pick uses a flat ``argmax`` as a word
            index, which presumes a single document row — confirm.
        word_embeddings: one embedding row per entry of ``words``.
        words: candidate words, index-aligned with ``word_embeddings``.
        top_n: number of keywords wanted.
        diversity: weight of the redundancy penalty; 0 ranks purely by
            similarity to the document.

    Returns:
        List of at most ``top_n`` words in selection order. The list is
        shorter when fewer candidates exist and empty when ``top_n < 1`` or
        ``words`` is empty.
    """
    # Never try to select more keywords than there are candidates; the
    # unclamped loop ran argmax on an empty candidate set and raised.
    n_select = min(top_n, len(words))
    if n_select < 1:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(n_select - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR (named mmr_scores so it does not shadow this function)
        mmr_scores = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [None]:
# Preprocesses the text and calls mmr
# Returns top_n keywords
def get_mmr_keywords(doc, top_n=5):
    """Extract up to ``top_n`` keywords from ``doc`` using MMR.

    Candidate words are the English-stop-word-filtered unigrams of ``doc``
    found by ``CountVectorizer``. If there are no more than ``top_n``
    candidates they are all returned as-is, without loading the embedding
    model. Otherwise document and candidates are embedded with the
    'distilbert-base-nli-mean-tokens' sentence model and ``mmr`` selects
    ``top_n`` of them with ``diversity=0.2``.

    Args:
        doc: text to extract keywords from.
        top_n: maximum number of keywords to return.

    Returns:
        List of keyword strings.
    """
    n_gram_range = (1,1)
    count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([doc])
    # A plain list keeps the return type identical in both branches below.
    candidates = list(count.get_feature_names_out())

    # Nothing to rank: skip the model load and encoding entirely.
    if len(candidates) <= top_n:
        return candidates

    model = _get_sentence_model('distilbert-base-nli-mean-tokens')
    doc_embedding = model.encode([doc])
    candidate_embeddings = model.encode(candidates)

    return mmr(doc_embedding, candidate_embeddings, candidates, top_n=top_n, diversity=0.2)


# SentenceTransformer instances keyed by model name, so the model is
# constructed once per kernel session instead of once per document.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, creating it on first use."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


# The main loop

In [None]:
# Load the blog posts and the cached Google Trends scores (one column per
# keyword, indexed by date). Dates are parsed by pandas' default inference;
# the deprecated `infer_datetime_format` flag is no longer passed.
df = pd.read_csv("../../data/interim/blogs_with_analytics.csv", sep="\t", parse_dates=["time"])
keyword_df = pd.read_csv("../../data/interim/trend_single_score.csv", parse_dates=["date"], index_col=["date"])

# Kept for reference: builds keyword_df from a single Google Trends query
# instead of reading it from the CSV above.
# pytrends = TrendReq(hl='en-US', tz=-120, timeout=(10,25), retries = 4, backoff_factor=10)
# keyword = "Google"
# comp_time = "2010-01-01 2022-11-19"
# pytrends.build_payload([keyword], timeframe=comp_time, geo = "")
# loc_df = pytrends.interest_over_time()
# loc_df = loc_df.drop(columns="isPartial")
# keyword_df = loc_df

# For every blog post: extract its keywords, sum their lift scores for the
# post's publication month, and store the sum in the "mmr_lift" column.
# NOTE(review): the loop starts at row 1, so row 0 never receives an
# "mmr_lift" value — confirm this is intended (e.g. a resume point).
for index in range(1, len(df)):
    post = df.iloc[index]
    keywords = get_mmr_keywords(post["text"])
    print(str(index) + ": " + str(keywords))

    lift_sum = 0

    for word in keywords:
        keyword_df, lift = get_lift(word, keyword_df, int(post["time"].year), int(post["time"].month), index)
        lift_sum += lift
        df.at[index, "mmr_lift"] = lift_sum

    # Checkpoint after every post so an interrupted run keeps its progress.
    # index=False: the file is read back without index_col, so writing the
    # row index would add one more "Unnamed" column on every run.
    df.to_csv("../../data/interim/blogs_with_analytics.csv", sep="\t", index=False)



# The rest of the file is for debugging

In [None]:
# Debug: show the keywords extracted for the post at row position 772.
get_mmr_keywords(df["text"].iloc[772])

In [None]:
# Debug: display every column of the post at row position 772.
df.iloc[772, :]

In [None]:
# Debug: summarise the "mmr_lift" values of the first 639 rows.
df["mmr_lift"].iloc[:639].info()

In [None]:
# Debug: query Google Trends directly for one keyword and display the raw
# interest-over-time frame.
comp_time = "2010-01-01 2022-11-19"
pytrends = TrendReq(
    hl='en-US',
    tz=-120,
    timeout=(10,25),
    retries = 4,
    backoff_factor=10,
)
pytrends.build_payload(
    ["mobprogramming"],
    timeframe=comp_time,
    geo = "",
)
loc_df = pytrends.interest_over_time()
loc_df

In [None]:
# Debug: reload the blog table from ../data, using its "index" column as the
# row index. The deprecated `infer_datetime_format` flag is no longer passed;
# pandas infers the date format on its own.
temp_df = pd.read_csv("../data/blogs_with_analytics.csv", sep="\t", parse_dates=["time"], index_col=["index"])
# One-off repair kept for reference: drop the first column and write the
# file back.
#temp_df = temp_df.drop(columns=temp_df.columns[0:1])
#temp_df.to_csv("../data/blogs_with_analytics.csv", sep="\t")
temp_df

In [None]:
# Replace missing lift scores with -1.0, print a summary of the frame, and
# write a backup copy.
filled_lift = temp_df["mmr_lift"].fillna(value=-1.0)
temp_df["mmr_lift"] = filled_lift
temp_df.info()
temp_df.to_csv("../data/blogs_with_analytics_backup.csv", sep="\t")