In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

  from tqdm.autonotebook import tqdm, trange


In [56]:
def calculate_cosine_similarities(df, input_text, k=3):
    """Rank the rows of ``df`` by TF-IDF cosine similarity to ``input_text``.

    Args:
        df: DataFrame with a ``summary`` column of text. Missing summaries are
            treated as empty documents so positions stay aligned with ``df``.
        input_text: Query string to compare against every summary.
        k: Maximum number of matches to return. Values below 1 yield empty
            results; values larger than the number of rows return every row.

    Returns:
        Tuple ``(top_k_indices, top_k_values)`` of NumPy arrays: the positional
        (``iloc``) row indices of the best matches in descending similarity
        order, and their similarity scores.
    """
    # Step 1: Vectorize the text data. NaN is replaced with "" (instead of
    # dropping rows) so that returned positions still index into ``df``.
    summaries = df['summary'].fillna("").tolist()
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(summaries)

    # Step 2: Vectorize the input text with the vocabulary fitted above
    input_vector = vectorizer.transform([input_text])

    # Step 3: Calculate cosine similarity between the input text and each row in the DataFrame
    similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()

    # Step 4: Get the indices and values of the top k most similar texts.
    # Reverse first and slice afterwards: the original ``[-k:]`` returned the
    # whole array for k == 0 and an arbitrary tail for negative k.
    k = max(k, 0)
    top_k_indices = similarities.argsort()[::-1][:k]
    top_k_values = similarities[top_k_indices]  # Scores in the same order as the indices

    return top_k_indices, top_k_values

def calculate_similarities_LM(df, input_text, k=3):
    """Rank the rows of ``df`` by sentence-embedding cosine similarity to ``input_text``.

    Args:
        df: DataFrame with a ``summary`` column of text. Missing summaries are
            encoded as empty strings so positions stay aligned with ``df``.
        input_text: Query string to compare against every summary.
        k: Maximum number of matches to return. It is clamped to the range
            ``[0, number of rows]`` because ``topk`` raises when asked for
            more elements than exist.

    Returns:
        Tuple ``(top_k_indices, top_k_values)`` of NumPy arrays: the positional
        (``iloc``) row indices of the best matches in descending similarity
        order, and their similarity scores.
    """
    # Load a pre-trained sentence transformer model.
    # NOTE: the model is re-loaded on every call; reuse it if calling in a loop.
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Step 1: Encode the text data into embeddings
    summaries = df['summary'].fillna("").tolist()
    embeddings = model.encode(summaries, convert_to_tensor=True)

    # Step 2: Encode the input text
    input_embedding = model.encode(input_text, convert_to_tensor=True)

    # Step 3: Calculate cosine similarity between the input text and each row in the DataFrame.
    # Only the leading query dimension is squeezed, so a single-row DataFrame
    # still yields a 1-D tensor rather than a 0-d scalar.
    similarities = util.pytorch_cos_sim(input_embedding, embeddings).squeeze(0)

    # Step 4: Get the indices and scores of the top k most similar texts
    k = max(0, min(k, similarities.shape[0]))
    top_k_sim = similarities.topk(k)
    top_k_indices = top_k_sim.indices.cpu().numpy()
    top_k_values = top_k_sim.values.cpu().numpy()

    return top_k_indices, top_k_values

# Load the cleaned terrorism-events CSV (dataset identity per the file name);
# the path is relative to the notebook's working directory.
df = pd.read_csv("src/data/globalterrorism_2020_cleaned.csv")
# Trailing expression so the notebook displays the available column names.
# NOTE(review): the displayed columns include 'Unnamed: 0', which looks like an
# index saved into the CSV -- consider reading with index_col=0; confirm first.
df.columns

Index(['Unnamed: 0', 'eventid', 'iyear', 'imonth', 'iday', 'country',
       'country_txt', 'region', 'region_txt', 'latitude', 'longitude',
       'summary', 'crit1', 'crit2', 'crit3', 'multiple', 'success', 'suicide',
       'attacktype1', 'attacktype1_txt', 'targtype1', 'targtype1_txt',
       'targsubtype1', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1',
       'natlty1_txt', 'gname', 'motive', 'guncertain1', 'individual', 'nperps',
       'nperpcap', 'claimed', 'claimmode', 'claimmode_txt', 'weaptype1',
       'weaptype1_txt', 'weapsubtype1', 'weapsubtype1_txt', 'weapdetail',
       'nkill', 'nkillter', 'nwound', 'nwoundte', 'property', 'propextent',
       'propextent_txt', 'propvalue', 'ishostkid', 'nhostkid', 'nhours',
       'ndays', 'ransom', 'ransomamt', 'ransompaid', 'nreleased', 'scite1',
       'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related', 'flag'],
      dtype='object')

In [62]:
# Query configuration: number of matches, search text, and similarity backend.
k = 5
input_text = "Denmark Omar El-Hussein"
similarity_calc = calculate_cosine_similarities  # options: calculate_cosine_similarities, calculate_similarities_LM

# Upper bound on rows for the embedding-based search, which encodes every
# summary on each call. The TF-IDF search is not limited by it.
MAX_LM_ROWS = 10000

# Keep only rows that actually have a summary to compare against.
# To narrow the search further, add a condition, e.g.:
#   df_filtered = df[df["summary"].notna() & (df["iyear"] == 2015)]
df_filtered = df[df["summary"].notna()]
print(len(df_filtered))

# The original test was inverted (`< 10000` printed "Too many observations").
# Refuse only when there really are too many rows for the embedding model.
if similarity_calc is calculate_similarities_LM and len(df_filtered) > MAX_LM_ROWS:
    print("Too many observations, narrow it down!")
else:
    indices, values = similarity_calc(df_filtered, input_text, k)
    # The returned indices are positional, hence iloc on the filtered frame.
    top_k = df_filtered.iloc[indices]["summary"].tolist()

    # Display the score and summary of each of the top k most similar rows.
    # zip() stops at the number of results actually returned, so this cannot
    # raise IndexError when fewer than k rows are available.
    print("Top k most similar:")
    for score, summary in zip(values, top_k):
        print(score)
        print(summary)

143586
Top k most similar:
0.36097593463857713
02/14/2015: An assailant opened fire on a cultural center hosting a Free Speech debate in Copenhagen, Capital, Denmark. One civilian was killed and three police officers were wounded in the attack. This was one of two attacks in Copenhagen attributed to the same individual within hours of each other. No group claimed responsibility for the incident; however, sources attributed the attack to an individual, identified as Omar El-Hussein.
0.3553447234086482
02/14/2015: An assailant opened fire on a Jewish synagogue hosting a Bat Mitzvah in Copenhagen, Capital, Denmark. One civilian security guard was killed and two police officers were wounded in the attack. This was one of two attacks in Copenhagen attributed to the same individual within hours of each other. No group claimed responsibility for the incident; however, sources attributed the attack to an individual, identified as Omar El-Hussein.
0.3210043654956148
08/10/2019: An explosive dev