In [1]:
import gensim.downloader as gensim_api
import string
import pandas as pd
import json
import re
import os
import pyterrier as pt
from pyterrier.measures import RR, R, Rprec, P, MAP
import global_variables as gb

# Initialise PyTerrier only when it has not been started yet, so re-running
# this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    print("Enabling PRF in pyterier")
    # In this lab, we need to specify that we start PyTerrier with PRF enabled:
    # the terrier-prf package is handed to pt.init() through boot_packages.
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

Enabling PRF in pyterier


PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:

def remove_emoji_smileys(text):
    """Return ``text`` with text smileys and emoji characters deleted.

    Smileys (e.g. ``:)``, ``:-D``) are removed first with a case-insensitive
    pattern; afterwards every character in the emoji code-point ranges below
    is removed. Matches are replaced by the empty string, so surrounding
    whitespace is left as it was.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    smiley_re = re.compile(r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE)

    try:
        # Wide (UCS-4) form: the code-point ranges can be written directly.
        emoji_re = re.compile(
            u"([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])"
        )
    except re.error:
        # Narrow (UCS-2) fallback: the same ranges spelled as surrogate pairs.
        emoji_re = re.compile(
            u"([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])"
        )

    without_smileys = smiley_re.sub(r"", text)
    return emoji_re.sub(r"", without_smileys)


def clean(text):
    """Normalise a tweet-like string for downstream tokenisation.

    Steps, in order: URLs, the retweet marker ``RT``, ``@handles`` and a fixed
    set of punctuation characters (``. , # _ | : ? / =``) are each replaced by
    a space; every run of whitespace (including tabs and newlines) is then
    collapsed to a single space and the result is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r"http\S+", " ", text)  # remove urls
    # The word boundary keeps the retweet marker from matching the tail of a
    # longer upper-case token (e.g. "REPORT today" must not become "REPO today").
    text = re.sub(r"\bRT ", " ", text)  # remove rt
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\/\=]", " ", text)  # remove special characters
    # \s+ already covers tabs and newlines, so one pass replaces them and
    # squeezes repeated whitespace at the same time.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    """Load a table from disk into a DataFrame.

    Paths ending in ``.xlsx`` are read with ``pd.read_excel``; any other path
    is read with ``pd.read_csv`` as UTF-8 text using ``sep`` as delimiter.

    Args:
        input_file: Path of the file to read.
        sep: Field delimiter for the delimited-text case (tab by default).
        names: Column names forwarded to ``pd.read_csv``; the default empty
            string means "do not pass ``names``".

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    if input_file.endswith(".xlsx"):
        return pd.read_excel(input_file)

    csv_options = {"sep": sep, "encoding": "utf-8"}
    if names != "":
        csv_options["names"] = names
    return pd.read_csv(input_file, **csv_options)


In [3]:
# Load the "glove-twitter-25" embedding model through gensim's downloader API.
model_glove_twitter_25 = gensim_api.load("glove-twitter-25")
# Alternative models, currently disabled:
# model_glove_twitter_100 = gensim_api.load("glove-twitter-100")
# model_glove_google_300 = gensim_api.load("word2vec-google-news-300")
# word2vec_model = model_glove_google_300
# Alias for the embedding model selected for this notebook.
word2vec_model = model_glove_twitter_25

In [13]:

import numpy as np

def get_mean_vector(text, w2v_model):
    """Return the mean embedding of ``text`` together with the tokens used.

    The text is lower-cased and split on whitespace; only tokens present in
    ``w2v_model.key_to_index`` take part in the average.

    Args:
        text: Raw text to embed.
        w2v_model: Embedding model exposing ``key_to_index``, ``vector_size``
            and ``get_mean_vector`` (presumably a gensim ``KeyedVectors`` --
            confirm against the loader cell).

    Returns:
        A ``(mean_vector, tokens)`` tuple. When no token of ``text`` is in the
        vocabulary, a message is printed and a zero vector of length
        ``w2v_model.vector_size`` is returned with an empty token list, so
        callers can always unpack two values.
    """
    tokens = [t for t in text.lower().split() if t in w2v_model.key_to_index]
    if not tokens:
        print("Error: no tokens were found in the model vocabulary for this text ", text)
        # Returning a bare {} here used to break the two-value unpacking done
        # by every caller; an all-zero vector keeps the feature length stable.
        return np.zeros(w2v_model.vector_size, dtype=np.float32), tokens

    mean_vector = w2v_model.get_mean_vector(tokens, pre_normalize=True)
    return mean_vector, tokens


def get_query_docs_features(w2v_model, query, query_id, docs, doc_ids, labels):
    """Build one feature row per (query, document) pair.

    The feature of a pair is the query mean vector followed by the document
    mean vector, stored as a plain Python list. The query/document cosine
    similarity is not part of the feature vector.

    Args:
        w2v_model: Embedding model passed through to ``get_mean_vector``.
        query: Query text.
        query_id: Identifier stored in the ``gb.QUERY_ID`` column of every row.
        docs: Sequence of document texts.
        doc_ids: Identifiers aligned with ``docs`` (``gb.DOCID`` column).
        labels: Labels aligned with ``docs`` (``gb.FLAG`` column).

    Returns:
        A ``pandas.DataFrame`` with one row per document. Columns are
        ``gb.input_column_names``, followed by any of the four row keys that
        list does not already contain.
    """
    query_mean_vector, _ = get_mean_vector(query, w2v_model)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for i, doc in enumerate(docs):
        doc_mean_vector, _ = get_mean_vector(doc, w2v_model)
        rows.append({
            gb.QUERY_ID: query_id,
            gb.DOCID: doc_ids[i],
            gb.FEATURE: np.concatenate((query_mean_vector, doc_mean_vector)).tolist(),
            gb.FLAG: labels[i],
        })

    # Keep the declared column order and add any row key it lacks, which is
    # the column set the former append-based construction produced.
    columns = list(gb.input_column_names)
    for key in (gb.QUERY_ID, gb.DOCID, gb.FEATURE, gb.FLAG):
        if key not in columns:
            columns.append(key)

    return pd.DataFrame(rows, columns=columns)



def create_data(input_data, output_path, w2v_model=model_glove_twitter_25):
    """Convert a ranked query/document file into formatted feature rows.

    The input is loaded with ``read_file`` and must provide the columns
    ``tweet_id``, ``tweet_text``, ``vclaim``, ``vclaim_id`` and ``label``.
    Rows are grouped by ``tweet_id`` (groups in order of first appearance);
    the first ``tweet_text`` of a group is the query and its ``vclaim`` values
    are the documents. The rows returned by ``get_query_docs_features`` for
    all groups are written to ``output_path`` as tab-separated text without
    header or index.

    Args:
        input_data: Path of the input file.
        output_path: Path of the TSV file to write.
        w2v_model: Embedding model forwarded to ``get_query_docs_features``.
    """
    # columns = 'tweet_id	tweet_text	vclaim_id	vclaim	label	rank	score	title	lexical_similarity	semantic_similarity'

    df_input = read_file(input_data)

    # One frame per query, concatenated once at the end: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on each call.
    per_query_frames = [
        get_query_docs_features(
            w2v_model,
            df_query['tweet_text'].values[0],
            query_id,
            df_query['vclaim'].values,
            df_query['vclaim_id'].values,
            df_query['label'].values,
        )
        for query_id, df_query in df_input.groupby('tweet_id', sort=False)
    ]

    if per_query_frames:
        df_res = pd.concat(per_query_frames, ignore_index=True)
    else:
        # No queries in the input: still emit a frame with the expected columns.
        df_res = pd.DataFrame(columns=gb.input_column_names)

    df_res[gb.FEATURE] = df_res[gb.FEATURE].astype(str).str.strip('[|]') # to remove brackets before writing to csv
    df_res.to_csv(output_path, sep='\t',  header=False,  index=False)
    print("Data was formatted and saved successfully in ", output_path)



# Input files (TSV), one per split, passed to create_data below.
train_set = './pre_prosess/VCR22/en-clef2022-train_set_top_10.tsv'
val_set = './pre_prosess/VCR22/en-clef2022-mono_bert_dev_set_top_10.tsv'
test_set = './pre_prosess/VCR22/mono_bert_test_set_top_10.tsv'

# Destination files for the formatted feature rows of each split.
train_output = './pre_prosess/VCR22/en-2022-train-formatted.tsv'
val_output = './pre_prosess/VCR22/en-2022-validation-formatted.tsv'
test_output = './pre_prosess/VCR22/en-2022-test-formatted.tsv'

# Convert each split with the default embedding model; create_data prints a
# confirmation line for every file it writes.
create_data(val_set, val_output)
create_data(test_set, test_output)
create_data(train_set, train_output)



Data was formatted and saved successfully in  ./pre_prosess/VCR22/en-2022-validation-formatted.tsv
Data was formatted and saved successfully in  ./pre_prosess/VCR22/en-2022-test-formatted.tsv
Data was formatted and saved successfully in  ./pre_prosess/VCR22/en-2022-train-formatted.tsv
