# Vector Space Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

## 1) Read data

In [2]:
# read preprocessed queries in pandas dataframe
queries = pd.read_csv('data/preprocessed_query_data.csv')
queries['keywords'] = queries['keywords'].str.strip('][').str.replace("'", "").str.split(', ')
queries = queries.set_index('id')
queries.index.name = 'query_id'

queries.head()

Unnamed: 0_level_0,keywords,title,rel_docs
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84,"[cultiv, agricultur, maiz, corn, fruit, wheat,...",Agriculture,"[572, 627, 678, 903, 1193, 1542, 1634, 3751, 3..."
111,"[reptil, lizard, salamand, fossil, frog, prehi...",Amphibians and Reptiles,"[621, 809, 1380, 6641, 8311, 8937, 13134, 1446..."
265,"[astronom, astronomi, astrophysicist, mathemat...",Astronomy,"[39, 308, 580, 664, 736, 748, 791, 798, 799, 1..."
323,"[aviat, airfield, airport, aerospac, aircraft,...",Aviation,"[849, 852, 1293, 1902, 1942, 2039, 2075, 2082,..."
396,"[actor, cast, screenwrit, filmmak, film, actre...",Biography/WikiProject Actors and Filmmakers,"[344, 676, 808, 872, 1247, 1806, 1828, 2083, 2..."


In [3]:
# read preprocessed fairness attributes data
fairness_attributes = pd.read_csv('data/preprocessed_fairness_attributes_small.csv', index_col='page_id')

fairness_attributes.head()

Unnamed: 0_level_0,Unnamed: 0,qual_cat_B,qual_cat_C,qual_cat_FA,qual_cat_GA,qual_cat_Start,qual_cat_Stub,gender_category_Man,gender_category_Non-binary,gender_category_Unknown,...,source_subcont_regions_South-eastern Asia,source_subcont_regions_Caribbean,source_subcont_regions_Western Africa,source_subcont_regions_Southern Africa,source_subcont_regions_Middle Africa,source_subcont_regions_Eastern Africa,source_subcont_regions_Central Asia,source_subcont_regions_Antarctica,source_subcont_regions_Melanesia,source_subcont_regions_Micronesia
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
25,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
39,2,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
290,3,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
303,4,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## 2) Read tf-idf vectors and measure cosine similarities

In [4]:
tfidf_doc_filtered.head()

NameError: name 'tfidf_doc_filtered' is not defined

In [None]:
tfidf_queries.head()

In [None]:
tfidf_queries = pd.read_csv('data/tfidf_queries.csv', index_col='query_id')

# read logfile to look for last successfully processed chunk
logfile_path = 'data/vsm_chunks_log.txt'
try:
    f = open(logfile_path, "r")
    last_chunk_idx = int(f.read())
    f.close()
except FileNotFoundError:
    last_chunk_idx = -1
    
tfidf_doc_iterator = pd.read_csv('data/tfidf_doc.csv', index_col=[0], chunksize=100000)
for i, tfidf_doc in enumerate(tfidf_doc_iterator):
    
    if i <= last_chunk_idx:
        continue
    
    print(f'chunk {i+1} / 65')
    
    tfidf_doc.index.name = 'page_id'
    # measure cosine similarities for each query
    for query_id, query_tfidf in tfidf_queries.iterrows():
        
        # only use terms that appear in the query
        query_tfidf = query_tfidf[(query_tfidf != -999999) & ~query_tfidf.isna() & (query_tfidf != np.inf) & (query_tfidf != -np.inf)]
        tfidf_doc_filtered = tfidf_doc[list(query_tfidf.index)]
        
        # cosine similarity
        cos_sim = cosine_similarity(query_tfidf.array.reshape(1, -1), tfidf_doc_filtered.fillna(0))[0]
        cos_sim = pd.Series(cos_sim, index=tfidf_doc_filtered.index, name=query_id)
        
        # save to file
        filepath = f'data/vsm/vsm_{query_id}.csv'
        # Set writing mode to append after first chunk
        mode = 'w' if i == 0 else 'a'
        # Add header if it is the first chunk
        header = i == 0
        cos_sim.to_csv(filepath,
                       header=header,
                       mode=mode)
        
    # write chunk index to log file
    f = open(logfile_path, "w")
    f.write(str(i))
    f.close()

## 3) Create mean vector for fairness attributes and measure cosine similarities

In [None]:
mean_fairness_vec = fairness_attributes.mean()

mean_fairness_vec.head()

### Option 1: Equally weighted attributes

In [None]:
cos_sim_fairness_attributes = pd.DataFrame(
    cosine_similarity(
        fairness_attributes,
        mean_fairness_vec.array.reshape(1, -1),
        dense_output=True
    ),
    columns=['cos_sim_fairness_attributes'],
    index=fairness_attributes.index
)

cos_sim_fairness_attributes.head()

### Option 2: Weighting attributes to have equally weighted groups of attribute categories

In [None]:
# groups:

# qual_cat_...
# gender_category_...
# years_category_...
# num_sitelinks_category
# relative_pageviews_category_...
# page_countries_...
# page_subcont_regions_...
# occupations_...
# source_countries_...
# source_csubcont_regions_...

# test if locations on country level exists in fairness_attributes data set
if sum(['page_countries_' in col for col in fairness_attributes.columns]):
    fairness_col_cats = [
        'qual_cat_', 'gender_category_', 'years_category_',
        'num_sitelinks_category', 'relative_pageviews_category_', 'page_countries_',
        'page_subcont_regions_', 'occupations_', 'source_countries_',
        'source_subcont_regions_'
    ]
    
else:
    fairness_col_cats = [
        'qual_cat_', 'gender_category_', 'years_category_',
        'num_sitelinks_category', 'relative_pageviews_category_',
        'page_subcont_regions_', 'occupations_', 
        'source_subcont_regions_'
    ]
    
category_cos_sim = pd.DataFrame()
    
for fairness_col_cat in fairness_col_cats:
    columns = [col for col in fairness_attributes if fairness_col_cat in col]
    cos_sim_group = pd.DataFrame(
        cosine_similarity(
            fairness_attributes[columns],
            mean_fairness_vec[columns].array.reshape(1, -1),
            dense_output=True
        ),
        columns=[fairness_col_cat],
        index=fairness_attributes.index
    )
    category_cos_sim = pd.concat([category_cos_sim, cos_sim_group], axis=1)
    
category_cos_sim.head()

In [None]:
# mean of all categories cossine similarities
weighted_cos_sim_fairness_attributes = category_cos_sim.mean(axis=1)
weighted_cos_sim_fairness_attributes.name = 'fairness_cos_sim'

weighted_cos_sim_fairness_attributes.head()

## Create rankings

In [None]:
scores_combined

In [None]:
vsm_scores = pd.read_csv('data/vsm/vsm_1055.csv', index_col='page_id').sort_values(by='1055', ascending=False)
vsm_scores.columns = ['vsm_cos_sim']

scores_combined = vsm_scores.join(weighted_cos_sim_fairness_attributes)
scores_combined['comb_score'] = ((1 - fairness_weight) * scores_combined['vsm_cos_sim']).add(
    fairness_weight * scores_combined['fairness_cos_sim'],
    axis=0
)

scores_combined.sort_values('comb_score', ascending=False).iloc[:500, :]

In [None]:
# rank documents by:
# - the cosine similarity of the tf-idf vectors of the queries and documents text
# - the cosine similarity of their fairness attributes and the mean fairness attributes vector
# weighting of 50% each

fairness_weight = 0.5

vsm_score_files = os.listdir('data/vsm')

for file in vsm_score_files:
    print(file)
    vsm_scores = pd.read_csv('data/vsm/' + file, index_col='page_id')
    vsm_scores.columns = ['vsm_cos_sim']
    # combine with fairness attributes cosine similarity
    scores_combined = vsm_scores.join(weighted_cos_sim_fairness_attributes)
    scores_combined['comb_score'] = ((1 - fairness_weight) * scores_combined['vsm_cos_sim']).add(
        fairness_weight * scores_combined['fairness_cos_sim'],
        axis=0
    )
    # get 500 best documents
    best_500 = scores_combined.sort_values('comb_score', ascending=False).iloc[:500, :]
    
    # save to file
    filepath = f'data/rankings/{file}'
    best_500.to_csv(filepath)