# Vector Space Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 1) Read data

In [2]:
# read preprocessed data in pandas dataframe
corpus = pd.read_csv('data/preprocessed_corpus_test.csv', index_col='id') # change filepath when preprocessing is done
corpus.index.name = 'page_id'

corpus.head(3)

Unnamed: 0_level_0,text
page_id,Unnamed: 1_level_1
12148915,Keith Osik Keith Richard Osik (born October 2...
16752449,Swansons Landing Texas Swansons Landing is a ...
31967453,Mike Potts Mike or Michael Potts may refer to...


In [3]:
# read preprocessed queries in pandas dataframe
queries = pd.read_csv('data/preprocessed_query_data.csv')
queries['keywords'] = queries['keywords'].str.strip('][').str.replace("'", "").str.split(', ')
queries = queries.set_index('id')
queries.index.name = 'query_id'

queries.head()

Unnamed: 0_level_0,keywords,title,rel_docs
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84,"[cultivate, agricultural, maize, corn, fruit, ...",Agriculture,"[572, 627, 678, 903, 1193, 1542, 1634, 3751, 3..."
111,"[reptile, lizard, salamander, fossil, frog, pr...",Amphibians and Reptiles,"[621, 809, 1380, 6641, 8311, 8937, 13134, 1446..."
265,"[astronomer, astronomy, astrophysicist, mathem...",Astronomy,"[39, 308, 580, 664, 736, 748, 791, 798, 799, 1..."
323,"[aviation, airfield, airport, aerospace, aircr...",Aviation,"[849, 852, 1293, 1902, 1942, 2039, 2075, 2082,..."
396,"[actor, cast, screenwriter, filmmaker, film, a...",Biography/WikiProject Actors and Filmmakers,"[344, 676, 808, 872, 1247, 1806, 1828, 2083, 2..."


In [4]:
# read preprocessed fairness attributes data
fairness_attributes = pd.read_csv('data/preprocessed_fairness_attributes.csv', index_col='page_id')

fairness_attributes.head()

Unnamed: 0_level_0,qual_cat_B,qual_cat_C,qual_cat_FA,qual_cat_GA,qual_cat_Start,qual_cat_Stub,gender_category_Man,gender_category_Unknown,gender_category_Woman,years_category_20th century,...,source_subcont_regions_South-eastern Asia,source_subcont_regions_Caribbean,source_subcont_regions_Western Africa,source_subcont_regions_Southern Africa,source_subcont_regions_Middle Africa,source_subcont_regions_Eastern Africa,source_subcont_regions_Central Asia,source_subcont_regions_Antarctica,source_subcont_regions_Melanesia,source_subcont_regions_Micronesia
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
39,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
290,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
303,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2) Create tf-idf vectors and measure cosine similarities

In [5]:
# tf-idf vectors of corpus
vectorizer = TfidfVectorizer()
corpus_tf_idf_vecs = vectorizer.fit_transform(corpus['text'])
corpus_words = vectorizer.get_feature_names_out()
corpus_tf_idf_df = pd.DataFrame(corpus_tf_idf_vecs.toarray(), columns=corpus_words, index=corpus.index)

corpus_tf_idf_df.head()

Unnamed: 0_level_0,17,19,22,born,clark,inventor,is,keith,landing,march,...,sett,shuker,surname,swansons,test,texas,th,to,william,with
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12148915,0.0,0.231394,0.231394,0.231394,0.0,0.0,0.0,0.462789,0.0,0.0,...,0.0,0.0,0.0,0.0,0.551304,0.0,0.0,0.0,0.0,0.0
16752449,0.0,0.0,0.0,0.0,0.0,0.0,0.199666,0.0,0.494962,0.0,...,0.247481,0.0,0.0,0.494962,0.589631,0.247481,0.0,0.0,0.0,0.0
31967453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.537112,0.0,0.0,0.225438,0.0,0.0
47436994,0.0,0.0,0.0,0.0,0.0,0.0,0.206077,0.0,0.0,0.0,...,0.0,0.510854,0.255427,0.0,0.608561,0.0,0.255427,0.0,0.0,0.255427
13924699,0.244877,0.0,0.0,0.0,0.489754,0.244877,0.0,0.0,0.0,0.244877,...,0.0,0.0,0.0,0.0,0.583427,0.0,0.0,0.0,0.489754,0.0


In [6]:
# measure cosine similarites

cos_sim_text = pd.DataFrame()

for idx, query in queries['keywords'].iteritems():
    # create dataframe with query terms as columns and term frequency (always 1) as value
    query_tf = pd.DataFrame(dict(zip(query, [[1] for i in query])), index=[-1])
    # concat with corpus tf-idf dataframe
    query_words_only = pd.concat([query_tf, corpus_tf_idf_df])
    # drop columns of words that don't exist in the query
    query_words_only = query_words_only.fillna(0)
    bool_vec = query_words_only.loc[-1, :].astype(bool)
    query_words_only = query_words_only.loc[:, bool_vec]
    # drop query tf column
    query_words_only = query_words_only.drop(index=[-1])
    # calculate cosine similarities
    cos_sim = cosine_similarity(query_tf, query_words_only)[0]
    cos_sim = pd.Series(cos_sim, index=query_words_only.index, name=idx)
    cos_sim_text = pd.concat([cos_sim_text, cos_sim], axis=1)
    
cos_sim_text.index.name = 'page_id'
cos_sim_text.columns.name = 'query_id'
    
cos_sim_text.head()

query_id,84,111,265,323,396,397,403,409,426,475,...,1715,1773,1970,2006,2213,2272,2365,2429,2465,2741
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12148915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16752449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31967453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47436994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13924699,0.0,0.0,0.104828,0.133631,0.0,0.0,0.113961,0.0,0.0,0.118678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3) Create mean vector for fairness attributes and measure cosine similarities

In [7]:
mean_fairness_vec = fairness_attributes.mean()

mean_fairness_vec.head()

qual_cat_B        0.2538
qual_cat_C        0.2118
qual_cat_FA       0.0315
qual_cat_GA       0.2514
qual_cat_Start    0.1160
dtype: float64

### Option 1: Equally weighted attributes

In [8]:
cos_sim_fairness_attributes = pd.DataFrame(
    cosine_similarity(
        fairness_attributes,
        mean_fairness_vec.array.reshape(1, -1),
        dense_output=True
    ),
    columns=['cos_sim_fairness_attributes'],
    index=fairness_attributes.index
)

cos_sim_fairness_attributes.head()

Unnamed: 0_level_0,cos_sim_fairness_attributes
page_id,Unnamed: 1_level_1
12,0.96141
25,0.85106
39,0.987919
290,0.974377
303,0.755998


### Option 2: Weighting attributes to have equally weighted groups of attribute categories

In [9]:
# groups:

# qual_cat_...
# gender_category_...
# years_category_...
# num_sitelinks_category
# relative_pageviews_category_...
# page_countries_...
# page_subcont_regions_...
# occupations_...
# source_countries_...
# source_csubcont_regions_...

# test if locations on country level exists in fairness_attributes data set
if sum(['page_countries_' in col for col in fairness_attributes.columns]):
    fairness_col_cats = [
        'qual_cat_', 'gender_category_', 'years_category_',
        'num_sitelinks_category', 'relative_pageviews_category_', 'page_countries_',
        'page_subcont_regions_', 'occupations_', 'source_countries_',
        'source_csubcont_regions_'
    ]
    
else:
    fairness_col_cats = [
        'qual_cat_', 'gender_category_', 'years_category_',
        'num_sitelinks_category', 'relative_pageviews_category_',
        'page_subcont_regions_', 'occupations_', 
        'source_csubcont_regions_'
    ]
    
category_cos_sim = pd.DataFrame()
    
for fairness_col_cat in fairness_col_cats:
    columns = [col for col in fairness_attributes if fairness_col_cat in col]
    cos_sim_group = pd.DataFrame(
        cosine_similarity(
            fairness_attributes[columns],
            mean_fairness_vec[columns].array.reshape(1, -1),
            dense_output=True
        ),
        columns=[fairness_col_cat],
        index=fairness_attributes.index
    )
    category_cos_sim = pd.concat([category_cos_sim, cos_sim_group])
    
category_cos_sim.head()

ValueError: Found array with 0 feature(s) (shape=(10000, 0)) while a minimum of 1 is required by check_pairwise_arrays.

In [None]:
# mean of all categories cossine similarities
weighted_cos_sim_fairness_attributes = category_cos_sim.mean(axis=1)

weighted_cos_sim_fairness_attributes.head()

## Create rankings

In [10]:
# rank documents by:
# - the cosine similarity of the tf-idf vectors of the queries and documents text
# - the cosine similarity of their fairness attributes and the mean fairness attributes vector
# weighting of 50% each

fairness_weight = 0.5

# combine both cosine similarity values
cos_sim_combined = cos_sim_text.join(cos_sim_fairness_attributes)
query_cols = list(cos_sim_combined.columns)
query_cols.remove('cos_sim_fairness_attributes')
cos_sim_combined[query_cols] = ((1 - fairness_weight) * cos_sim_combined[query_cols]).add(
    fairness_weight * cos_sim_combined['cos_sim_fairness_attributes'],
    axis=0
)

cos_sim_combined.head()

Unnamed: 0_level_0,84,111,265,323,396,397,403,409,426,475,...,1773,1970,2006,2213,2272,2365,2429,2465,2741,cos_sim_fairness_attributes
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12148915,,,,,,,,,,,...,,,,,,,,,,
16752449,,,,,,,,,,,...,,,,,,,,,,
31967453,,,,,,,,,,,...,,,,,,,,,,
47436994,,,,,,,,,,,...,,,,,,,,,,
13924699,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# create file containing top 500 documents for each query

# ...