# Vector Space Model
## (using fairness attributes as additional dimensions)

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

### Read data

In [2]:
# load the preprocessed corpus from CSV into a pandas DataFrame
corpus = pd.read_csv(filepath_or_buffer='data/preprocessed_corpus.csv')

# preview the first three rows
corpus.head(n=3)

Unnamed: 0,id,text
0,12148915,Keith Osik Keith Richard Osik (born October 2...
1,16752449,Swansons Landing Texas Swansons Landing is a ...
2,31967453,Mike Potts Mike or Michael Potts may refer to...


In [3]:
# load the preprocessed query topics and discard the URL column in one step
# NOTE(review): the .jsonl extension hints at line-delimited JSON, which
# pd.read_json only parses with lines=True -- confirm the actual file layout.
queries = pd.read_json('data/train_topics_meta.jsonl').drop(columns=['URL'])

queries.head()

Unnamed: 0,id,keywords,title,rel_docs
0,84,"['cultivated', 'agricultural', 'maize', 'corn'...",Agriculture,"[572, 627, 678, 903, 1193, 1542, 1634, 3751, 3..."
1,111,"['reptile', 'lizard', 'salamander', 'fossil', ...",Amphibians and Reptiles,"[621, 809, 1380, 6641, 8311, 8937, 13134, 1446..."
2,265,"['astronomer', 'astronomy', 'astrophysicist', ...",Astronomy,"[39, 308, 580, 664, 736, 748, 791, 798, 799, 1..."
3,323,"['aviation', 'airfield', 'airport', 'aerospace...",Aviation,"[849, 852, 1293, 1902, 1942, 2039, 2075, 2082,..."
4,396,"['actor', 'cast', 'screenwriter', 'filmmaker',...",Biography/WikiProject Actors and Filmmakers,"[344, 676, 808, 872, 1247, 1806, 1828, 2083, 2..."


In [None]:
# read dataset containing fairness attributes
# TODO: this cell was left unfinished. Its only statement was the fragment
#   pd.read_json('
# (no file path, unterminated string literal), which is a SyntaxError and stops
# "Restart Kernel -> Run All". Fill in the path of the fairness-attribute file
# and restore the read call before relying on this cell.

### Create tf-idf vectors

In [4]:
# learn the tf-idf vocabulary and weights from the corpus text and vectorize it;
# the fitted `vectorizer` is reused further down to transform the queries
vectorizer = TfidfVectorizer()
corpus_tf_idf_vecs = vectorizer.fit_transform(corpus.loc[:, 'text'])

# terms of the fitted vocabulary (used as column labels further down)
corpus_words = vectorizer.get_feature_names_out()

In [5]:
# show the vocabulary terms obtained from the fitted vectorizer
corpus_words

array(['17', '19', '22', 'born', 'clark', 'inventor', 'is', 'keith',
       'landing', 'march', 'may', 'mi', 'michael', 'mike', 'notable',
       'october', 'or', 'osik', 'people', 'potts', 'refer', 'richard',
       'sett', 'shuker', 'surname', 'swansons', 'test', 'texas', 'th',
       'to', 'william', 'with'], dtype=object)

In [6]:
# Dense preview of the first corpus tf-idf rows. The rows are sliced *before*
# .toarray() so only the previewed rows are densified instead of materialising
# the whole documents x vocabulary matrix just to display it.
corpus_tf_idf_vecs[:5].toarray()

array([[0.        , 0.23139449, 0.23139449, 0.23139449, 0.        ,
        0.        , 0.        , 0.46278899, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.23139449, 0.        , 0.46278899, 0.        , 0.        ,
        0.        , 0.23139449, 0.        , 0.        , 0.        ,
        0.        , 0.55130358, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.19966624, 0.        , 0.49496233, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24748116, 0.        , 0.        ,
        0.49496233, 0.5896305 , 0.24748116, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.    

In [7]:
# tf-idf vectors of the queries, computed with the vectorizer fitted on the
# corpus (transform only, no refit)
queries_tf_idf_vecs = vectorizer.transform(queries.loc[:, 'keywords'])

In [11]:
# inspect the raw keywords value of the query labelled 0
queries.loc[0, 'keywords']

"['cultivated', 'agricultural', 'maize', 'corn', 'fruit', 'wheat', 'agriculture', 'cultivar', 'olive', 'livestock', 'mammal', 'honeybee', 'potato', 'poultry', 'solanum', 'bee', 'cornbread', 'beekeeping', 'honeycrop', 'cherimoya', 'potatoe', 'oleaceae', 'beekeeper', 'cherimola', 'pig', 'honey',  'cornstarch', 'beehive', 'beeline', 'porcine', 'beewolf', 'cornmeal', 'corncob', 'beeswax', 'sugarcane', 'maizegdb', 'cereal', 'chirimuya', 'cattle', 'grain', 'honeycomb', 'umbonata', 'oleae', 'annonaceae', 'hive', 'buffalo', 'chicken', 'sorghum', 'rice',  'cornstalk', 'popcorn', 'strawberry', 'nectar', 'flour', 'plumage',  'rodent', 'vegetable', 'breeding', 'fowl', 'solanaceae', 'cavy', 'olea', 'mammalian', 'pear', 'nutrition', 'hamster', 'cavia',  'banana', 'buffalopedia', 'caviae',  'rabbit', 'polyphenols', 'river', 'botanical', 'citrus', 'polyphenol', 'bubalus', 'freshwater', 'mellifera', 'hen',  'breed', 'insect', 'apidae',  'planting',  'pollen', 'amnuaydechkorn',  'kapi', 'soybean', 'prod

In [22]:
# Preview the query tf-idf vectors as a table with one column per corpus term.
# Only the first 10 rows are densified: calling .toarray() on the entire matrix
# allocates every query x vocabulary cell just to render a preview.
pd.DataFrame(queries_tf_idf_vecs[:10].toarray(), columns=corpus_words)

Unnamed: 0,17,19,22,born,clark,inventor,is,keith,landing,march,...,sett,shuker,surname,swansons,test,texas,th,to,william,with
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
