In [1]:
import os

import pandas as pd

# Data loading

In [2]:
# Directory prefixes (each ends with a slash) that other cells prepend to file names.
DATA_ROOT, OUTPUT_ROOT = "data/", "output/"

In [3]:
# Read the line-delimited JSON corpus, order the rows by the raw "_id" key,
# and expose that key under the name "corpus-id".
corpus = (
    pd.read_json(DATA_ROOT + "corpus.jsonl", lines=True)
    .sort_values("_id")
    .rename(columns={"_id": "corpus-id"})
)
corpus.head()

Unnamed: 0,corpus-id,text
1000000,0,The presence of communication amid scientific ...
966376,8,"In June 1942, the United States Army Corps of ..."
468831,12,Tutorial: Introduction to Restorative Justice....
1000001,16,The approach is based on a theory of justice t...
306952,23,Phloem is a conductive (or vascular) tissue fo...


In [4]:
# Read the line-delimited JSON queries, order them by "_id", trim leading and
# trailing whitespace from the query text, discard the "metadata" column and
# expose "_id" under the name "query-id".
queries = (
    pd.read_json(DATA_ROOT + "queries.jsonl", lines=True)
    .sort_values("_id")
    .assign(text=lambda frame: frame["text"].str.strip())
    .drop(columns="metadata")
    .rename(columns={"_id": "query-id"})
)
queries.head()

Unnamed: 0,query-id,text
506217,2,Androgen receptor define
65864,3,Another name for the primary visual cortex is
372466,4,Defining alcoholism as a disease is associated...
326447,5,ECT is a treatment that is used for
117580,6,"Ebolavirus is an enveloped virus, which means"


In [5]:
# Training map between queries and corpus documents, read from a tab-separated file.
query_corpus_train_map = pd.read_csv(
    DATA_ROOT + "task1_train.tsv",
    sep="\t",
)
# Display only: the sorted view is shown but not stored back into the variable.
query_corpus_train_map.sort_values("query-id")

Unnamed: 0,query-id,corpus-id,score
70257,3,1142680,1
395137,4,5613529,1
346352,5,4956428,1
125307,6,1931409,1
66896,8,1094214,1
...,...,...,...
169115,1185863,2545716,1
88577,1185864,1408016,1
8141,1185865,229186,1
1,1185868,16,1


In [6]:
# Inner-join the queries with the training map on "query-id", then discard the
# map's own columns so that only the query fields remain.
queries_train = (
    queries
    .merge(query_corpus_train_map, on="query-id", how="inner")
    .drop(columns=["score", "corpus-id"])
)
# Keep the first 7437 rows by position.
# NOTE(review): 7437 is a hard-coded size; presumably it mirrors another
# split's row count. Confirm, and consider giving it a named constant.
queries_train_subset = queries_train.head(7437)
queries_train_subset

Unnamed: 0,query-id,text
0,3,Another name for the primary visual cortex is
1,4,Defining alcoholism as a disease is associated...
2,5,ECT is a treatment that is used for
3,6,"Ebolavirus is an enveloped virus, which means"
4,8,"In humans, the normal set point for body tempe..."
...,...,...
7432,18204,anger is fear
7433,18205,anger management definition
7434,18208,angie baby meaning
7435,18209,angie lindvall


In [7]:
# Test split, read from a tab-separated file.
df_test = pd.read_csv(
    DATA_ROOT + "task1_test.tsv",
    sep="\t",
)
# Attach the query text to every test row (inner join on "query-id") and
# drop the test file's own "id" column.
queries_test = (
    queries
    .merge(df_test, on="query-id", how="inner")
    .drop(columns="id")
)
queries_test

Unnamed: 0,query-id,text
0,2,Androgen receptor define
1,1215,3 levels of government in canada and their res...
2,1288,3/5 of 60
3,1576,60x40 slab cost
4,2235,Bethel University was founded in what year
...,...,...
7432,1102335,why do people buy cars
7433,1102351,why do jefferson and stanton include these sim...
7434,1102390,why do children get aggressive
7435,1102393,why do celebrate st patrick's day


# Tests

### TF-IDF on the corpus

In [8]:
from tfidf import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\balanton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balanton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Work on the first ten corpus rows only.
corpus_subset = corpus.head(10)

# `tfidf` is presumably supplied by the wildcard import from the `tfidf`
# module; its five return values are unpacked in the order given here.
(
    original_documents,
    documents,
    document_vectors,
    vocabulary,
    idf,
) = tfidf(corpus_subset["text"])

### Saving the results to a CSV file

In [10]:
# Left part: the subset without its text column, given a fresh 0..n-1 index.
df1 = corpus_subset.reset_index(drop=True).drop(columns="text")
# Right part: a frame built from `document_vectors`, with one column per
# entry of `vocabulary`.
df2 = pd.DataFrame(data=document_vectors, columns=vocabulary)

# Index-aligned left join of the two parts.
weights_df = df1.join(df2, how="left")
weights_df.head()

Unnamed: 0,corpus-id,1,1882,1890,1942,2,3,90,a,abnorm,...,usual,varieti,variou,vascular,vessel,victim,water,wrongdo,xylem,yellow
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,0.0,0.0,0.0,2.302585,1.609438,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,0.767528,0.0,0.0,0.0,0.536479,0.767528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.302585,0.0,1.151293,0.0,0.0
4,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.151293,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Persist the weight table as CSV, without the positional index column.
# `to_csv` does not create missing parent directories, so create the target
# directory first; otherwise this cell fails on a checkout that has no
# output directory yet.
corpus_weights_path = OUTPUT_ROOT + "corpus_weights.csv"
os.makedirs(os.path.dirname(corpus_weights_path) or ".", exist_ok=True)
weights_df.to_csv(corpus_weights_path, index=False)

In [12]:
# Read the weight table back from the CSV file written above.
corpus_weights = pd.read_csv(
    filepath_or_buffer=OUTPUT_ROOT + "corpus_weights.csv",
)

In [13]:
# Show the first five rows of the reloaded table.
corpus_weights.head(n=5)

Unnamed: 0,corpus-id,1,1882,1890,1942,2,3,90,a,abnorm,...,usual,varieti,variou,vascular,vessel,victim,water,wrongdo,xylem,yellow
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,0.0,0.0,0.0,2.302585,1.609438,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,0.767528,0.0,0.0,0.0,0.536479,0.767528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.302585,0.0,1.151293,0.0,0.0
4,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.151293,0.0,0.0,0.0,0.0,0.0,0.0
