In [1]:
import codecs
# Codec name passed to codecs.encode / codecs.decode by adapt_array and
# convert_array; both functions must use the same value.
compressor = 'zlib'
import io
import numpy as np

def adapt_array(arr):
    """Serialize a NumPy array into a compressed value suitable for a BLOB column.

    The array is written in ``.npy`` format with ``np.save``, compressed with
    the codec named by the module-level ``compressor`` and wrapped in
    ``sqlite3.Binary``.

    Based on http://stackoverflow.com/a/31312102/190597 (SoulNibbler).

    Parameters
    ----------
    arr : array-like
        Anything ``np.save`` accepts.

    Returns
    -------
    The ``sqlite3.Binary`` wrapper around the compressed bytes.
    """
    # Imported here so the function works even when this cell is run on a
    # fresh kernel, before any of the cells that import sqlite3.
    import sqlite3

    # Author's notes on codec choice:
    #   zlib uses a disk size similar to Matlab v5 .mat files;
    #   bz2 compresses 4 times better than zlib, but storing is 20 times slower.
    buffer = io.BytesIO()
    np.save(buffer, arr)
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek/read round-trip is needed.
    return sqlite3.Binary(codecs.encode(buffer.getvalue(), compressor))  # zlib, bz2

def convert_array(text):
    """Rebuild a NumPy array from compressed ``.npy`` bytes.

    This is the inverse of ``adapt_array``: the bytes are decompressed with
    the codec named by the module-level ``compressor`` and then parsed with
    ``np.load``.

    Parameters
    ----------
    text : bytes-like
        Compressed payload, e.g. the value read back from a BLOB column.

    Returns
    -------
    The array returned by ``np.load``.
    """
    compressed = io.BytesIO(text).getvalue()
    npy_stream = io.BytesIO(codecs.decode(compressed, compressor))
    return np.load(npy_stream)

In [12]:
import sqlite3
def get_text(id):
    """Return the ``body`` of the ``raw`` row whose rowid is ``id + 1``.

    Parameters
    ----------
    id : int
        Document number; it is shifted by one before the rowid lookup
        (presumably a zero-based id -- confirm against the caller).

    Returns
    -------
    The value stored in the ``body`` column of the matching row.

    Raises
    ------
    IndexError
        If no row with rowid ``id + 1`` exists.
    """
    conn = sqlite3.connect('../swarog.sqlite')
    try:
        c = conn.cursor()
        c.execute("SELECT body from raw where rowid = ?", [id + 1])
        row = c.fetchone()
    finally:
        # Always release the connection; otherwise every call leaks one.
        conn.close()
    if row is None:
        raise IndexError("no row in 'raw' with rowid %r" % (id + 1,))
    return row[0]

# BERT annoy index

In [17]:
import sqlite3
from tqdm import tqdm
from annoy import AnnoyIndex

# Build an Annoy index over the vectors stored in the `bertnp` table and
# save it as 'swarog.ann'.
conn = sqlite3.connect('../swarog.sqlite')
try:
    c = conn.cursor()

    # 768-dimensional vectors compared with the 'angular' metric.
    t = AnnoyIndex(768, 'angular')

    # MAX(ROWID) is used as the progress-bar total; it equals the number of
    # rows only when the rowids are contiguous and start at 1.
    c.execute("SELECT MAX(ROWID) as total from bertnp")
    rows = c.fetchall()
    print(rows[0][0])

    c.execute("SELECT ROWID, vec from bertnp")

    print("indexing...")
    for row in tqdm(c, total=rows[0][0]):
        # Annoy item ids are the bertnp ROWIDs; each BLOB is decoded back
        # into an array by convert_array.
        # NOTE(review): get_text() looks up rowid = id + 1 in `raw` -- confirm
        # how these item ids map onto `raw` rows.
        _id, _vec = row[0], convert_array(row[1])
        t.add_item(_id, _vec)
    print("building model...")

    t.build(100)  # 100 trees
    t.save('swarog.ann')
finally:
    # Only SELECTs are issued, so there is nothing to commit; just make sure
    # the connection is released even if indexing fails part-way.
    conn.close()

186477
indexing...


100%|█████████████████████████████████| 186477/186477 [00:40<00:00, 4631.98it/s]


building model...


In [2]:
from annoy import AnnoyIndex
# Re-open the saved BERT index with the same dimension (768) and metric
# ('angular') that the build cell used for 'swarog.ann'.
t = AnnoyIndex(768, 'angular')
t.load('swarog.ann') # super fast, will just mmap the file

True

In [17]:
# Look up the 10 nearest neighbours of item 150000; with
# include_distances=True the call returns an (ids, distances) pair.
t.get_nns_by_item(150000, 10, search_k=-1, include_distances=True)

([150000, 151602, 162433, 165491, 13793, 16095, 93860, 96659, 161544, 174588],
 [0.0,
  0.23311839997768402,
  0.23311839997768402,
  0.23311839997768402,
  0.28941047191619873,
  0.2912265658378601,
  0.2951999008655548,
  0.2951999008655548,
  0.2982422411441803,
  0.30512621998786926])

# TF-IDF extract

In [1]:
import pandas as pd
import sqlite3

# Read sqlite query results into a pandas DataFrame.
# NOTE(review): other cells open '../swarog.sqlite'; confirm that both paths
# point at the same database.
con = sqlite3.connect("../../pickles/swarog_data/swarog.sqlite")
try:
    # Rows with a NULL body are skipped, so the frame can have fewer rows
    # than the `raw` table.
    df = pd.read_sql_query("SELECT rowid,dataset, id, body from raw where body is not null", con)
finally:
    # Release the connection even if the query fails.
    con.close()

# Verify that result of SQL query is stored in the dataframe
print(df.head())

   rowid          dataset  id  \
0      1  covid_fake_news   0   
1      2  covid_fake_news   1   
2      3  covid_fake_news   2   
3      4  covid_fake_news   3   
4      5  covid_fake_news   4   

                                                body  
0  A post claims compulsory vacination violates t...  
1  A photo claims that this person is a doctor wh...  
2  Post about a video claims that it is a protest...  
3  All deaths by respiratory failure and pneumoni...  
4  The dean of the College of Biologists of Euska...  


In [42]:
# Preview the first rows of the loaded frame (rich display instead of print).
df.head()

Unnamed: 0,rowid,dataset,id,body
0,1,covid_fake_news,0,A post claims compulsory vacination violates t...
1,2,covid_fake_news,1,A photo claims that this person is a doctor wh...
2,3,covid_fake_news,2,Post about a video claims that it is a protest...
3,4,covid_fake_news,3,All deaths by respiratory failure and pneumoni...
4,5,covid_fake_news,4,The dean of the College of Biologists of Euska...


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords

# Download the NLTK resources this cell relies on. The 'stopwords' corpus
# must be present before stopwords.words() is called, otherwise NLTK raises
# LookupError on a machine that has never downloaded it.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

# Adapter that lets sklearn use NLTK tokenization + WordNet lemmatization
class LemmaTokenizer:
    """Callable that turns a document string into a list of lemmas.

    The text is split with ``word_tokenize``; tokens listed in
    ``ignore_tokens`` are dropped and every remaining token is passed through
    ``WordNetLemmatizer.lemmatize``.
    """

    # Punctuation / quote tokens that are discarded before lemmatization.
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', "'"]

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        lemmas = []
        for token in word_tokenize(doc):
            if token in self.ignore_tokens:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Lemmatize the stop words: they are joined into a single string and run
# through the same LemmaTokenizer, giving a list of lemmatized stop words.
tokenizer=LemmaTokenizer()
token_stop = tokenizer(' '.join(stop_words))


# Create TF-idf model
# NOTE(review): only the stop-word list is passed here; `tokenizer` itself is
# not handed to TfidfVectorizer, so documents are presumably tokenized by the
# vectorizer's default rules rather than by LemmaTokenizer -- confirm intent.
vectorizer = TfidfVectorizer(stop_words=token_stop)



# Fit the vocabulary on every document body and build the sparse TF-IDF matrix.
doc_vectors = vectorizer.fit_transform(df['body'])

# Disabled example: cosine similarity of document 0 against all documents.
# # Calculate similarity
# cosine_similarities = linear_kernel(doc_vectors[0:1], doc_vectors).flatten()
# document_scores = [item.item() for item in cosine_similarities[1:]]
# # [0.0, 0.287]

# document_scores

[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Inspect the TF-IDF row of the first document (a 1 x n_features sparse matrix).
doc_vectors[0]


<1x503456 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

# TF-IDF to SQL

In [109]:
import sqlite3
# Create the table that receives one row per document vector.
# NOTE(review): this creates `stfidf`, while the insert loop further down
# writes to `tfidf` -- confirm which table name is intended.
conn = sqlite3.connect('../swarog.sqlite')
try:
    c = conn.cursor()
    # IF NOT EXISTS keeps the cell re-runnable; a plain CREATE TABLE raises
    # OperationalError the second time it is executed.
    c.execute('''CREATE TABLE IF NOT EXISTS stfidf
             (dataset TEXT, gid INT, did INT, vec BLOB)''')
    conn.commit()
finally:
    # Release the connection even if the statement fails.
    conn.close()

In [8]:
def sparsify(c):
    """Split a sparse matrix into its stored values and their coordinates.

    Parameters
    ----------
    c : object with a ``todok()`` method
        The result of ``todok()`` must provide ``items()`` yielding
        ``((row, col), value)`` pairs, as a SciPy sparse matrix does.

    Returns
    -------
    tuple
        ``(_d, _xy)`` where ``_d`` is the list of stored values and ``_xy``
        is a ``(rows, cols)`` pair of lists, all three in the same order.
    """
    # Convert to DOK once and reuse the entries; repeating the conversion for
    # each of the three lists does the same work three times over.
    entries = list(c.todok().items())
    _d = [value for _, value in entries]
    _xy = [key[0] for key, _ in entries], [key[1] for key, _ in entries]
    return _d, _xy

In [6]:
# Try the helper on one document row. The cell that defines `sparsify` must
# have been run first, otherwise this raises NameError.
sparsify(doc_vectors[100])

NameError: name 'sparsify' is not defined

In [57]:
import pickle
# SAVE the TF-IDF document matrix (doc_vectors) to disk.
with open('swarog_tfidf.pickle', 'wb') as handle:
    pickle.dump(doc_vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
import pickle
# SAVE the fitted TfidfVectorizer to disk.
with open('tfidf_vectorizer_full.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [124]:
import pickle
# LOAD a previously pickled object into `hand`.
# NOTE(review): no visible cell writes 'tfidf.pickle' (the save cells write
# 'swarog_tfidf.pickle' and 'tfidf_vectorizer_full.pickle') -- confirm the
# file name.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open('tfidf.pickle', 'rb') as handle:
    hand = pickle.load(handle)

In [110]:
from tqdm import tqdm
# Store one row per document in `tfidf`: dataset name, the rowid from `raw`
# (gid), the DataFrame index (did) and the densified TF-IDF vector as a
# compressed BLOB.
conn = sqlite3.connect('../swarog.sqlite')
try:
    c = conn.cursor()
    insert_sql = """INSERT INTO tfidf(dataset, gid, did, vec) VALUES (?,?,?,?)"""
    # Committing after every single row forces one transaction per insert.
    # Commit in batches instead: far fewer transactions, and an interruption
    # loses at most one batch.
    commit_every = 1000
    for done, (index, row) in enumerate(tqdm(df.iterrows(), total=df.shape[0]), start=1):
        # `index` doubles as the row position in doc_vectors, which relies on
        # df still having its default 0..n-1 index.
        vec = doc_vectors[index].toarray()[0]
        c.execute(insert_sql, (row['dataset'], row['rowid'], index, adapt_array(vec)))
        if done % commit_every == 0:
            conn.commit()
    conn.commit()  # flush the final partial batch
finally:
    # Release the connection even if an insert fails.
    conn.close()

100%|██████████████████████████████████████████████████████████████████████████| 185460/185460 [47:51<00:00, 64.58it/s]


# TF-IDF create index

In [56]:
import sqlite3
from tqdm import tqdm
from annoy import AnnoyIndex
import numpy as np

# Build an Annoy index over binarized TF-IDF vectors and save it as
# 'swarog_tfidf.ann'. Note that `t` is rebound here, replacing the BERT index
# that earlier cells stored under the same name.
# NOTE(review): the index is declared with 500 dimensions, while the
# doc_vectors row displayed earlier has 503456 columns -- confirm which
# vectorizer produced the vectors indexed here.
t = AnnoyIndex(500, 'angular')

# `row` is unused; only the DataFrame index is needed to pick the vector.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    vec = doc_vectors[index].toarray()[0]
    # Copy the dense row, then set every strictly positive weight to 1
    # (term presence instead of TF-IDF weight).
    _vec=np.array(vec)
    _vec[np.where(vec > 0)]=1
    t.add_item(index, list(_vec))
    
t.build(50)  # 50 trees
t.save('swarog_tfidf.ann')



100%|█████████████████████████████████| 185460/185460 [00:26<00:00, 7033.50it/s]


True

In [34]:
# Write the index to disk again, under the same file name the build cell uses.
t.save('swarog_tfidf.ann')


True

In [50]:
# Binarize document 0 the same way the indexing loop does, then list the
# column ids of its non-zero entries.
vec = doc_vectors[0].toarray()[0]
_vec=np.array(vec)
_vec[np.where(vec > 0)]=1
np.where(_vec>0)

(array([1293, 1602, 1643, 2382, 2477, 3262, 3897, 4793, 4799, 4885, 5238,
        5372, 6306, 6667, 6719]),)

In [53]:
# Show the raw text of the first document.
# NOTE(review): the displayed value contains 'doesnâ€™t', which looks like
# mis-decoded UTF-8 in the stored data -- confirm the encoding upstream.
df.iloc[0]['body']

'A post claims compulsory vacination violates the principles of bioethics, that coronavirus doesnâ€™t exist, that the PCR test returns many false positives, and that influenza vaccine is related to COVID-19.'