In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd

In [2]:
def bag_of_words(texts):
    """Encode documents as a dense bag-of-words count matrix.

    A new ``CountVectorizer`` with default settings is fitted on ``texts``
    and the resulting sparse matrix is converted to a dense array.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        Dense array produced by ``fit_transform(texts).toarray()``.
    """
    counts = CountVectorizer().fit_transform(texts)
    return counts.toarray()



In [3]:
def n_grams(texts, n=2):
    """Encode documents as dense counts of word n-grams of a single size.

    Args:
        texts: Iterable of raw document strings.
        n: Used as both the lower and upper bound of ``ngram_range``.
            Defaults to 2.

    Returns:
        Dense array produced by ``fit_transform(texts).toarray()``.
    """
    span = (n, n)
    ngram_counter = CountVectorizer(ngram_range=span)
    return ngram_counter.fit_transform(texts).toarray()



In [4]:
def tfidf(texts):
    """Encode documents as a dense TF-IDF matrix.

    A new ``TfidfVectorizer`` with default settings is fitted on ``texts``
    and the resulting sparse matrix is converted to a dense array.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        Dense array produced by ``fit_transform(texts).toarray()``.
    """
    weighted = TfidfVectorizer().fit_transform(texts)
    return weighted.toarray()



In [52]:
def lda_encoding(texts, n_topics=5, random_state=None):
    """Encode documents with a Latent Dirichlet Allocation model.

    The documents are first turned into token counts with a default
    ``CountVectorizer``; an LDA model is then fitted on those counts and
    its per-document output is returned.

    Args:
        texts: Iterable of raw document strings.
        n_topics: Number of LDA components to fit. Defaults to 5.
        random_state: Seed (or ``RandomState``) forwarded to
            ``LatentDirichletAllocation``. The default ``None`` keeps the
            previous unseeded behaviour; pass an int to make the features
            reproducible across runs.

    Returns:
        The array returned by ``LatentDirichletAllocation.fit_transform``.
    """
    term_counts = CountVectorizer().fit_transform(texts)
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=random_state,
    )
    return lda_model.fit_transform(term_counts)

In [16]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

def word2vec_encoding(texts, vector_size=100, window=5, min_count=1):
    """Train Word2Vec on ``texts`` and embed each document as a mean vector.

    Args:
        texts: Iterable of raw document strings.
        vector_size: Forwarded to ``Word2Vec`` and used as the length of
            the zero vector for documents with no usable tokens.
        window: Forwarded to ``Word2Vec``.
        min_count: Forwarded to ``Word2Vec``.

    Returns:
        A ``(doc_vectors, model)`` tuple: the array of per-document
        vectors and the trained ``Word2Vec`` model.
    """
    # Tokenize every document once; the same tokens feed training and averaging.
    corpus = [simple_preprocess(document) for document in texts]

    model = Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=4
    )

    def _document_vector(tokens):
        # Average the vectors of the tokens the model knows about; fall
        # back to an all-zero vector when none of them are known.
        known = [model.wv[token] for token in tokens if token in model.wv]
        if known:
            return np.mean(known, axis=0)
        return np.zeros(vector_size)

    embedded = [_document_vector(tokens) for tokens in corpus]
    return np.array(embedded), model

In [17]:
def load_df_from_json(json_file_path):
    """Read a JSON file into a DataFrame, tolerating load failures.

    Args:
        json_file_path: Path handed to ``pd.read_json``.

    Returns:
        The loaded DataFrame. If reading raises, the error is printed and
        an empty DataFrame is returned, so callers always receive the same
        type (``.columns``, ``.empty`` and ``pd.concat`` keep working)
        instead of a bare list.
    """
    try:
        return pd.read_json(json_file_path)
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty
        # frame rather than a list, which would break DataFrame callers.
        print(e)
        return pd.DataFrame()


In [22]:
# Load the first cleaned dataset on its own and print its column names
# as a quick look at the schema.
d1 = load_df_from_json('./Cleaned_data/cleaned-dataset1.json')
print(d1.columns)

Index(['Article Title', 'Abstract', 'Label'], dtype='object')
0    gene expression analysis
Name: Label, dtype: object


In [53]:

def apply_bow(df, text_column):
    """Append bag-of-words count features for ``text_column`` to ``df``.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the document strings.

    Returns:
        A new DataFrame: the original columns followed by one ``bow_<i>``
        column per column of the bag-of-words matrix.
    """
    texts = df[text_column].tolist()
    bow_matrix = bag_of_words(texts)
    bow_df = pd.DataFrame(
        bow_matrix,
        # Reuse the caller's index: concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would misalign rows (and introduce
        # NaNs) whenever df is filtered, shuffled or otherwise re-indexed.
        index=df.index,
        columns=[f'bow_{i}' for i in range(bow_matrix.shape[1])],
    )
    return pd.concat([df, bow_df], axis=1)


def apply_ngrams(df, text_column, n=2):
    """Append n-gram count features for ``text_column`` to ``df``.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the document strings.
        n: N-gram size passed through to ``n_grams``. Defaults to 2.

    Returns:
        A new DataFrame: the original columns followed by one
        ``ngram_<i>`` column per column of the n-gram matrix.
    """
    texts = df[text_column].tolist()
    ngrams_matrix = n_grams(texts, n)
    ngrams_df = pd.DataFrame(
        ngrams_matrix,
        # Reuse the caller's index: concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would misalign rows (and introduce
        # NaNs) whenever df is filtered, shuffled or otherwise re-indexed.
        index=df.index,
        columns=[f'ngram_{i}' for i in range(ngrams_matrix.shape[1])],
    )
    return pd.concat([df, ngrams_df], axis=1)


def apply_tfidf(df, text_column):
    """Append TF-IDF features for ``text_column`` to ``df``.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the document strings.

    Returns:
        A new DataFrame: the original columns followed by one
        ``tfidf_<i>`` column per column of the TF-IDF matrix.
    """
    texts = df[text_column].tolist()
    tfidf_matrix = tfidf(texts)
    tfidf_df = pd.DataFrame(
        tfidf_matrix,
        # Reuse the caller's index: concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would misalign rows (and introduce
        # NaNs) whenever df is filtered, shuffled or otherwise re-indexed.
        index=df.index,
        columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
    )
    return pd.concat([df, tfidf_df], axis=1)


def apply_lda(df, text_column, n_topics=5):
    """Append LDA features for ``text_column`` to ``df``.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the document strings.
        n_topics: Number of topics passed through to ``lda_encoding``.
            Defaults to 5.

    Returns:
        A new DataFrame: the original columns followed by one
        ``lda_topic_<i>`` column per column of the LDA matrix.
    """
    texts = df[text_column].tolist()
    lda_matrix = lda_encoding(texts, n_topics)
    lda_df = pd.DataFrame(
        lda_matrix,
        # Reuse the caller's index: concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would misalign rows (and introduce
        # NaNs) whenever df is filtered, shuffled or otherwise re-indexed.
        index=df.index,
        columns=[f'lda_topic_{i}' for i in range(lda_matrix.shape[1])],
    )
    return pd.concat([df, lda_df], axis=1)


def apply_word2vec(df, text_column, vector_size=100, window=5, min_count=1):
    """Append averaged Word2Vec features for ``text_column`` to ``df``.

    Args:
        df: Source DataFrame.
        text_column: Name of the column holding the document strings.
        vector_size: Embedding size passed through to ``word2vec_encoding``.
        window: Context window passed through to ``word2vec_encoding``.
        min_count: Minimum count passed through to ``word2vec_encoding``.

    Returns:
        A new DataFrame: the original columns followed by ``w2v_0`` ..
        ``w2v_<vector_size - 1>``.
    """
    texts = df[text_column].tolist()
    # word2vec_encoding returns (document_vectors, trained_model); only the
    # vectors belong in the feature frame. Passing the whole tuple to
    # pd.DataFrame, as before, cannot produce the intended feature frame.
    w2v_matrix, _model = word2vec_encoding(texts, vector_size, window, min_count)
    w2v_df = pd.DataFrame(
        w2v_matrix,
        # Reuse the caller's index: concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would misalign rows (and introduce
        # NaNs) whenever df is filtered, shuffled or otherwise re-indexed.
        index=df.index,
        columns=[f'w2v_{i}' for i in range(vector_size)],
    )
    return pd.concat([df, w2v_df], axis=1)

In [25]:
def collate_files(arr_files):
    """Load several JSON files and stack them into one DataFrame.

    Each path is read with ``load_df_from_json``. Results that are not a
    non-empty DataFrame (the loader's failure value) are skipped with a
    printed notice, so one unreadable file does not make ``pd.concat``
    fail on a non-DataFrame object.

    Args:
        arr_files: Iterable of JSON file paths.

    Returns:
        The row-wise concatenation of all loaded frames with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no file could be loaded (including an empty
            ``arr_files``).
    """
    arr_dfs = []
    for f in arr_files:
        loaded = load_df_from_json(f)
        if isinstance(loaded, pd.DataFrame) and not loaded.empty:
            arr_dfs.append(loaded)
        else:
            print(f'Skipping {f}: no data loaded')
    if not arr_dfs:
        # Same exception type pd.concat raises for an empty list, but with
        # a message that points at the real problem.
        raise ValueError('No objects to concatenate: none of the files could be loaded')
    return pd.concat(arr_dfs, ignore_index=True)

In [32]:
# Merge the five cleaned datasets into one frame, then print the first
# occurrence of each distinct value in the 'Label' column.
d = collate_files([
    './Cleaned_data/cleaned-dataset1.json',
    './Cleaned_data/cleaned-dataset2.json',
    './Cleaned_data/cleaned-dataset3.json',
    './Cleaned_data/cleaned-dataset4.json',
    './Cleaned_data/cleaned-dataset5.json',
])
print(d['Label'].drop_duplicates())


0                       gene expression analysis
200        sequence classification and alignment
400    protein structure and function prediction
600                    biological image analysis
800                   disease outcome prediction
Name: Label, dtype: object


In [42]:
def drop_originals(df):
    """Replace the raw text/label columns with an integer label column.

    The ``Label`` column is factorized into integer codes (numbered in
    order of first appearance) and stored as ``num_label``; the original
    ``Label``, ``Abstract`` and ``Article Title`` columns are then removed.

    The input frame is left untouched: the result is built on a new frame
    rather than by adding ``num_label`` to the caller's DataFrame.

    Args:
        df: DataFrame containing ``Label``, ``Abstract`` and
            ``Article Title`` columns.

    Returns:
        A new DataFrame with the three original columns dropped and
        ``num_label`` added.
    """
    labelled = df.assign(num_label=pd.factorize(df['Label'])[0])
    return labelled.drop(columns=['Label', 'Abstract', 'Article Title'])

In [55]:
# Build each encoding of the 'Abstract' column, strip the original
# text/label columns via drop_originals, and pickle the result.
# Each frame is written before the next encoding is computed.

# Bag-of-words counts.
bowd = drop_originals(apply_bow(d, 'Abstract'))
bowd.to_pickle('bow.pkl')

# N-gram counts (apply_ngrams is called with its default n).
ngrd = drop_originals(apply_ngrams(d, 'Abstract'))
ngrd.to_pickle('ngr.pkl')

# TF-IDF weights.
tfid = drop_originals(apply_tfidf(d, 'Abstract'))
tfid.to_pickle('tfi.pkl')

# LDA features (apply_lda is called with its default n_topics).
ldad = drop_originals(apply_lda(d, 'Abstract'))
ldad.to_pickle('lda.pkl')

