In [228]:
import sys
sys.path.insert(1, '/scratch/cinthiasouza/mv-text-summarizer')
import os
import itertools
import re

import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px

from src import preprocess
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD


In [64]:
%load_ext autoreload
%autoreload 2

from src import loader
from src import extract_features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [217]:
paths = ["../../PMC001xxxxxx_pp", "../../PMC002xxxxxx_pp","../../PMC003xxxxxx_pp", "../../PMC004xxxxxx_pp", "../../PMC005xxxxxx_pp", "../../PMC006xxxxxx_pp"]

In [281]:
def load_files(name, path):


    files = directory_things = [i for i in os.listdir(path) if i[0] != "."]
    text, files = loader.load_files(path, files)
    journal_titles = [i.get('journal_title')  for i in text]
    article_titles = [i.get('sec_title') for i in text]

    
    
    df = pd.DataFrame({"text": journal_titles, 'article_title': article_titles, 'files': files})
    df.to_csv("{}_titles.csv".format(name), index=False)
    

In [219]:
paths

['../../PMC001xxxxxx_pp',
 '../../PMC002xxxxxx_pp',
 '../../PMC003xxxxxx_pp',
 '../../PMC004xxxxxx_pp',
 '../../PMC005xxxxxx_pp',
 '../../PMC006xxxxxx_pp']

In [282]:
from multiprocessing import Process, Queue


l1 = Queue()
p1 = Process(
    target=load_files, args=('PMC001', "../../PMC001xxxxxx_pp"))

l2 = Queue()
p2 = Process(
    target=load_files, args=('PMC002', "../../PMC002xxxxxx_pp"))

l3 = Queue()
p3 = Process(
    target=load_files, args=('PMC003', "../../PMC003xxxxxx_pp"))

l4 = Queue()
p4 = Process(
    target=load_files, args=('PMC004',"../../PMC004xxxxxx_pp"))

l5 = Queue()
p5 = Process(
    target=load_files, args=('PMC005', "../../PMC005xxxxxx_pp"))

l6 = Queue()
p6 = Process(
    target=load_files, args=('PMC006',"../../PMC006xxxxxx_pp"))


p1.start()
p2.start()
p3.start()
p4.start()
p5.start()
p6.start()

In [283]:
names = ['PMC001','PMC002', 'PMC003', 'PMC004', 'PMC005', 'PMC006']
dfs = [pd.read_csv("{}_titles.csv".format(name)) for name in names]
df = pd.concat(dfs)

In [284]:
df

Unnamed: 0,text,article_title,files
0,NeuroImage,Reliable identification of the auditory thalam...,PMC1458525.json
1,Social cognitive and affective neuroscience,Adolescent development of the neural circuitry...,PMC1948845.json
2,Quarterly journal of experimental psychology (...,Asymmetry of Attentional Set in Rhesus Monkeys...,PMC1764629.json
3,Journal of personality disorders,THE EFFECTIVENESS OF COGNITIVE BEHAVIOR THERAP...,PMC1852259.json
4,BJOG : an international journal of obstetrics ...,Routines in facility-based maternity care: evi...,PMC1457116.json
...,...,...,...
13600,Clinical pediatrics,Bronchodilator Use for Acute Chest Syndrome Am...,PMC6505689.json
13601,Neurobiology of aging,Association of Telomere Length with General Co...,PMC6064381.json
13602,Hormones and behavior,Mechanisms underlying the rapid effects of est...,PMC6226372.json
13603,International journal of audiology,Costs and Effectiveness of Hearing Conservatio...,PMC6188788.json


In [44]:
def stemming(text):

    ps = PorterStemmer()
    words = text.split(" ")
    words = [ps.stem(w) for w in words  if not w in stop_words]

    return " ".join(words)

In [98]:
def plot_clusters(df, projections):
    
    fig = px.scatter(projections, x=0, y=1, color=df.label)
    fig.show()

In [46]:
def concat_sentences(df, label):
    
    try:
        return ' '.join(list(df.loc[df['articles'] == label]['sentences']))
    except:
        return " "

In [47]:
def create_df(features, labels):
    
    df = pd.DataFrame()
    texts = []
    
    for i in labels:
        texts.append(concat_sentences(features, i))
        
    df['texts'] = texts
    df['articles'] = labels
    
    return df

# TFIDF

In [262]:
def remove_noise2(text):
    
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'/[^a-zA-Z ]/g', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = text.replace('journal', '')
    text = text.replace("proceedings", '')
    
    return text

In [139]:
df =  pd.DataFrame({'text': titles, 'articles': files})

In [285]:
vfunc = np.vectorize(preprocess.remove_noise)
df['text'] = vfunc(df['text'])
df['article_title'] = vfunc(df['article_title'])

In [286]:
vfunc = np.vectorize(remove_noise2)
df['text'] = vfunc(df['text'])
df['article_title'] = vfunc(df['article_title'])

In [287]:
vfunc = np.vectorize(stemming)
df['text'] = vfunc(df['text'])
df['article_title'] = vfunc(df['article_title'])

In [318]:
df['text'].str.split(' ').str.len() > 1

0        False
1         True
2         True
3         True
4         True
         ...  
13600     True
13601     True
13602     True
13603     True
13604     True
Name: text, Length: 69811, dtype: bool

In [319]:
df.loc[df['text'].str.split(' ').str.len() > 1]

Unnamed: 0,text,article_title,files
1,social cognit affect neurosci,adolesc develop neural circuitri think intent,PMC1948845.json
2,quarterli experiment psycholog,asymmetri attent set rhesu monkey learn colo...,PMC1764629.json
3,person disord,effect cognit behavior therapi borderlin pers...,PMC1852259.json
4,bjog intern obstetr gynaecolog,routin facil base matern care evid arab world,PMC1457116.json
6,archiv biochemistri biophys,near infrar multiphoton induc gener detect hy...,PMC1995038.json
...,...,...,...
13600,clinic pediatr,bronchodil use acut chest syndrom among larg p...,PMC6505689.json
13601,neurobiolog age,associ telomer length gener cognit trajectori...,PMC6064381.json
13602,hormon behavior,mechan underli rapid effect estradiol progest...,PMC6226372.json
13603,intern audiolog,cost effect hear conserv program metal manu...,PMC6188788.json


In [333]:
df['concat'] = df['text'] + ' ' + df['article_title']

In [334]:
df

Unnamed: 0,text,article_title,files,concat
0,neuroimag,reliabl identif auditori thalamu use multi mo...,PMC1458525.json,neuroimag reliabl identif auditori thalamu us...
1,social cognit affect neurosci,adolesc develop neural circuitri think intent,PMC1948845.json,social cognit affect neurosci adolesc develop ...
2,quarterli experiment psycholog,asymmetri attent set rhesu monkey learn colo...,PMC1764629.json,quarterli experiment psycholog asymmetri a...
3,person disord,effect cognit behavior therapi borderlin pers...,PMC1852259.json,person disord effect cognit behavior therap...
4,bjog intern obstetr gynaecolog,routin facil base matern care evid arab world,PMC1457116.json,bjog intern obstetr gynaecolog routin faci...
...,...,...,...,...
13600,clinic pediatr,bronchodil use acut chest syndrom among larg p...,PMC6505689.json,clinic pediatr bronchodil use acut chest syndr...
13601,neurobiolog age,associ telomer length gener cognit trajectori...,PMC6064381.json,neurobiolog age associ telomer length gener ...
13602,hormon behavior,mechan underli rapid effect estradiol progest...,PMC6226372.json,hormon behavior mechan underli rapid effect e...
13603,intern audiolog,cost effect hear conserv program metal manu...,PMC6188788.json,intern audiolog cost effect hear conserv pr...


In [335]:
vocabulary = list(itertools.chain(*[i.split(" ")for i in list(df['concat'])]))

In [336]:
vocabulary = sorted(set(vocabulary))

In [337]:
vocabulary = list(filter(None, vocabulary))

In [338]:
vec = TfidfVectorizer(stop_words="english", use_idf=True, vocabulary=vocabulary)
vec.fit(df.text.values)

TfidfVectorizer(stop_words='english',
                vocabulary=['aa', 'aaa', 'aaag', 'aaai', 'aac', 'aaem', 'aag',
                            'aai', 'aall', 'aamc', 'aaml', 'aan', 'aancreat',
                            'aaohn', 'aap', 'aapeptid', 'aapo', 'aari', 'aarp',
                            'aasap', 'aast', 'aatcc', 'aatd', 'aav', 'aavr',
                            'aba', 'ababa', 'abacavir', 'abamectin', 'abandon', ...])

In [339]:
features = vec.transform(df.text.values)

In [340]:
features_df = pd.DataFrame(features.toarray(), columns=vocabulary)

# Embeddings

In [62]:
import spacy
from collections import Counter

nlp_md = spacy.load('en_core_web_md')

In [65]:
embed = extract_features.sentence_embeddings(text, nlp_md)

# TSNE

In [75]:
tsne = TSNE(n_components=2, random_state=0, n_jobs=-1)
projections = tsne.fit_transform(features)

## DBSCAN

In [353]:
clustering = DBSCAN(eps=0.25, n_jobs=-1, metric='cosine').fit(features)
clustering.labels_

array([  0,   1,   2, ..., 858, 177, 679])

In [180]:
plot_clusters(clustering, embed)

AttributeError: 'DBSCAN' object has no attribute 'label'

In [356]:
count_dbscan = Counter(clustering.labels_)

In [357]:
count_dbscan

Counter({0: 581,
         1: 43,
         2: 24,
         3: 88,
         4: 64,
         5: 85,
         6: 38,
         7: 10,
         8: 24,
         9: 43,
         10: 91,
         -1: 4638,
         11: 19,
         12: 208,
         13: 165,
         14: 99,
         15: 359,
         16: 57,
         17: 16,
         18: 40,
         19: 62,
         20: 5,
         21: 35,
         22: 1246,
         23: 39,
         24: 165,
         25: 89,
         26: 43,
         27: 159,
         28: 34,
         29: 86,
         30: 88,
         31: 64,
         32: 11,
         33: 11,
         34: 76,
         35: 10,
         36: 9,
         37: 16,
         38: 130,
         39: 234,
         40: 76,
         41: 75,
         42: 82,
         43: 243,
         44: 345,
         45: 172,
         46: 28,
         47: 22,
         48: 15,
         49: 21,
         50: 45,
         51: 163,
         52: 28,
         53: 34,
         54: 10,
         55: 7,
         56: 354,
         5

In [348]:
df_plot = pd.DataFrame({'label': list(map(str, clustering.labels_)), 'hover': df['concat']})

In [205]:
indices = [index for index, value in enumerate(clustering.labels_) if value == 18]

In [206]:
m2 = (features_df.iloc[indices] != 0).any()
a = m2.index[m2]
print (a)

Index(['annal', 'electrocardiolog', 'holter', 'intern', 'noninva', 'offici',
       'societi'],
      dtype='object')


In [349]:
df_plot.loc[df_plot['label'] == '20']

Unnamed: 0,label,hover
25,20,neurosci offici societi neurosci regul s...
10,20,american geriatr psychiatri offici america...
48,20,neurosci development alter olivari climb fibe...
70,20,molecular psychiatri meta analysi genom wide ...
83,20,biolog psychiatri effect intraven ketamin ex...
...,...,...
13185,20,american psychiatri electroencephalograph bi...
13256,20,psychiatri research blood cadmium depress symp...
13332,20,cognit neurosci contralater delay activ inde...
13490,20,geriatr nurs achiev self manag goal among low...


### ELBOW

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from yellowbrick.cluster import KElbowVisualizer

# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4,12))

visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show()   

## K-means

In [358]:
kmeans = KMeans(n_clusters=30, random_state=0).fit(features)
kmeans.labels_

array([23,  2, 22, ...,  6, 25,  0], dtype=int32)

In [359]:
count_km = Counter(kmeans.labels_)
count_km

Counter({23: 590,
         2: 1625,
         22: 1500,
         1: 869,
         25: 2337,
         0: 28484,
         7: 1187,
         19: 2026,
         29: 3110,
         3: 2042,
         5: 3405,
         27: 2994,
         10: 1711,
         21: 2654,
         17: 1003,
         8: 1524,
         14: 962,
         28: 804,
         4: 1459,
         18: 521,
         20: 1476,
         24: 870,
         15: 1993,
         12: 603,
         9: 682,
         6: 1654,
         16: 582,
         26: 463,
         13: 558,
         11: 123})

In [360]:
df_plot = pd.DataFrame({'label': list(map(str, kmeans.labels_)), 'hover': df['concat']})

In [112]:
indices = [index for index, value in enumerate(kmeans.labels_) if value == 3]

In [362]:
df_plot.loc[df_plot['label'] == '0']

Unnamed: 0,label,hover
5,0,atherosclerosi atorvastatin prevent hypoxia in...
12,0,visual cognit face fear effect eye gaze emot...
13,0,current biolog drosophila rassf homolog antag...
14,0,vascular pharmacolog angiogen transcriptom hu...
17,0,biochem pharmacolog gea peroxynitrit donor i...
...,...,...
13593,0,perinatolog offici california perinat asso...
13596,0,molecular biolog molecular mechan spontan n...
13597,0,hepatolog deceas pediatr donor liver current ...
13601,0,neurobiolog age associ telomer length gener ...


In [365]:
", ".join(pd.unique(df_plot.loc[df_plot['label'] == '0']))

ValueError: could not broadcast input array from shape (28484,2) into shape (28484)