In [1]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import numpy as np
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
import spacy
import scattertext

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

In [4]:
from gensim.utils import lemmatize
from nltk.corpus import stopwords

In [5]:
# Theme keys; each one has a title list at data/theme_list/<theme>_aos.txt.
themes = ['adolescent', 'geriatric', 'mnchn', 'specpop']

# Maps theme -> tuple of the comma-separated entries on the first line of its file.
theme_dict = {}
for theme in themes:
    list_path = 'data/theme_list/' + theme + '_aos.txt'
    with open(list_path, 'r') as f:
        # Only the first line is used. Drop its line terminator so the last
        # entry does not end in '\n' (which would never match a stored title).
        first_line = f.readline().rstrip('\r\n')
    if not first_line:
        # Fail with a clear message instead of an opaque IndexError.
        raise ValueError('no titles found on the first line of ' + list_path)
    theme_dict[theme] = tuple(first_line.split(','))

In [6]:
import pandas as pd
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Document(Base):
    """Declarative ORM mapping for the ``api_document`` table."""
    __tablename__ = 'api_document'

    id = Column(Integer, primary_key=True)

    # Text columns.
    title = Column(psql.TEXT)
    date = Column(DATE)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)
    # PostgreSQL JSONB columns.
    images = Column(psql.JSONB)
    raw_body = Column(psql.JSONB)

    def __repr__(self):
        """Return the document title, or a placeholder when no title is set."""
        # __repr__ must return a str; a NULL/unset title would otherwise make
        # repr() raise TypeError.
        if self.title is None:
            return '<Document id={!r} (untitled)>'.format(self.id)
        return self.title

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# The connection URL can be overridden with the THEME_DB_URL environment
# variable so credentials need not be edited into the notebook; without it the
# local dev database is used, as before.
engine = create_engine(os.environ.get('THEME_DB_URL', 'postgresql://dev:dev@localhost/dev'))
# Emit CREATE TABLE for the tables mapped on Base (api_document here).
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

In [7]:
# Starts empty; the per-theme query results (body + label) are collected into it.
theme_df = pd.DataFrame()

In [8]:
# Titles are bound as parameters instead of being pasted into the SQL text via
# str(tuple): that broke on single-title themes ("('x',)" is not valid SQL) and
# on titles containing quotes, and was open to SQL injection.
title_query = sqlalchemy.text(
    'SELECT body FROM api_document WHERE title IN :titles'
).bindparams(sqlalchemy.bindparam('titles', expanding=True))

theme_frames = []
for theme in themes:
    df = pd.read_sql_query(title_query, engine,
                           params={'titles': list(theme_dict[theme])})

    # Flatten newlines and tabs inside each body to single spaces.
    df['body'] = df['body'].str.replace('\n', ' ').str.replace('\t', ' ')

    # Tag every row with the theme it was fetched for.
    df['label'] = theme

    theme_frames.append(df)

# Build the combined frame once: DataFrame.append was removed in pandas 2.0,
# and rebuilding from scratch keeps the cell idempotent when it is re-run.
# ignore_index gives every row a unique label (the per-theme frames each start
# at 0), so later label-based access is unambiguous.
theme_df = pd.concat(theme_frames, ignore_index=True)

#### LDA Training

In [9]:
# Held as a set; process_texts tests each token for membership in it.
stops = set(stopwords.words('english'))  # nltk English stopword list

In [10]:
def process_texts(texts):
    """
    Pre-process tokenized texts.

    Steps applied to every document:

    1. Stopword removal (tokens found in the module-level ``stops`` set).
    2. Lemmatization with gensim's ``lemmatize``, keeping only tokens whose
       tag matches ``(NN)`` and whose length is at least 3. Lemmatization is
       used rather than stemming since stemming can reduce the
       interpretability.

    Collocation (bigram) detection is not applied.

    Parameters:
    ----------
    texts: Tokenized texts, i.e. an iterable of token lists (one per document).
           Passing a bare string iterates over its characters, which is almost
           certainly not what the caller wants.

    Returns:
    -------
    texts: Pre-processed tokenized texts, one list of lemmas per input document.
    """
    noun_tags = re.compile('(NN)')
    processed = []
    for line in texts:
        kept = [word for word in line if word not in stops]
        lemmas = []
        for tagged in lemmatize(' '.join(kept), allowed_tags=noun_tags, min_length=3):
            # gensim's lemmatize yields utf-8 encoded bytes such as b'word/NN';
            # decode first, otherwise splitting on the str '/' raises TypeError
            # on Python 3.
            if isinstance(tagged, bytes):
                tagged = tagged.decode('utf-8')
            # Keep the lemma, drop the '/TAG' suffix.
            lemmas.append(tagged.split('/')[0])
        processed.append(lemmas)
    return processed

In [11]:
# Create the 'final' column up front; the next cell fills it from each row's body.
theme_df['final'] = ''

In [15]:
# process_texts expects tokenized documents (lists of tokens). Handing it the
# raw body string made it iterate over single characters. Bodies are
# lower-cased so tokens can match the NLTK stop words, then split on
# whitespace; missing (non-string) bodies become empty documents.
body_tokens = [
    body.lower().split() if isinstance(body, str) else []
    for body in theme_df['body']
]

# Assign the whole column positionally in one step: DataFrame.set_value was
# removed in pandas 1.0, and writing by label is ambiguous when the index
# contains repeated labels.
theme_df['final'] = pd.Series(process_texts(body_tokens),
                              index=theme_df.index, dtype=object)

BadZipFile: File is not a zip file

In [14]:
import itertools
# Every unordered pair of themes; one scattertext comparison is produced per pair.
combs = list(itertools.combinations(themes, 2))

In [16]:
combs

[('adolescent', 'geriatric'),
 ('adolescent', 'mnchn'),
 ('adolescent', 'specpop'),
 ('geriatric', 'mnchn'),
 ('geriatric', 'specpop'),
 ('mnchn', 'specpop')]

In [17]:
import scattertext as st
# NOTE(review): spacy.en.English() looks like the spaCy 1.x entry point; newer
# releases load models through spacy.load(...) - confirm against the installed
# version before changing it.
nlp = spacy.en.English()

# One interactive scattertext page per pair of themes, written to
# '<first>_<second>.html' in the working directory.
for c in combs:
    # Keep only the documents labelled with one of the two themes in this pair.
    df = theme_df[theme_df.label.isin(c)]
    corpus = st.CorpusFromPandas(df,
                                 category_col='label',
                                 text_col='body',
                                 nlp=nlp).build()
    html = st.produce_scattertext_explorer(corpus,
                                           category=c[0],
                                           category_name=c[0],
                                           not_category_name=c[1],
                                           width_in_pixels=1000)
    # The context manager closes (and flushes) the file; the bare
    # open(...).write(...) left the handle open until garbage collection.
    with open(c[0]+'_'+c[1]+".html", 'wb') as out_file:
        out_file.write(html.encode('utf-8'))