# Keyword screening

In [2]:
import nltk
from nltk.stem.porter import PorterStemmer
import csv
import string
from gensim.models import Phrases
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

from collections import Counter

In [4]:
# Read the cleaned corpus, then keep only the rows flagged is_in == 0.
df = pd.read_csv('datasets/cleaned-datasets/socialisolation_cleaned.csv')

df_out = df.loc[df['is_in'].eq(0)]
df_out.head()

Unnamed: 0,text,is_intervention,is_in
0,therapeut effect indoor garden programm older ...,1,0
1,depress modifi factor fear older faller transi...,1,0
2,factori structur measur invari pana spanish ol...,1,0
3,experienti variabl influenc play foot-clasp mo...,1,0
4,longev increas posit self-percept age research...,1,0


# Collocations

In [6]:
# Tokenise every abstract in df_out on single spaces; each document becomes
# one list of tokens.
documents_list = [text.split(' ') for text in df_out['text'].values]

# Phrase (collocation) model fitted on the tokenised abstracts with default settings.
bi_gram = Phrases(documents_list)

# Copy the model's vocabulary counts into a Counter so that entries can be
# ranked with .most_common().
bi_gram_counter = Counter(bi_gram.vocab)

# To inspect the most frequent entries:
# for key, counts in bi_gram_counter.most_common(1000):
#     print('{0: <30} {1}'.format(str(key), counts))

Find the most common keyword collocations

In [7]:
from nltk.text import TextCollection

# Tokenise each abstract and let the phrase model merge detected collocations
# into single tokens, so the IDF statistics below are computed on exactly the
# same units that make up the vocabulary.
tokenized_documents = [
    list(bi_gram[document.split(' ')]) for document in df_out['text'].values
]

# TextCollection.idf() tests `term in text` for every text. With raw strings
# that is a substring search ("age" would match "manage") and merged
# collocation tokens that do not occur verbatim in the raw text get an IDF
# of 0.0. Passing token lists makes the test an exact token match.
documents_collection = TextCollection(tokenized_documents)

# Sorted so the vocabulary order (and therefore the order of IDF ties in `s`)
# is the same on every run, unlike the iteration order of a set of strings.
vocabulary = sorted({token for document in tokenized_documents for token in document})

idf_vocab = {word: documents_collection.idf(word) for word in vocabulary}

# Ascending IDF: tokens that occur in the most documents come first.
s = sorted(idf_vocab.items(), key=lambda x: x[1])



# Bi-grams

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

def get_freqs(df):
    """Rank the unigrams of ``df.text`` by document frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding one document per row.

    Returns
    -------
    list
        ``[k_list, f_list]`` where ``k_list`` holds the terms and ``f_list``
        the number of documents containing each term, both ordered by
        decreasing document frequency (ties keep the vectorizer's term order).
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1))
    X = vectorizer.fit_transform(df.text.values)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Count the documents with a non-zero entry per term directly on the
    # sparse matrix instead of materialising a dense documents x terms array.
    doc_freqs = np.asarray((X != 0).sum(axis=0)).ravel()

    # sorted() is stable, so equally frequent terms stay in vectorizer order.
    ranked = sorted(zip(terms, doc_freqs), key=lambda pair: -pair[1])
    k_list = [term for term, _ in ranked]
    f_list = [freq for _, freq in ranked]

    return [k_list, f_list]


def _exclusive_top(candidates, candidate_freqs, excluded, top_n):
    """Return ``[keys, freqs]`` for the first ``top_n`` candidates not in ``excluded``."""
    # Hash the exclusion list once so each membership test is O(1) instead of
    # a linear scan of a list.
    excluded = set(excluded)
    keys, freqs = [], []
    for k, f in zip(candidates, candidate_freqs):
        if k in excluded:
            continue
        keys.append(k)
        freqs.append(f)
        # Equality (not >=) is deliberate: a top_n that is never reached,
        # including 0 or a negative value, leaves the result untruncated.
        if len(keys) == top_n:
            break
    return [keys, freqs]


def get_kwards(kwords_freqs, top_n):
    """Select the keywords that occur in only one of the two groups.

    Parameters
    ----------
    kwords_freqs : dict
        ``{'in': [keywords, freqs], 'out': [keywords, freqs]}``; each pair of
        lists is parallel and is scanned in the order given.
    top_n : int
        Maximum number of keywords to keep per group.

    Returns
    -------
    dict
        ``{'in_only': [keywords, freqs], 'out_only': [keywords, freqs]}``:
        the first ``top_n`` in-scope keywords absent from the out-of-scope
        list, and vice versa.
    """
    kwords_in, kwords_in_freqs = kwords_freqs['in']
    kwords_out, kwords_out_freqs = kwords_freqs['out']

    kwords = {}
    # only in-scope keywords
    kwords['in_only'] = _exclusive_top(kwords_in, kwords_in_freqs, kwords_out, top_n)
    # only out-of-scope keywords
    kwords['out_only'] = _exclusive_top(kwords_out, kwords_out_freqs, kwords_in, top_n)

    return kwords



from itertools import zip_longest

# Re-read the full cleaned corpus for the keyword comparison.
df = pd.read_csv('datasets/cleaned-datasets/socialisolation_cleaned.csv')

# Document-frequency rankings for the is_in == 1 and is_in == 0 records.
kwords_freqs = {}
kwords_freqs['in'] = get_freqs(df[df['is_in'] == 1])
kwords_freqs['out'] = get_freqs(df[df['is_in'] == 0])

top_n = 200
kwords = get_kwards(kwords_freqs, top_n)

# zip_longest pads the shorter side instead of silently truncating the table
# to the shorter of the in-only / out-only lists; padded cells are missing
# values in the resulting frame.
data_output = list(zip_longest(kwords['in_only'][0], kwords['in_only'][1], kwords['out_only'][0], kwords['out_only'][1]))
df_kwards = pd.DataFrame(data_output, columns=['kward_in', 'freqs_in', 'kward_out', 'freqs_out'])

## LDA

In [103]:
from gensim.models import LdaModel
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer

# Re-read the cleaned corpus and build the gensim inputs from the rows
# flagged is_in == 0.
df = pd.read_csv('datasets/cleaned-datasets/socialisolation_cleaned.csv')

# One whitespace-split token list per selected abstract.
texts = [abstract.split() for abstract in df.loc[df['is_in'] == 0, 'text'].values]
# Token <-> id mapping, then one bag-of-words vector per abstract.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [104]:
# LDA training is randomly initialised; fix the seed so the fitted topics are
# reproducible when the notebook is re-run from a fresh kernel.
ldamodel = LdaModel(corpus, num_topics=100, id2word=dictionary, passes=20, random_state=42)

  diff = np.log(self.expElogbeta)


In [109]:
# Show five topics, five terms each; every entry is a (topic id, description)
# pair and only the description string is printed.
for _topic_id, topic_terms in ldamodel.print_topics(num_topics=5, num_words=5):
    print(topic_terms)

0.044*"social" + 0.029*"isolation" + 0.021*"patient" + 0.021*"admission" + 0.018*"opioid"
0.023*"loneliness" + 0.015*"older" + 0.014*"group" + 0.013*"internet" + 0.010*"interest"
0.036*"model" + 0.036*"schizophrenia" + 0.032*"symptom" + 0.027*"negative" + 0.018*"behavioural"
0.031*"health" + 0.026*"people" + 0.016*"depression" + 0.015*"care" + 0.015*"older"
0.026*"gene" + 0.017*"environment" + 0.013*"regulate" + 0.013*"cancer" + 0.013*"life"
