In [1]:
#ALWAYS RUN FIRST!

#Import libraries and packages. Don't worry about the warning if running it on windows, so far not hit an issue. (yn)

import re
from pylab import *
import csv
import psycopg2
import spacy
spacy.load('en')
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
import pickle
import gensim
import os

#Set the names of the files in which you want to save data.

profile_data_name = 'profile_data'



In [2]:
#Extract the data from the CRM. Don't run if you have an up to date copy of the CRM.

try:
    
    #Opens connection to the CRM asks for peoples name and description.
    #Output rows for row in rows row[0] - first name, row[1] - second name, row[2] - description.
    
    print("Trying to access CRM CSaP database ...")
    
    conn = psycopg2.connect(SERVER_INFO)
    
    cur = conn.cursor()
    
    cur.execute("""SELECT
    person.first_name,
    person.last_name,
    person.description
    FROM people_person as person
    ;
    """)
    rows = cur.fetchall()
    
    #Saves data to the file called above.
    
    with open(os.getcwd() + '\data\\' + profile_data_name + '.csv','w+',newline ='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for row in rows:
            wr.writerow([r.encode('utf-8') for r in row])
    
    print("... data downloaded and saved to disk.")
    
except:
    
    #If server isn't online this collects data from the save file.
    
    print("... can't access server, is the tunnel set up? Can continue on previously saved data.")

Trying to access CRM CSaP database ...
... data downloaded and saved to disk.


In [3]:
#Functions for cleaning the text and getting it ready to be procced.

#Gets rid of HTML tags and end of line markers.
#Input: String of text from CRM or internet.
#Output: Cleaned up string of text without HTML tags or end of line markers.

def clean_text(text):
    
    #Removes HTML tags.
    
    clean = re.compile('<.*?>')
    temp_text = re.sub(clean, '', text)
    
    #Removes rouge utf-8 code.
    
    clean = re.compile('\\\\x\w\w')
    temp_text = re.sub(clean, '', temp_text)
    
    clean = re.compile('\\\\x\w')
    temp_text = re.sub(clean, '', temp_text)
    
    #Removes end of line indicators and other junk.
    
    tags = ['\\r','\\n','/','\\t','\\']
    
    for tag in tags:
        temp_text = temp_text.replace(tag,'')
    
    return temp_text

#Tokenizes text, seperates it into a string of words and grammar.
#Input: A string of text.
#Output: A list of words and grammar in order all in lower case.

parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

#Lemmatiser, this finds the root word (i.e. depluralises).
#Input: a token, i.e. a single word or grammar.
#Output: a lemma which is the base of the word and association 

def get_lemma(word):
    return WordNetLemmatizer().lemmatize(word)

#Checks if two words are synonyms or antonyms.
#Input: Takes two strings word_1 and word_2, they should be prepared words.
#Output: Returns 1 if they are synonyms, -1 if they are antonyms or 0 otherwise.

def is_synonym_antonym(word_1,word_2):
    word_1_synonyms = set([word for synset in wn.synsets(word_1) for word in synset.lemma_names()])
    word_2_synonyms = set([word for synset in wn.synsets(word_2) for word in synset.lemma_names()])
    word_1_antonyms = set([ant.name() for synset in wn.synsets(word_1) for syn in synset.lemmas() for ant in syn.antonyms()])
    word_2_antonyms = set([ant.name() for synset in wn.synsets(word_2) for syn in synset.lemmas() for ant in syn.antonyms()])

    if word_1_synonyms & word_2_synonyms or word_1_antonyms & word_2_antonyms:
        return 1

    if word_1_synonyms & word_2_antonyms or word_2_synonyms & word_1_antonyms:
        return -1
    
    return 0
    
#Prepares text for the analysis, tokenizes texts, gets rid of words length less than 4 and filters out non-useful words then
#Lemmatisers the text.
#Input: A string of text you want to analysis.
#Output: A list of Lemmas of the meaningful words.

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [tok[0] for tok in nltk.pos_tag(tokens) if tok[1][0] == 'N']
    tokens = [token for token in tokens if len(token) > 4]
    en_stop = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [4]:
#This uploads the data into the program from the file, cleaning the data whilst it does it.

print("Trying to load up data ...")

people = []

try:
    with open(os.getcwd() + '\data\\' + profile_data_name+'.csv', 'r') as csvfile:
        dump = list(csv.reader(csvfile))
        for row in dump:
            people.append([clean_text(r[2:-1]) for r in row])
    
    print("... data successfully uploaded.")
    
except:
        
    print(".. no back up data, please connect to server.")

Trying to load up data ...
... data successfully uploaded.


In [5]:
#Currently takes the profile data, gets rid of people with insufficent data (=< 100 charracters) then prepares their profile
#data for analysis. Sets text_data to be a list with first name, second name, prepared profile.

people_prepared = []

for person in people:
    if len(person[2]) > 200: 
        people_prepared.append([person[0], person[1], prepare_text_for_lda(person[2])])

In [6]:
#Theme extractor, submit a list of texts and it will extract topic_num themes each summerised by word_num worth of words each
#with individual ratings on how important the word is.
#Input: List of prepared text.
#Output: a list 

def get_themes(text_data, topic_num, word_num):
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = topic_num, id2word=dictionary, passes=15)
    ldamodel.save('model5.gensim')
    return ldamodel.print_topics(num_words=word_num)

for theme in get_themes([person[2] for person in people_prepared], 5,4):
    print(theme)

(0, '0.062*"university" + 0.050*"research" + 0.024*"cambridge" + 0.021*"professor"')
(1, '0.067*"health" + 0.019*"policy" + 0.014*"research" + 0.009*"development"')
(2, '0.027*"director" + 0.026*"business" + 0.017*"management" + 0.017*"cambridge"')
(3, '0.024*"research" + 0.018*"university" + 0.013*"cambridge" + 0.012*"material"')
(4, '0.040*"policy" + 0.030*"research" + 0.024*"science" + 0.019*"university"')


In [7]:
#Set words to be the words you want to compare against, this prepares this text into a nice list. (TO ADD THEME EXTRACTION)

words = 'traditional industries reorient changing'

prepared_words = [get_lemma(word) for word in tokenize(words)]
    
#This searches the list of people for who is relevant to the words you have asked about and orderes them by what proportion of
#the words in there profile fit these words.

relevant_people = []

for person in people_prepared:
    count = 0
    for word in prepared_words:
        for profile_word in person[2]:
            count += abs(is_synonym_antonym(word,profile_word))
    if count != 0:
        relevant_people.append([person[0],person[1],count/len(person[2])])

relevant_people = sorted(relevant_people,key=lambda person: person[2], reverse = True)

#This prints the top 10 fitting people.

for i in range(10):
    print(relevant_people[i][0] + ' ' + relevant_people[i][1] + ' with probability ' + str(relevant_people[i][2]))

Ian Palmer with probability 0.18518518518518517
Robin North with probability 0.17647058823529413
Simon Warburton with probability 0.1724137931034483
Ian Bamford with probability 0.16279069767441862
Kaveh Jahanshahi with probability 0.16071428571428573
Dan Clarke with probability 0.16
Philippa Benfield with probability 0.14814814814814814
Fod Barnes with probability 0.14285714285714285
Jim Platts with probability 0.14035087719298245
Ettore Settanni with probability 0.14
