In [13]:
%load_ext autoreload
%autoreload 2


import os
import pymysql
import pandas as pd
from dotenv import load_dotenv, find_dotenv



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
df=pd.read_csv('../raw_data/model_v1_labeled_data.csv',index_col=[0])
df_copy=df.copy()
#df.columns

# Cleaning functions

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import make_pipeline

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)




def cleaning(sentence, move_punc=True):
    
    # Basic cleaning
    sentence = sentence.replace('\n',' ')
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    sentence = unidecode.unidecode(sentence) # remove accents
    if move_punc==True:
        for punctuation in string.punctuation:
            sentence = sentence.replace(punctuation, '') ## removing punctuation
    # Advanced cleaning        
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    lemmatized_sentence = [WordNetLemmatizer().lemmatize(word, get_wordnet_pos(word))  # v --> verbs
              for word in tokenized_sentence]
    cleaned_sentence = ' '.join(word for word in lemmatized_sentence)
    return cleaned_sentence


#functions to print top words of each topic
def topic_word(vectorizer, model, topic, topwords, with_weights = True):
    """returns the top words with their weights for one topic"""
    topwords_indexes = topic.argsort()[:-topwords - 1:-1]
    if with_weights == True:
        topwords = [(vectorizer.get_feature_names_out()[i], round(topic[i],2)) for i in topwords_indexes]
    if with_weights == False:
        topwords = [vectorizer.get_feature_names_out()[i] for i in topwords_indexes]
    return topwords


def print_topics(vectorizer, model, topwords):
    """prints the different topics found by the LDA with their topwords"""
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print(topic_word(vectorizer, model, topic, topwords))
        

# Investigate most frequent words in their occupation(specialities/focus/populations) and goals


In [16]:
# create a column for their professional profile
df=df_copy
df['profile']=df.specialities+' '+df.focus+ ' '+df.population+ ' '+df.typeOfPractice
df=df[['userID','cluster_id','profile','metaGoalTitle']]


# clean the txt columns
df=df.dropna()


# create a column for cleaned data
df['goalCleaned']=df['metaGoalTitle'].apply(cleaning)
df['profileCleaned']=df['profile'].apply(cleaning)
# distribution of users have goals
round(df["cluster_id"].value_counts(normalize = True),2)

4    0.26
0    0.25
1    0.22
2    0.14
5    0.09
3    0.03
Name: cluster_id, dtype: float64

In [17]:
## Goals

In [18]:
# Vectorizers and NLP Models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


vectorizer = TfidfVectorizer(max_df=0.75,max_features=3000,ngram_range=(2,3))
vectorized_goals = pd.DataFrame(vectorizer.fit_transform(df["goalCleaned"]).toarray(),
                                 columns = vectorizer.get_feature_names_out())


print(f" vectorized_goals.shape = {vectorized_goals.shape}")
vectorized_goals['cluster_id']=df['cluster_id']
grp_vec_goals=vectorized_goals.groupby('cluster_id').sum().T
grp_vec_goals

for i in range(6):
    print(f'\n---------------Key words in the goals of Cluster{i} are ')
    
    df0=grp_vec_goals.iloc[:,i].sort_values(ascending=False)
    print(df0.head(10))

 vectorized_goals.shape = (6089, 3000)

---------------Key words in the goals of Cluster0 are 
cpd year              38.166161
end cpd               37.832512
end cpd year          36.395117
would like            33.532472
update knowledge      32.941311
year want             27.826467
wound care            24.235236
mental health         23.475285
increase knowledge    22.339344
cpd year want         21.703222
Name: 0.0, dtype: float64

---------------Key words in the goals of Cluster1 are 
end cpd             33.922300
cpd year            33.622147
update knowledge    32.030518
end cpd year        31.935666
would like          28.372427
wound care          25.517512
year want           23.382694
age care            18.484195
well understand     17.927555
cpd year want       17.884961
Name: 1.0, dtype: float64

---------------Key words in the goals of Cluster2 are 
would like           22.020536
cpd year             19.625711
end cpd              18.690704
end cpd year         18.3177

## Investigate profile


In [19]:
vectorizer = TfidfVectorizer(max_df=0.75,ngram_range=(1,5))
vectorized_profs = pd.DataFrame(vectorizer.fit_transform(df["profile"]).toarray(),
                                 columns = vectorizer.get_feature_names_out())


print(f" vectorized_goals.shape = {vectorized_goals.shape}")
vectorized_profs['cluster_id']=df['cluster_id']
grp_vec_profs=vectorized_profs.groupby('cluster_id').sum().T
grp_vec_profs

for i in range(6):
    print(f'\n---------------Key words in the specialities/focus/population of Cluster{i} are ')
    
    df0=grp_vec_profs.iloc[:,i].sort_values(ascending=False)
    print(df0.head(10))

 vectorized_goals.shape = (6089, 3001)

---------------Key words in the specialities/focus/population of Cluster0 are 
practice                    51.121715
clinical                    47.372365
clinical practice           46.962448
care                        46.008897
adults                      44.689800
hospital                    44.306996
clinical practice adults    42.414390
practice adults             42.405105
adults hospital             41.164339
practice adults hospital    38.941693
Name: 0.0, dtype: float64

---------------Key words in the specialities/focus/population of Cluster1 are 
practice                             46.779280
care                                 45.599062
clinical                             43.162727
clinical practice                    42.807662
hospital                             40.201527
adults                               39.686212
clinical practice adults             38.694098
practice adults                      38.685627
adults hospital    

# LDA on all clusters

In [20]:
df=df.dropna()


# create a column for cleaned data
df['goalCleaned']=df['metaGoalTitle'].apply(cleaning)

# distribution of users have goals
round(df["cluster_id"].value_counts(normalize = True),2)

4    0.26
0    0.25
1    0.22
2    0.14
5    0.09
3    0.03
Name: cluster_id, dtype: float64

In [21]:
# Vectorizers and NLP Models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# lets first look at cluster 0
#df0=df[df.cluster_id==0]
df0=df.copy()
vectorizer = TfidfVectorizer(max_df=0.75,ngram_range=(2,3))
vectorized_goals = pd.DataFrame(vectorizer.fit_transform(df0["goalCleaned"]).toarray(),
                                 columns = vectorizer.get_feature_names_out())


print(f" vectorized_goals.shape = {vectorized_goals.shape}")
vectorized_goals.head()

 vectorized_goals.shape = (6089, 251349)


Unnamed: 0,aaa perform,aaa perform suitable,aaa repair,aaa repair surgery,aag pca,aag pca nswnma,aastn annual,aastn annual conference,aastn meet,aastn meet company,...,zone prevention,zone prevention improve,zoom session,zoom session effectively,zostavax need,zostavax need update,zoster elderly,zoster elderly able,zwarteveen kim,zwarteveen kim nguyen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# fit an LDA
n_components = 3

lda = LatentDirichletAllocation(n_components = n_components)
lda.fit(vectorized_goals)

In [23]:
# Document Mixture (of Topic)
document_mixture = lda.transform(vectorized_goals)
print(f'the shape of the  Document Mixture (of Topic) {document_mixture.shape}')
round(pd.DataFrame(document_mixture, 
                   columns = [f"Topic {i+1}" for i in range(n_components)])
      ,2)

the shape of the  Document Mixture (of Topic) (6089, 3)


Unnamed: 0,Topic 1,Topic 2,Topic 3
0,0.03,0.03,0.94
1,0.06,0.06,0.88
2,0.90,0.05,0.05
3,0.90,0.05,0.05
4,0.08,0.85,0.07
...,...,...,...
6084,0.07,0.87,0.07
6085,0.84,0.08,0.08
6086,0.92,0.04,0.04
6087,0.94,0.03,0.03


In [24]:
#report the most important topic for each review
import numpy as np
df0["most_important_topic"] = np.argmax(document_mixture, axis = 1)
df0.head()

Unnamed: 0,userID,cluster_id,profile,metaGoalTitle,goalCleaned,profileCleaned,most_important_topic
115,001939e3-1906-4c78-a2e3-b1a3b8d99a0d,0,Aged Care clinical practice older people resid...,I need to know more about my field of work as ...,need know field work rn age care always evolve...,age care clinical practice old people resident...,2
116,0029e22b-80c4-4568-87d7-885b065221e0,0,"Orthopaedics,Acute pain,Chronic pain,Preoperat...",Have a better understanding of drug addiction ...,well understand drug addiction understand best...,orthopaedicsacute painchronic painpreoperative...,2
117,0096dc53-ce9d-42ab-838e-a9983fd0d172,0,Residential Aged Care clinical practice older ...,Learning about wounds and how to care for them...,learn wound care community age care get well t...,residential age care clinical practice old peo...,0
118,0196e935-821b-49cc-a899-34c409047f21,0,"Orthopaedics,Anaesthetics,Recovery clinical pr...",To become competent in Anaesthetic nursing I w...,become competent anaesthetic nursing would ide...,orthopaedicsanaestheticsrecovery clinical prac...,0
120,01d17791-4a67-43c9-b35f-06404ba4108e,1,Acute care clinical practice adults hospital,By the end of CPD year i want to improve gener...,end cpd year want improve general knowledge cl...,acute care clinical practice adult hospital,1


In [25]:
#Topic Mixture (of Words)
topic_mixture = pd.DataFrame(lda.components_, 
                             columns = vectorizer.get_feature_names_out())
topic_mixture.shape

(3, 251349)

In [26]:
#functions to print top words of each topic
def topic_word(vectorizer, model, topic, topwords, with_weights = True):
    """returns the top words with their weights for one topic"""
    topwords_indexes = topic.argsort()[:-topwords - 1:-1]
    if with_weights == True:
        topwords = [(vectorizer.get_feature_names_out()[i], round(topic[i],2)) for i in topwords_indexes]
    if with_weights == False:
        topwords = [vectorizer.get_feature_names_out()[i] for i in topwords_indexes]
    return topwords


def print_topics(vectorizer, model, topwords):
    """prints the different topics found by the LDA with their topwords"""
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print(topic_word(vectorizer, model, topic, topwords))
        
print_topics(vectorizer, lda, topwords = 10)

--------------------
Topic 0:
[('update knowledge', 42.05), ('would like', 29.96), ('cpd year', 27.84), ('end cpd', 27.15), ('end cpd year', 26.41), ('knowledge wound', 26.31), ('update knowledge wound', 23.88), ('wound dressing', 22.63), ('knowledge wound dressing', 19.91), ('increase knowledge', 19.71)]
--------------------
Topic 1:
[('end cpd', 28.0), ('cpd year', 27.87), ('end cpd year', 26.96), ('wound care', 24.82), ('year want', 22.51), ('would like', 19.92), ('update knowledge', 18.89), ('cpd year want', 18.27), ('mental health', 15.22), ('end year', 13.48)]
--------------------
Topic 2:
[('would like', 20.2), ('end cpd', 19.09), ('cpd year', 17.68), ('end cpd year', 16.94), ('year want', 16.45), ('update knowledge', 16.01), ('age care', 12.75), ('end year', 12.12), ('cpd year want', 11.51), ('improve knowledge', 9.92)]


In [27]:
# extract important words for each user
topic_word_mixture = [topic_word(vectorizer, lda, topic, topwords = 5, with_weights = False)
                      for topic in lda.components_]
topic_word_mixture
df0["most_important_words"] = df0["most_important_topic"].apply(lambda i: topic_word_mixture[i])

## Make it a function

In [28]:
## Investigting each clusters
import pandas as pd
df=pd.read_csv('../raw_data/model_v3_labeled_data.csv',index_col=[0])

# clean the txt columns
df=df.dropna()
df['profile']=df.specialities+' '+df.focus+ ' '+df.population+ ' '+df.typeOfPractice
df=df[['userID','cluster_id','profile','metaGoalTitle']]

# create a column for cleaned data
df['goalCleaned']=df['metaGoalTitle'].apply(cleaning)
df['profileCleaned']=df['profile'].apply(cleaning)

def lda_key_words(df,col_name="goalCleaned",n_topic=3, n_words=10):
    
    df0=df
    
    
    vectorizer = TfidfVectorizer(min_df=2/60249,max_df=0.75,ngram_range=(2,3))
    vectorized_txt = pd.DataFrame(vectorizer.fit_transform(df0[col_name]).toarray(),
                                     columns = vectorizer.get_feature_names_out())


    print(f" vectorized_goals.shape = {vectorized_goals.shape}")
    vectorized_txt.head()
    
    n_components = n_topic

    lda = LatentDirichletAllocation(n_components = n_components)
    lda.fit(vectorized_txt)
    
    
    #report the most important topic for each review
    document_mixture = lda.transform(vectorized_txt)
    df0["most_important_topic"] = np.argmax(document_mixture, axis = 1)
    #Topic Mixture (of Words)
    topic_mixture = pd.DataFrame(lda.components_, columns = vectorizer.get_feature_names_out())
    
    print(f'\n Users in Cluster{cluster_id} cares about')
    print_topics(vectorizer, lda, topwords = 10)
    
    
    # extract important words for each user
    topic_word_mixture = [topic_word(vectorizer, lda, topic, topwords = 5, with_weights = False) 
                          for topic in lda.components_]
    
    df0["most_important_words"] = df0["most_important_topic"].apply(lambda i: topic_word_mixture[i])
    
    return df0
    
    
    

df1=lda_key_words(df,col_name="goalCleaned",n_topic=8, n_words=10)
#df.head()   
    
    
    
    
# goals_dic={}
# for i in range(n_clusters):
#     print(f"\n------------Cluster{i}----------------")
#     goals=df0[df0.cluster_id==i][["most_important_topic","most_important_words"]]
    
#     print('the distribution of topics is')
#     print(round(goals["most_important_topic"].value_counts(normalize=True),2))
    
#     frequency = list(goals["most_important_topic"].value_counts().index)
    
#     print('\n The most frequent words are ')
#     print([topic_word_mixture[i] for i in frequency])
    
#     # # creat a DataFrame to save this 
#     # dfi=pd.DataFrame({
#     # 'Topics': 
    
#     #})
#     #goals_dic[f'Cluster{i}']=dfi
    

 vectorized_goals.shape = (6089, 251349)


NameError: name 'cluster_id' is not defined

In [None]:
df1.head()

# full pipeline


In [None]:
from sklearn.pipeline import make_pipeline
max_df = 0.75
max_features = 5000
ngram_range = (1,2)
n_components
# Pipeline Vectorizer + LDA
pipeline = make_pipeline(
    TfidfVectorizer(max_df = max_df,
                    ngram_range = ngram_range),
    LatentDirichletAllocation(n_components = 7)
)

# Fit the pipeline on the cleaned texts
pipeline.fit(df["goalCleaned"])

In [None]:
# Transform the original cleaned texts with the pipeline
# Indeed, there is no need to get the vectorized texts first since it's done through the Pipeline

# Document Mixture with the Pipeline:
document_mixture = pipeline.transform(df["goalCleaned"]) 
document_mixture.shape 

# Topic Mixture with the Pipeline:
topic_mixture = pd.DataFrame(pipeline._final_estimator.components_,columns = pipeline[0].get_feature_names_out())
topic_mixture

# Specialities



In [None]:
txt_cols=['specialities', 'population', 'focus','metaGoalTitle']
df_txt=df.copy().loc[:,txt_cols]
df_txt=df_txt.select_dtypes(include = ['object'])
df_txt.fillna('Unknown',inplace=True)

df_txt.specialities=df_txt.specialities.apply(lambda x: x.replace(" ","").replace(";",",").replace("/",",").split(","))


# get a list of all possible specialization
spec_list=[]
for row in df_txt.specialities:
    spec_list+=row

words=list(set(spec_list))
counts=[spec_list.count(word) for word in words]

spec_list_df=pd.DataFrame({
    'specialities': words,
    'count':counts

})

words

In [None]:
df.specialities.str.split(',')
df.focus.unique()

In [None]:
import nltk
print(nltk.pos_tag(['feet']))