In [168]:
%load_ext autoreload
%autoreload 2


import os
import pymysql
import pandas as pd
from dotenv import load_dotenv, find_dotenv



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [180]:
df=pd.read_csv('../raw_data/model_v1_labeled_data.csv',index_col=[0])
df_copy=df.copy()
#df.columns

# Cleaning functions

In [179]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import make_pipeline

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)




def cleaning(sentence, move_punc=True):
    
    # Basic cleaning
    sentence = sentence.replace('\n',' ')
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    sentence = unidecode.unidecode(sentence) # remove accents
    if move_punc==True:
        for punctuation in string.punctuation:
            sentence = sentence.replace(punctuation, '') ## removing punctuation
    # Advanced cleaning        
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    lemmatized_sentence = [WordNetLemmatizer().lemmatize(word, get_wordnet_pos(word))  # v --> verbs
              for word in tokenized_sentence]
    cleaned_sentence = ' '.join(word for word in lemmatized_sentence)
    return cleaned_sentence


#functions to print top words of each topic
def topic_word(vectorizer, model, topic, topwords, with_weights = True):
    """returns the top words with their weights for one topic"""
    topwords_indexes = topic.argsort()[:-topwords - 1:-1]
    if with_weights == True:
        topwords = [(vectorizer.get_feature_names_out()[i], round(topic[i],2)) for i in topwords_indexes]
    if with_weights == False:
        topwords = [vectorizer.get_feature_names_out()[i] for i in topwords_indexes]
    return topwords


def print_topics(vectorizer, model, topwords):
    """prints the different topics found by the LDA with their topwords"""
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print(topic_word(vectorizer, model, topic, topwords))
        

# Investigate most frequent words in their occupation(specialities/focus/populations) and goals


In [185]:
# create a column for their professional profile
df=df_copy
df['profile']=df.specialities+' '+df.focus+ ' '+df.population+ ' '+df.typeOfPractice
df=df[['userID','cluster_id','profile','metaGoalTitle']]


# clean the txt columns
df=df.dropna()


# create a column for cleaned data
df['goalCleaned']=df['metaGoalTitle'].apply(cleaning)
df['profileCleaned']=df['profile'].apply(cleaning)
# distribution of users have goals
round(df["cluster_id"].value_counts(normalize = True),2)

4    0.26
0    0.25
1    0.22
2    0.14
5    0.09
3    0.03
Name: cluster_id, dtype: float64

In [193]:
## Goals

In [220]:
# Vectorizers and NLP Models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


vectorizer = TfidfVectorizer(max_df=0.75,max_features=3000,ngram_range=(2,3))
vectorized_goals = pd.DataFrame(vectorizer.fit_transform(df["goalCleaned"]).toarray(),
                                 columns = vectorizer.get_feature_names_out())


print(f" vectorized_goals.shape = {vectorized_goals.shape}")
vectorized_goals['cluster_id']=df['cluster_id']
grp_vec_goals=vectorized_goals.groupby('cluster_id').sum().T
grp_vec_goals

for i in range(6):
    print(f'\n---------------Key words in the goals of Cluster{i} are ')
    
    df0=grp_vec_goals.iloc[:,i].sort_values(ascending=False)
    print(df0.head(10))

 vectorized_goals.shape = (6089, 3000)

---------------Key words in the goals of Cluster0 are 
cpd year              38.166161
end cpd               37.832512
end cpd year          36.395117
would like            33.532472
update knowledge      32.941311
year want             27.826467
wound care            24.235236
mental health         23.475285
increase knowledge    22.339344
cpd year want         21.703222
Name: 0.0, dtype: float64

---------------Key words in the goals of Cluster1 are 
end cpd             33.922300
cpd year            33.622147
update knowledge    32.030518
end cpd year        31.935666
would like          28.372427
wound care          25.517512
year want           23.382694
age care            18.484195
well understand     17.927555
cpd year want       17.884961
Name: 1.0, dtype: float64

---------------Key words in the goals of Cluster2 are 
would like           22.020536
cpd year             19.625711
end cpd              18.690704
end cpd year         18.3177

## Investigate profile


In [219]:
vectorizer = TfidfVectorizer(max_df=0.75,ngram_range=(1,5))
vectorized_profs = pd.DataFrame(vectorizer.fit_transform(df["profile"]).toarray(),
                                 columns = vectorizer.get_feature_names_out())


print(f" vectorized_goals.shape = {vectorized_goals.shape}")
vectorized_profs['cluster_id']=df['cluster_id']
grp_vec_profs=vectorized_profs.groupby('cluster_id').sum().T
grp_vec_profs

for i in range(6):
    print(f'\n---------------Key words in the specialities/focus/population of Cluster{i} are ')
    
    df0=grp_vec_profs.iloc[:,i].sort_values(ascending=False)
    print(df0.head(10))

 vectorized_goals.shape = (6089, 27078)

---------------Key words in the specialities/focus/population of Cluster0 are 
practice                    51.121715
clinical                    47.372365
clinical practice           46.962448
care                        46.008897
adults                      44.689800
hospital                    44.306996
clinical practice adults    42.414390
practice adults             42.405105
adults hospital             41.164339
practice adults hospital    38.941693
Name: 0.0, dtype: float64

---------------Key words in the specialities/focus/population of Cluster1 are 
practice                             46.779280
care                                 45.599062
clinical                             43.162727
clinical practice                    42.807662
hospital                             40.201527
adults                               39.686212
clinical practice adults             38.694098
practice adults                      38.685627
adults hospital   

# Investigate Goals of cluster 0

In [159]:
df=df.dropna()


# create a column for cleaned data
df['goalCleaned']=df['metaGoalTitle'].apply(cleaning)

# distribution of users have goals
round(df["cluster_id"].value_counts(normalize = True),2)

0    0.26
4    0.26
1    0.23
2    0.14
5    0.08
3    0.03
Name: cluster_id, dtype: float64

In [152]:
# Vectorizers and NLP Models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# lets first look at cluster 0
df0=df[df.cluster_id==0]
#df0=df.copy()
vectorizer = TfidfVectorizer(max_df=0.75,ngram_range=(2,3))
vectorized_goals = pd.DataFrame(vectorizer.fit_transform(df0["goalCleaned"]).toarray(),
                                 columns = vectorizer.get_feature_names_out())


print(f" vectorized_goals.shape = {vectorized_goals.shape}")
vectorized_goals.head()

 vectorized_goals.shape = (1436, 60249)


Unnamed: 0,_considers,_considers potential,_considers potential improvement,_identifies,_identifies problems,_identifies problems issue,abcde,abcde assessment,abcde assessment patient,abcde assessment would,...,young people,young people disability,young veteran,young veteran come,youth,youth suicide,youth suicide grow,yr,yr education,yr education top
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.079152,0.079152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
# fit an LDA
n_components = 3

lda = LatentDirichletAllocation(n_components = n_components)
lda.fit(vectorized_goals)

In [154]:
# Document Mixture (of Topic)
document_mixture = lda.transform(vectorized_goals)
print(f'the shape of the  Document Mixture (of Topic) {document_mixture.shape}')
round(pd.DataFrame(document_mixture, 
                   columns = [f"Topic {i+1}" for i in range(n_components)])
      ,2)

the shape of the  Document Mixture (of Topic) (1436, 3)


Unnamed: 0,Topic 1,Topic 2,Topic 3
0,0.94,0.03,0.03
1,0.05,0.05,0.90
2,0.04,0.91,0.05
3,0.04,0.05,0.91
4,0.03,0.03,0.95
...,...,...,...
1431,0.04,0.05,0.91
1432,0.02,0.96,0.02
1433,0.07,0.07,0.87
1434,0.07,0.87,0.07


In [155]:
#report the most important topic for each review
import numpy as np
df0["most_important_topic"] = np.argmax(document_mixture, axis = 1)
df0.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0["most_important_topic"] = np.argmax(document_mixture, axis = 1)


Unnamed: 0,Product,Status,userID,stripeCustID,num_subs,account_age,pProfileID,typeOfPractice,located,specialities,...,doc_in_activation,activated,plan_type,subscribe_days,GoalsPerYear,ratioOfAchivedGoals,metaGoalTitle,cluster_id,goalCleaned,most_important_topic
115,Ausmed Subscription Monthly,canceled,001939e3-1906-4c78-a2e3-b1a3b8d99a0d,cus_AQHrseyO28TOhU,2,2015.0,c84c8933-53e8-465b-a487-e96ea20cd7e6,residential aged care facility,large rural centre,Aged Care,...,8,1,monthly,1143,1.0,0.0,I need to know more about my field of work as ...,0,need know field work rn age care always evolve...,0
116,Ausmed Subscription Monthly,canceled,0029e22b-80c4-4568-87d7-885b065221e0,cus_BKRMErwLedH1Ka,2,2018.0,648f59d1-54f1-4d08-8e8f-e23adc90fda7,hospital,metropolitan centre,"Orthopaedics,Acute pain,Chronic pain,Preoperat...",...,7,1,quarterly,48,1.0,0.0,Have a better understanding of drug addiction ...,0,well understand drug addiction understand best...,2
117,Ausmed Subscription Monthly,canceled,0096dc53-ce9d-42ab-838e-a9983fd0d172,cus_CaRpSGzjzvl5Ix,2,2021.0,eea0c119-ab67-456d-86c5-d7d57075e358,residential aged care facility,metropolitan centre,Residential Aged Care,...,3,1,quarterly,610,2.0,0.0,Learning about wounds and how to care for them...,0,learn wound care community age care . get well...,1
118,Ausmed Subscription Monthly,canceled,0196e935-821b-49cc-a899-34c409047f21,cus_CY2sRFvl9pELB1,2,1945.0,d3e85bfc-ee2e-438d-ab34-67b63477ffda,hospital,metropolitan centre,"Orthopaedics,Anaesthetics,Recovery",...,5,1,quarterly,62,2.0,0.0,To become competent in Anaesthetic nursing I w...,0,become competent anaesthetic nursing would ide...,2
122,Ausmed Subscription,canceled,028193de-681c-4c2f-9d5c-3430d8eab20e,cus_AWCNB48DEhCpRa,2,1954.0,a61cbf1e-9167-42a4-8eff-2bb9f1ae8030,private practice,capital city,"Infertility,Surrogacy",...,11,1,quarterly,754,1.333333,0.25,Broaden and update knowledge of all aspects of...,0,broaden update knowledge aspect fertility prac...,2


In [156]:
#Topic Mixture (of Words)
topic_mixture = pd.DataFrame(lda.components_, 
                             columns = vectorizer.get_feature_names_out())
topic_mixture.shape

(3, 60249)

In [157]:
#functions to print top words of each topic
def topic_word(vectorizer, model, topic, topwords, with_weights = True):
    """returns the top words with their weights for one topic"""
    topwords_indexes = topic.argsort()[:-topwords - 1:-1]
    if with_weights == True:
        topwords = [(vectorizer.get_feature_names_out()[i], round(topic[i],2)) for i in topwords_indexes]
    if with_weights == False:
        topwords = [vectorizer.get_feature_names_out()[i] for i in topwords_indexes]
    return topwords


def print_topics(vectorizer, model, topwords):
    """prints the different topics found by the LDA with their topwords"""
    for idx, topic in enumerate(model.components_):
        print("-"*20)
        print("Topic %d:" % (idx))
        print(topic_word(vectorizer, model, topic, topwords))
        
print_topics(vectorizer, lda, topwords = 10)

--------------------
Topic 0:
[('knowledge', 10.61), ('care', 8.12), ('learn', 7.62), ('practice', 7.16), ('skill', 6.36), ('cpd', 6.08), ('year', 5.85), ('end', 5.85), ('patient', 5.81), ('nursing', 5.43)]
--------------------
Topic 1:
[('knowledge', 15.65), ('care', 14.06), ('learn', 13.47), ('practice', 9.89), ('year', 9.46), ('wound', 9.0), ('management', 8.89), ('cpd', 8.78), ('end', 8.78), ('nursing', 8.71)]
--------------------
Topic 2:
[('knowledge', 17.84), ('care', 13.81), ('update', 12.18), ('learn', 10.57), ('update knowledge', 10.38), ('year', 10.23), ('end', 9.53), ('wound', 9.29), ('practice', 9.22), ('would', 8.99)]


In [143]:
# extract important words for each user
topic_word_mixture = [topic_word(vectorizer, lda, topic, topwords = 5, with_weights = False)
                      for topic in lda.components_]
topic_word_mixture
df0["most_important_words"] = df0["most_important_topic"].apply(lambda i: topic_word_mixture[i])

In [167]:
## Investigting each clusters

def lda_key_words(df,cluster_id=0,n_topic=3, n_words=10):
    # lets first look at cluster 0
    df0=df

    vectorizer = TfidfVectorizer(min_df=2/60249,max_df=0.75,ngram_range=(2,3))
    vectorized_goals = pd.DataFrame(vectorizer.fit_transform(df0["goalCleaned"]).toarray(),
                                     columns = vectorizer.get_feature_names_out())


    print(f" vectorized_goals.shape = {vectorized_goals.shape}")
    vectorized_goals.head()
    
    n_components = n_topic

    lda = LatentDirichletAllocation(n_components = n_components)
    lda.fit(vectorized_goals)
    
    
    #report the most important topic for each review
    document_mixture = lda.transform(vectorized_goals)
    df0["most_important_topic"] = np.argmax(document_mixture, axis = 1)
    #Topic Mixture (of Words)
    topic_mixture = pd.DataFrame(lda.components_, columns = vectorizer.get_feature_names_out())
    
    print(f'\n Users in Cluster{cluster_id} cares about')
    print_topics(vectorizer, lda, topwords = 10)
    
    
    # extract important words for each user
    topic_word_mixture = [topic_word(vectorizer, lda, topic, topwords = 5, with_weights = False) 
                          for topic in lda.components_]
    
    df0["most_important_words"] = df0["most_important_topic"].apply(lambda i: topic_word_mixture[i])
    
    return df0
    
    
    

lda_key_words(df,cluster_id=0,n_topic=3, n_words=10)
    
    
    
    
    
# goals_dic={}
# for i in range(n_clusters):
#     print(f"\n------------Cluster{i}----------------")
#     goals=df0[df0.cluster_id==i][["most_important_topic","most_important_words"]]
    
#     print('the distribution of topics is')
#     print(round(goals["most_important_topic"].value_counts(normalize=True),2))
    
#     frequency = list(goals["most_important_topic"].value_counts().index)
    
#     print('\n The most frequent words are ')
#     print([topic_word_mixture[i] for i in frequency])
    
#     # # creat a DataFrame to save this 
#     # dfi=pd.DataFrame({
#     # 'Topics': 
    
#     #})
#     #goals_dic[f'Cluster{i}']=dfi
    

KeyError: 'goalCleaned'

# full pipeline


In [123]:
from sklearn.pipeline import make_pipeline
max_df = 0.75
max_features = 5000
ngram_range = (1,2)
n_components
# Pipeline Vectorizer + LDA
pipeline = make_pipeline(
    TfidfVectorizer(max_df = max_df,
                    ngram_range = ngram_range),
    LatentDirichletAllocation(n_components = 7)
)

# Fit the pipeline on the cleaned texts
pipeline.fit(df["goalCleaned"])

In [126]:
# Transform the original cleaned texts with the pipeline
# Indeed, there is no need to get the vectorized texts first since it's done through the Pipeline

# Document Mixture with the Pipeline:
document_mixture = pipeline.transform(df["goalCleaned"]) 
document_mixture.shape 

# Topic Mixture with the Pipeline:
topic_mixture = pd.DataFrame(pipeline._final_estimator.components_,columns = pipeline[0].get_feature_names_out())
topic_mixture

Unnamed: 0,_considers,_considers potential,_identifies,_identifies problems,aaa,aaa perform,ab,ab course,aba,aba etc,...,zone,zone completion,zoom,zoom session,zostavax,zostavax need,zoster,zoster elderly,zwarteveen,zwarteveen kim
0,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,...,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,0.246977,0.246977,0.178091,0.178091
1,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,...,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857
2,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,...,0.142857,0.142857,0.264174,0.264174,0.142857,0.142857,0.144909,0.144909,0.142857,0.142857
3,0.142857,0.142857,0.142857,0.142857,0.267509,0.267509,0.142858,0.142858,0.142857,0.142857,...,0.142857,0.142857,0.142858,0.142858,0.254027,0.254027,0.142858,0.142858,0.142858,0.142858
4,0.242838,0.242838,0.242838,0.242838,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,...,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,0.142858,0.142858,0.142858,0.142858
5,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142858,0.142858,0.1922,0.1922,...,0.495231,0.495231,0.142858,0.142858,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857
6,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.677532,0.677532,0.142857,0.142857,...,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857,0.142858,0.142858,0.142857,0.142857


# Specialities



In [76]:
txt_cols=['specialities', 'population', 'focus','metaGoalTitle']
df_txt=df.copy().loc[:,txt_cols]
df_txt=df_txt.select_dtypes(include = ['object'])
df_txt.fillna('Unknown',inplace=True)

df_txt.specialities=df_txt.specialities.apply(lambda x: x.replace(" ","").replace(";",",").replace("/",",").split(","))


# get a list of all possible specialization
spec_list=[]
for row in df_txt.specialities:
    spec_list+=row

words=list(set(spec_list))
counts=[spec_list.count(word) for word in words]

spec_list_df=pd.DataFrame({
    'specialities': words,
    'count':counts

})

words

['',
 'ChildrenSurgical',
 'plastics',
 'Emergencyandcommunityparamedicine',
 'Cardiacpractice',
 'DiabetesEducation&CriticalCare',
 'Generalnursingincludingrespiratory',
 'PostAcuteNursingCare',
 'subacutenursing',
 'airwaymanagement',
 'Cannulation',
 'RemoteAreaNurse',
 'OperatingTheatres',
 'PalliativeandSupportivecare',
 'ClinicalProcurement',
 'PACUandpracticemanageranaesthetics',
 'emer',
 'Retrieval',
 'GPMP',
 'Peercounseling',
 'BreastCancerCareNursing',
 'dementiaaddivtionquality',
 'Cancer',
 'Medicalandsurgicalandagedcare',
 'semiacute',
 'oncologywoundcare',
 'Perioperative',
 'PrehospitalEmergencyCare',
 'ChronicDiseasePalliativeCareAgedCare',
 'AcuteCare',
 'Test',
 'BoneandMineralDisorders',
 'subacute',
 'allhealthcareareas',
 'HACCHospitals',
 'Donorservices',
 'medsurgorthopaedic',
 'Neonatalcriticalandspecialcare',
 'Clinical-nurse-educator',
 'Auditing',
 'daysurgery',
 'Subacutecare',
 'Pressureinjury',
 'Trauma',
 'smallprocedures',
 'medicalnursing',
 'Cssd',
 

In [178]:
df.specialities.str.split(',')
df.focus.unique()

array(['clinical practice', 'nurse education', 'care coordination',
       'management', 'Provision of Care', nan, 'Patient care',
       'health promotion', 'patient education',
       'Immunisation, wound care, assisting medical team with procedures, recording INR and performing ECGs',
       'Other', 'Sales of point of care testing devices',
       'Small procedures in Private Rooms,and post care for patients.',
       'Health professional education', 'emergency care',
       'Administering Medication\nWound Care \nAssisting with Medical Officer Round ',
       'Looking after the residents giving medications doing the treatments. ',
       'Chronic disease ', 'other', 'carer training',
       'Pre hospital care ', 'research',
       'not in heathl care at the moment', 'Perioperative Nursing',
       'GP Shared Care program in addition to providing clinical care in periods of high demand.',
       'Currently unemployed and doing Certificate 111 Phlebotomy.',
       'Pt care', 'Parame

In [87]:
import nltk
print(nltk.pos_tag(['feet']))

[('feet', 'NNS')]
