In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

In [4]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report
from scikitplot.metrics import plot_confusion_matrix

In [5]:
data = pd.read_csv('Dataset - NLP Assignment.csv')

In [6]:
data.head()

Unnamed: 0,ID,MEMBER_ID,REASONNPSSCORE__C
0,a2p1U000000RowfQAC,0011U00000rjFKdQAM,"I showed up for my appointment, but they had m..."
1,a2p1U000000RqQqQAK,0011U00000riCSHQA2,"Staff was polite, courteous, and on time"
2,a2p1U000000RqXyQAK,0011U00000riTw7QAE,Overall care is great! It's wonderful to be a...
3,a2p1U000000Rq1LQAS,0011U00000rhu8eQAA,Like the doctor and staff at this location. Ea...
4,a2p1U000000RpiuQAC,0011U00000rk4SHQAY,The convenience and the doctors


In [7]:
# checking null values
data.isnull().sum()

ID                   0
MEMBER_ID            0
REASONNPSSCORE__C    0
dtype: int64

In [8]:
# information
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3812 entries, 0 to 3811
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ID                 3812 non-null   object
 1   MEMBER_ID          3812 non-null   object
 2   REASONNPSSCORE__C  3812 non-null   object
dtypes: object(3)
memory usage: 89.5+ KB


In [9]:
data.drop(columns=['ID','MEMBER_ID'],inplace=True)

In [10]:
data

Unnamed: 0,REASONNPSSCORE__C
0,"I showed up for my appointment, but they had m..."
1,"Staff was polite, courteous, and on time"
2,Overall care is great! It's wonderful to be a...
3,Like the doctor and staff at this location. Ea...
4,The convenience and the doctors
...,...
3807,yes
3808,Very skeptical that you will soon be without a...
3809,"Doctor Malfese is easily accessible, staff is ..."
3810,Friendly Staff that can be reached at any time


In [11]:
data['REASONNPSSCORE__C'] = data['REASONNPSSCORE__C'].astype(str)

In [12]:
data

Unnamed: 0,REASONNPSSCORE__C
0,"I showed up for my appointment, but they had m..."
1,"Staff was polite, courteous, and on time"
2,Overall care is great! It's wonderful to be a...
3,Like the doctor and staff at this location. Ea...
4,The convenience and the doctors
...,...
3807,yes
3808,Very skeptical that you will soon be without a...
3809,"Doctor Malfese is easily accessible, staff is ..."
3810,Friendly Staff that can be reached at any time


In [13]:
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
corpus = []
for i in range(len(data['REASONNPSSCORE__C'])):
    review=re.sub('[^a-z-A-Z]', ' ',data['REASONNPSSCORE__C'][i])
    review=review.lower()
    review=review.split()
    
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review= ' '.join(review)
    corpus.append(review)

In [14]:
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [15]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
NUM_TOPICS = 10
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in data['REASONNPSSCORE__C']:
    tokenized_data.append(clean_text(text))
 
 
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
 
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
 
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
 
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

[(119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1)]


In [16]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

LDA Model:
Topic #0: 0.049*"service" + 0.037*"good" + 0.022*"like" + 0.018*"great" + 0.017*"doctor" + 0.016*"convenient" + 0.014*"convenience" + 0.014*"time" + 0.012*"xyz" + 0.012*"doctors"
Topic #1: 0.021*"doctor" + 0.016*"convenient" + 0.016*"time" + 0.013*"care" + 0.012*"get" + 0.011*"appointment" + 0.010*"recommend" + 0.009*"xyz" + 0.008*"great" + 0.007*"really"
Topic #2: 0.053*"time" + 0.031*"doctor" + 0.017*"like" + 0.017*"knowledgeable" + 0.014*"great" + 0.013*"patient" + 0.012*"felt" + 0.012*"take" + 0.012*"care" + 0.011*"staff"
Topic #3: 0.035*"appointment" + 0.029*"friendly" + 0.021*"quick" + 0.018*"staff" + 0.017*"care" + 0.016*"service" + 0.016*"easy" + 0.016*"get" + 0.014*"thorough" + 0.013*"visit"
Topic #4: 0.029*"doctor" + 0.027*"staff" + 0.020*"friendly" + 0.020*"time" + 0.017*"appointment" + 0.013*"call" + 0.011*"get" + 0.011*"office" + 0.011*"wait" + 0.009*"xyz"
Topic #5: 0.032*"easy" + 0.021*"doctor" + 0.021*"get" + 0.020*"time" + 0.017*"nurse" + 0.012*"appointment" 

In [17]:
text = "very friendly staff"
bow = dictionary.doc2bow(clean_text(text))

In [18]:
print(lsi_model[bow])
print(lda_model[bow])

[(0, 0.1753811616481526), (1, 0.196723408305234), (2, 0.809425730794846), (3, 0.3971437907201182), (4, -0.32229746263794345), (5, -0.5232440919058722), (6, -0.42113470001695125), (7, 0.5079716521675984), (8, 0.2418566606838021), (9, -0.1672235036649301)]
[(0, 0.033336245), (1, 0.033336535), (2, 0.033337314), (3, 0.033343922), (4, 0.033344388), (5, 0.03333435), (6, 0.033334147), (7, 0.033339266), (8, 0.03333478), (9, 0.69995904)]


In [19]:
from gensim import similarities
 
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 
# Top most similar documents:
print(similarities[:10])
# Let's see what's the most similar document
document_id, similarity = similarities[0]
print([document_id][:1000])

[(106, 1.0), (223, 1.0), (469, 1.0), (523, 1.0), (662, 1.0), (700, 1.0), (710, 1.0), (760, 1.0), (778, 1.0), (1018, 1.0)]
[106]


In [20]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data['REASONNPSSCORE__C'])
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
 
# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

(3812, 10)
(3812, 10)
(3812, 10)
[0.07346626 0.00666725 0.11486915 0.00666676 0.0768489  0.69481206
 0.0066673  0.00666697 0.00666699 0.00666836]
[0.24601192 0.03145724 0.31331224 0.         0.17955076 0.
 0.01871882 0.         0.         0.01151915]
[ 2.06303275  0.48396848 -1.30098383 -0.61124799  0.85166588  0.80107123
 -0.17609617  0.31362672  0.14046222  0.4824404 ]




In [21]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('quick', 151.7469246549508), ('wait', 150.52120826621388), ('long', 95.09726692235542), ('convenience', 69.19933129825102), ('needs', 59.446755751148494), ('waiting', 56.21815888099711), ('lot', 55.260642580166085), ('need', 49.50076505619195), ('good', 48.92519765035426), ('cost', 46.4821361543179)]
Topic 1:
[('time', 488.943309057142), ('doctor', 277.5732934041581), ('like', 256.92070563140345), ('feel', 227.55811760419184), ('don', 163.01104860602652), ('health', 153.3040093401519), ('took', 94.90819583871337), ('questions', 89.23303790595284), ('know', 89.19358608676733), ('care', 86.13709888463434)]
Topic 2:
[('friendly', 435.0086358654053), ('staff', 362.07712325561704), ('professional', 128.58703319288523), ('helpful', 121.39697556665477), ('recommend', 102.95715161082708), ('fast', 79.22152641575941), ('service', 73.77151952893794), ('doctor', 68.6211118604052), ('primary', 56.00592682682405), ('comfortable', 55.351193056829956)]
Topic 3:
[('knowledgeable'



In [22]:
text = "the staff is good"
x = nmf_model.transform(vectorizer.transform([text]))[0]
print(x)

[0.         0.         0.14701052 0.         0.         0.
 0.         0.         0.         0.2087578 ]


In [23]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    dists = euclidean_distances(x.reshape(1, -1), Z)
    pairs = enumerate(dists[0])
    most_similar = sorted(pairs, key=lambda item: item[1])[:top_n]
    return most_similar
 
similarities = most_similar(x, nmf_Z)
document_id, similarity = similarities[0]
print([document_id][:1000])

[1911]


In [24]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [25]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)
 
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(data))
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [26]:
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
 
df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)



In [32]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data['REASONNPSSCORE__C'])
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
 
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

  and should_run_async(code)
  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


[0.03333333 0.03333625 0.03333333 0.03333333 0.03333334 0.03333333
 0.36668547 0.36664495 0.03333333 0.03333333] 1.0


In [33]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

  and should_run_async(code)


#### Larger topics are more frequent in the corpus.
#### Topics closer together are more similar, topics further apart are less similar.
#### When you select a topic, you can see the most representative words for the selected topic. This measure can be a combination of how frequent or how discriminant the word is. 
#### Hovering over a word will adjust the topic sizes according to how representative the word is for the topic.