In [3]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from gensim.summarization.summarizer import summarize 
from gensim.summarization import keywords 
import wikipedia 

from sklearn import *
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
raw_data= pd.read_csv("digsite_text_data.csv", encoding='cp1252')
raw_data.drop(['Unnamed: 5'], inplace = True, axis = 1)
raw_data = raw_data.dropna()
raw_data

Unnamed: 0,Question,QuestionType,QuestionText,Respondent,Response
0,Introductions,Conversation,**Please introduce yourself to the group by te...,John Stephenson,Microsoft\n\nWeb APIs\n\nConfidential
1,Introductions,Conversation,**Please introduce yourself to the group by te...,Ajay Sainy,I am in a product based company that creates p...
2,Introductions,Conversation,**Please introduce yourself to the group by te...,Cole Harrison,I work for Microsoft's C+AI Org on the Common ...
3,Introductions,Conversation,**Please introduce yourself to the group by te...,Mahesh Pasupuleti,Work with Microsoft within Office 365. We deve...
4,Introductions,Conversation,**Please introduce yourself to the group by te...,Ravindra Chilukuri,"I am working for Microsoft as a vendor, Cureen..."
...,...,...,...,...,...
712,"Sr. VP, Developer Experience",Conversation,"In this exercise, imagine that you were just h...",Ravindra Chilukuri,"Provide the demos for simple uses cases, where..."
713,"Sr. VP, Developer Experience",Conversation,"In this exercise, imagine that you were just h...",Amit Dighe,1. an extremely thorough portal which would be...
714,"Sr. VP, Developer Experience",Conversation,"In this exercise, imagine that you were just h...",Bharath Bindumalam sreenivas,Integration with VS should be easier. I should...
715,"Sr. VP, Developer Experience",Conversation,"In this exercise, imagine that you were just h...",Saurabh Bansal,1. Make sure the onboarding is very seamless w...


In [5]:
for i in raw_data['Question'].unique():
    print(i)

Introductions
Main Goals
Platform Expectations
Strengths & Weaknesses
6 Stages
Resources Used
Performance Review
Ramping Up
Ultimate Match-up
Dear Microsoft Identity
Perfect Toolkit
Launch Checklist
One Thing
Best & Worst Support
Support Resources
vNext
Best Resources
Sr. VP, Developer Experience


In [124]:
data = raw_data[raw_data['Question'] == "Ultimate Match-up"]

In [125]:
def combine_text(data):
    paragraph_full = ["".join(i) for i in data["Response"]]
    paragraph_full = " ".join(paragraph_full)
    return paragraph_full

def clean_questions(text):
    text = re.sub("(\s\d+)","", text.lower())
    return text

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub("(\s\d+)","", text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    stop = stopwords.words('english')
    text = " ".join([w for w in [w for w in text.split()] if w not in stop])
    return text


def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]

In [126]:
paragraph = preprocessor(paragraph_full)

In [127]:
paragraph

'compare new relic tool add monitoring services three criteria good documentation b packages different platform easy integrate project c whats news migration guides new features tie great documentation b new relic easy versatile packages platform c lake area new relic faster understand get started surprised google apple listed talking tech providers identity platform providers community support documentation reliability without doubt regardless current employment pick microsoft winner microsoft essentially initially corporated make developers life easier ms products businesses developers good proof concept imo ms meets criteria mentioned 1 plus many aspects like security privacy dynamicity popularity accessibility etc microsoft easy onboarding user experience support tried rest heard okta auth0 seems much easy onboard clean user experience great support say tried rest reliability documentation supportability microsoft micorsoft offers good product good documentation support microsoft h

In [128]:
count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)

X = count.fit_transform(paragraph)

ValueError: Iterable over raw text documents expected, string object received.

In [129]:
lda = LatentDirichletAllocation(n_components=4,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)

In [130]:
lda.components_.shape

(4, 187)

In [131]:
n_top_words = 10
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                    for i in topic.argsort()\
                        [:-n_top_words - 1:-1]]))

NotFittedError: Vocabulary not fitted or provided

In [132]:
import en_core_web_lg

nlp = en_core_web_lg.load() 
doc = nlp(paragraph_full) 

In [193]:
def too_similar(summary, quesiton):
    summary = sent_tokenize(summary)
    
    for i in summary:
        tfidf = feature_extraction.text.TfidfVectorizer()
        text_fit1 = tfidf.fit_transform(quesiton)
        text_fit2 = tfidf.transform([i])
        similarity = metrics.pairwise.cosine_similarity(text_fit1, text_fit2)
        
        if similarity.any() > .75:
            summary.remove(i)
            
    return summary

In [213]:
for i in raw_data["Question"].unique():
    print(i)
    sum_data = raw_data[(raw_data['Question'] == i) & (raw_data['QuestionType'] == "Conversation")]
    question = sent_tokenize(" ".join(str(x) for x in sum_data["QuestionText"].unique()))
    for i in question:
        print(i)
    print("-"*70)
    x = combine_text(sum_data)
    summary = summarize(x, ratio=.2) 
    summary = too_similar(summary, question)
    for i in summary:
        print(i) 
    print("\n"*2)

Introductions
**Please introduce yourself to the group by telling everyone...** - The Microsoft team you work in.
- What kinds of applications or solutions you develop.
- The type of project you’re currently working on.
----------------------------------------------------------------------
I work for Microsoft's C+AI Org on the Common Data Services solution which supports Dynamics CRM and the Power Platform
Currently I work on the Web Servers team, modernizing our deployment and infrastructure story for the compute used by that solution.
My name is Jiankai Yu.
Provides contact information for subscription admins to internal communication teams  I work for Business Application Group organization in Microsoft.
We provide voice application services for Teams and SfB
I am mostly working on web applications right now and also involved with mobile development
working on stabilize and add monitoring to our services I'm currently working on Power Automate team and we build different solutions 

In [216]:
tfidf = feature_extraction.text.TfidfVectorizer()
text_fit1 = tfidf.fit_transform(["What are some of the things you have done to **ramp up on the Microsoft identity platform and capabilities**?"])
text_fit2 = tfidf.transform(["[What are some of the things you have done to ramp up on the Microsoft identity platform and capabilities?]"])
metrics.pairwise.cosine_similarity(text_fit1, text_fit2)

array([[1.]])

In [137]:
# Get wiki content. 
wikisearch = wikipedia.page("Latter Day Saints") 
wikicontent = wikisearch.content 
  
# Summary (0.5% of the original content). 
summ_per = summarize(wikicontent, ratio = 0.1) 
print("Percent summary") 
print(summ_per) 
print()

Percent summary
A minority of Latter Day Saint adherents, such as members of Community of Christ, believe in traditional Protestant theology, and have distanced themselves from some of the distinctive doctrines of the LDS Church.
This change resulted in the formation of a number of small sects who sought to maintain polygamy and other 19th-century doctrines and practices, now referred to as "Mormon fundamentalism".Other groups originating within the Latter Day Saint movement followed different paths in Missouri, Illinois, Michigan, and Pennsylvania.
The largest of these, Community of Christ (originally known as the "Reorganized Church of Jesus Christ of Latter Day Saints"), was formed in Illinois in 1860 by several groups uniting around Smith's son, Joseph Smith III.
The second-largest denomination is the Missouri-based Community of Christ (formerly the Reorganized Church of Jesus Christ of Latter Day Saints) which reports 197,000 members.
Small denominations that trace their origins t