In [18]:
#Анализ информации с #https://webdevblog.ru/tematicheskoe-modelirovanie-s-pomoshhju-gensim-python/
# и # https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

# скачивание библиотек 
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# medium.com ссылка на БД, данные этого типа можно открыть с помощью excel https://lumpics.ru/how-to-open-csv/
df = pd.read_csv('googlePlayStore_review_LDA.csv', error_bad_lines=False)
df = df.dropna(subset=['Translated_Review'])
df.head(14)

Unnamed: 0.1,Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Topic_key_word
0,0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,Account Problem
1,1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,Download/Internet Access
2,3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,Language/Recommend/Screen Size
3,4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,Notification/Support
4,5,10 Best Foods for You,Best way,Positive,1.0,0.3,Notification/Support
5,6,10 Best Foods for You,Amazing,Positive,0.6,0.9,Notification/Support
6,8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0,Account Problem
7,9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0,Account Problem
8,10,10 Best Foods for You,good you.,Positive,0.7,0.6,Notification/Support
9,11,10 Best Foods for You,Useful information The amount spelling errors ...,Positive,0.2,0.1,Notification/Support


In [43]:
# представление в виде листа
data = df.Translated_Review.values.tolist()
# удаление имейлов
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# удаление знаков припенания
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# удаление ковычек '
data = [re.sub(r"\'", "", sent) for sent in data]
pprint(data[:1])

['I like eat delicious food. Thats Im cooking food myself, case "10 Best '
 'Foods" helps lot, also "Best Before (Shelf Life)"']


In [44]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True удаляет знак пунктуации
# перевод предложений в слова        
data_words = list(sent_to_words(data))
print(data_words[:1])

[['like', 'eat', 'delicious', 'food', 'thats', 'im', 'cooking', 'food', 'myself', 'case', 'best', 'foods', 'helps', 'lot', 'also', 'best', 'before', 'shelf', 'life']]


In [45]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out
# выполнение лемматизации (перевод слова в наипростойший его вид), работает с глаголами, существительными, прилагательными и причастиями

In [46]:
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB']) #выбирает сущиствительные и глаголы
print(data_lemmatized[:2])

['eat food s be cook food case food help lot shelf life', 'help eat exercise basis']


In [47]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
# минимальные значения reqd в слове 
                             stop_words='english',             
# удаляет стоп слова
                             lowercase=True,                   
# переводит все свова в малый шрифт
                             token_pattern='[a-zA-Z0-9]{3,}',  
# слова содержат > 3 символов
                            max_features=50000,             
# максимальное количество слов uniq    
                            )
data_vectorized = vectorizer.fit_transform(data_lemmatized)


In [48]:
#  LDA Модель
lda_model = LatentDirichletAllocation(n_components=20,               # Количество тем
                                      max_iter=10,               
# иаксимальное число итераций
                                      learning_method='online',   
                                      random_state=100,          
# Random state
                                      batch_size=128,            
# n документов вкаждой обучающей итерации
                                      evaluate_every = -1,       
# вычислите несостыковки каждые n итераций, по умолчанию: не надо
                                      n_jobs = -1,               
# Использовать все возможные процессоры
                                     )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # параметры модели

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [49]:
# Логарифмическая вероятность: чем выше, тем лучше
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Смещение: чем ниже, тем лучше. смещение = exp(-1. * лог-вероятность на слово)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

pprint(lda_model.get_params())

Log Likelihood:  -2047855.4042189221
Perplexity:  1010.992229510251
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [50]:
# Определение поисковых параметров
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Инициализация модели
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Инициализация графика для поиска оптимальной модели
model = GridSearchCV(lda, param_grid=search_params)
# Сделать поиск по графику
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
        iid=True, n_jobs=1,
       param_grid={ 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method=None,
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [51]:
# Лучшая модель
best_lda_model = model.best_estimator_
# Параметры модели
print("Best Model's Params: ", model.best_params_)
# Логарифмической вероятности счет
print("Best Log Likelihood Score: ", model.best_score_)
# Смещение
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -682868.2210066896
Model Perplexity:  745.1681902680335


In [52]:
# Создание документа — Topic Matrix (каждый документ- это строка)
lda_output = best_lda_model.transform(data_vectorized)
# Название колон
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# Индекс имен
docnames = ["Doc" + str(i) for i in range(len(data))]
# Делаем  pandas ДБ
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Получаем доминирующую тему для каждого документа
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Стиль 
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return 'font-weight: {weight}'.format(weight=weight)
# Применяем стиль
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.01,0.01,0.01,0.01,0.01,0.91,0.01,0.01,0.01,0.01,5
Doc1,0.02,0.47,0.02,0.02,0.02,0.37,0.02,0.02,0.02,0.02,1
Doc2,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.77,0.03,0.03,7
Doc3,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,7
Doc4,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1
Doc5,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc6,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1
Doc7,0.2,0.02,0.02,0.02,0.02,0.42,0.02,0.02,0.25,0.02,5
Doc8,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc9,0.01,0.01,0.01,0.01,0.52,0.27,0.13,0.01,0.01,0.01,4


In [53]:
# Матрица ключевых тематических слов
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Назначение столбца и индекса
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# Просмотр
df_topic_keywords.head(10)

Unnamed: 0,abandon,ability,abuse,accept,access,accessory,accident,accommodation,accomplish,accord,...,yard,yardage,year,yesterday,yoga,youtube,zip,zombie,zone,zoom
Topic0,0.103101,22.533987,0.137453,0.106028,0.145951,0.102304,4.021734,0.109082,10.091014,39.6443,...,0.106333,2.640943,623.268152,0.105581,0.126286,0.103588,0.104383,0.103681,0.107657,0.102542
Topic1,0.102084,0.106713,0.11034,0.113076,0.102733,10.864544,4.080986,0.109971,3.681558,0.103889,...,0.106721,0.518119,9.380789,0.105139,0.104323,0.112643,0.571572,0.106857,0.299574,32.349009
Topic2,0.32984,0.102598,0.372234,0.10244,0.130243,0.110356,0.106858,0.102286,0.109104,0.102076,...,0.10173,0.16743,172.196196,0.103916,0.102372,0.110872,0.132471,15.932525,14.517187,0.104326
Topic3,13.925415,0.117014,0.110713,62.898049,354.332934,0.101966,0.113849,0.110195,0.102764,0.102409,...,1.650556,0.761813,0.155832,0.103092,0.111545,54.271845,0.149447,0.10819,0.326489,28.708928
Topic4,0.104097,1.081449,0.150666,0.103543,0.218386,0.112976,0.10494,0.104079,0.103065,0.127591,...,0.113832,0.268793,49.673073,2.487472,50.396425,0.103758,0.10824,0.277668,0.105378,10.861245
Topic5,0.111952,86.977672,1.280292,0.103523,0.119549,0.113185,0.10231,9.986152,0.105586,0.110693,...,0.103671,6.281533,23.08726,0.10296,0.104129,0.105476,0.101731,0.114951,0.102276,0.10807
Topic6,0.102973,15.93291,0.102345,34.234885,203.801347,0.103184,0.106111,0.102406,0.141298,0.103284,...,0.102304,1.808783,65.645394,25.547233,0.107675,0.110253,0.104007,0.10355,10.233266,0.142608
Topic7,11.03998,89.248515,8.672795,0.103298,1.020447,0.102162,0.101918,0.103705,0.10586,0.111832,...,0.118607,15.616988,168.744166,54.267998,0.252732,15.887032,0.113292,0.102026,0.109462,0.456546
Topic8,0.161943,7.441495,0.211621,2.001336,1.770109,0.102317,0.102607,0.101739,1.059646,0.104442,...,13.923,0.406436,138.661628,0.141585,4.174419,0.30022,0.104665,0.109028,1.330913,0.105551
Topic9,0.102404,51.02482,4.921801,15.50601,0.856515,0.102479,2.524751,0.102047,0.174812,0.103131,...,0.102784,0.94259,96.568847,0.120616,0.104714,0.301564,16.551021,2.843389,1.855875,0.108774


In [54]:
# Показать самые используемые ключевые слова для кадждой темы
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Тема-ключевое слово БД
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,use,need,thank,recommend,app,year,news,date,download,want,story,enjoy,start,person,car
Topic 1,problem,look,way,picture,think,color,deal,work,order,understand,choice,episode,help,product,customer
Topic 2,game,play,level,fun,make,money,player,spend,start,thing,character,buy,think,graphic,control
Topic 3,screen,video,hate,change,guy,access,datum,enter,application,second,file,button,article,force,job
Topic 4,star,thing,make,photo,way,month,fix,lose,item,miss,track,content,share,help,bar
Topic 5,lot,help,book,price,check,food,option,learn,developer,choose,feature,information,stay,thank,bit
Topic 6,time,try,account,let,work,card,ask,need,log,device,waste,crash,watch,want,load
Topic 7,update,work,app,version,people,know,user,read,notification,option,review,page,want,stop,note
Topic 8,phone,make,day,money,send,minute,email,try,life,save,app,instal,site,receive,purchase
Topic 9,love,add,pay,say,fix,tell,want,thing,star,app,work,list,thank,load,like


In [55]:
# по возможности темы вводятся самостоятельно
Topics = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords["Topics"]=Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Topics
Topic 0,use,need,thank,recommend,app,year,news,date,download,want,story,enjoy,start,person,car,Topic 0
Topic 1,problem,look,way,picture,think,color,deal,work,order,understand,choice,episode,help,product,customer,Topic 1
Topic 2,game,play,level,fun,make,money,player,spend,start,thing,character,buy,think,graphic,control,Topic 2
Topic 3,screen,video,hate,change,guy,access,datum,enter,application,second,file,button,article,force,job,Topic 3
Topic 4,star,thing,make,photo,way,month,fix,lose,item,miss,track,content,share,help,bar,Topic 4
Topic 5,lot,help,book,price,check,food,option,learn,developer,choose,feature,information,stay,thank,bit,Topic 5
Topic 6,time,try,account,let,work,card,ask,need,log,device,waste,crash,watch,want,load,Topic 6
Topic 7,update,work,app,version,people,know,user,read,notification,option,review,page,want,stop,note,Topic 7
Topic 8,phone,make,day,money,send,minute,email,try,life,save,app,instal,site,receive,purchase,Topic 8
Topic 9,love,add,pay,say,fix,tell,want,thing,star,app,work,list,thank,load,like,Topic 9


In [56]:
# Определите тему для данного текстового документа.
nlp = spacy.load('en', disable=['parser', 'ner'])
def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization
    mytext_2 = list(sent_to_words(text))
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    mytext_4 = vectorizer.transform(mytext_3)
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
    
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return infer_topic, topic, topic_probability_scores
# Predict the topic
mytext = ["Experience with content management systems a major plus (any blogging counts!)Familiar with the Food52 editorial voice and aestheticLoves food, appreciates the importance of home cooking and cooking with the seasonsMeticulous editor, perfectionist, obsessive attention to detail, maddened by typos and broken links, delighted by finding and fixing themCheerful under pressureExcellent communication skillsA+ multi-tasker and juggler of responsibilities big and smallInterested in and engaged with social media like Twitter, Facebook, and PinterestLoves problem-solving and collaborating to drive Food52 forwardThinks big picture but pitches in on the nitty gritty of running a small company (dishes, shopping, administrative support)Comfortable with the realities of working for a startup: being on call on evenings and weekends, and working long hours"]
infer_topic, topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(infer_topic)

['work', 'app', 'version', 'people', 'know', 'user', 'read', 'notification', 'option', 'review', 'page', 'want', 'stop']
Topic 7


In [58]:
# Тестирование.
data1 = df.Translated_Review.values.tolist()
list1=[]
list2=[]
list3=[]
text1=data1[0]
z=0
x=0
for txt in data1:
    text1=txt
    list1.append(text1)
    infer_topic, topic, prob_scores = predict_topic(text=list1)
    list2.append(int(infer_topic[6]))
    list1.clear()
for i in range(len(list2)):
    z=z+1
    if dominant_topic[i]==list2[i]:
        x=x+1
print(x,z)

29721 37427
