# Домашнее задание 2

In [384]:
#установка неоходимых модулей
#pip install gensim
#!pip install razdel
#!pip install pymorphy2

In [385]:
#импорт необходимых библиотек
import pandas as pd

#предобработка текстов
from gensim.corpora.dictionary import Dictionary
import re
import numpy as np
import nltk
nltk.download('stopwords')
#from nltk.tokenize import word_tokenize
from razdel import tokenize # https://github.com/natasha/razdel
import pymorphy2

#алгоритм тематического моделирования
#(каждый документ рассматривается как набор тем в определенной пропорци, 
#каждая тема -- набор ключевых слов в определенной пропорции.
#Алгоритм отображает распределение тем (гиперапараметр) в документах и 
#распределение ключевых слов по темам (набор доминирующих ключевых слов)
from gensim.models import LdaModel

#для сохранения модели
from gensim.test.utils import datapath

#модель линейной регрессии
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#метрики качества
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

import itertools

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BazhanovaEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 1. Загрузка данных

Загрузка датасета с новостями

In [386]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


Загрузка датасета с пользователями

In [387]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


#### 2. Векторные представления новостей

Создание стоп-слов

In [388]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [389]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

Функция очистки текста

In [390]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

Применение функции очистки текста

In [391]:
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

Применение лемматизации текста

In [392]:
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)
news['title']

0        [заместитель, председатель, правительство, рф,...
1        [матч, финал, кубок, россия, футбол, приостано...
2        [форвард, авангард, томаш, заборский, прокомме...
3        [главный, тренер, кубань, юрий, красножанин, п...
4        [решение, попечительский, совет, владивостокск...
                               ...                        
26995    [учёный, токийский, университет, морской, наук...
26996    [глава, кафедра, отечественный, история, xx, в...
26997    [американский, учёный, уточнить, возраст, расп...
26998    [последний, год, тропический, углеродный, цикл...
26999    [жить, примерно, тыс, год, назад, территория, ...
Name: title, Length: 27000, dtype: object

In [393]:
#сформируем список текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [394]:
#в скольких документах встречалось слово
common_dictionary[5], common_dictionary.dfs[5]

('банк', 1526)

In [395]:
#получили словарь из слов
#список списков из кортежей (id слова, число появлений слова в тексте)
common_corpus

[[(0, 5),
  (1, 2),
  (2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 4),
  (17, 1),
  (18, 1),
  (19, 4),
  (20, 1),
  (21, 1),
  (22, 3),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 3),
  (59, 1),
  (60, 2),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 3),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

#### 3. Обучение модели

Обучение модели на словаре слов

In [396]:
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Просмотр получившихся тем

In [397]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Слова, соответствующие теме
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: год который станция новый операция компания стать
topic_1: китай космос китайский турция иран nn санкция
topic_2: кость бомба мэй нация реактор джеймс занятость
topic_3: который год ребёнок это человек свой журнал
topic_4: это который мочь свой год говорить всё
topic_5: конкурс предприниматель бизнесмен сектор маршрут вдвое горный
topic_6: грунт таиланд су казахстан азербайджан вмф ток
topic_7: мозг активность доклад компьютер организм нервный втб
topic_8: nn район который человек мужчина город это
topic_9: авария предсказать индонезия мак nreutersn бин голубеть
topic_10: год это который весь млрд nn первый
topic_11: военный наука остров вода год смерть академия
topic_12: украина украинский метод пенсия киев тело полиция
topic_13: исследование это nn человек день год который
topic_14: рубль год правительство средство строительство закон российский
topic_15: обнаружить задержать уголовный след полицейский двигатель сотрудник
topic_16: сша россия российский газ это проект страна

Сохранение модели

In [398]:
temp_file = datapath("model.lda")
lda.save(temp_file)

# Загрузка модели
#lda = LdaModel.load(temp_file)

#### 4. Векторное представление новостей

Функция, возвращающая векторное предствление новостей

In [399]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

Вектора новостей

In [400]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.025764,0.0,0.631213,0.0,0.0,0.0,0.0,0.0,...,0.0,0.264656,0.0,0.0,0.0,0.0,0.070657,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4688,0.0,0.093733,0.068027,0.0,0.269514,0.0,0.079906,0.0
2,4897,0.0,0.0,0.025724,0.0,0.3409,0.0,0.0,0.0,0.050598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.472736,0.0,0.0,0.0
3,4898,0.0,0.0,0.0,0.0,0.318323,0.0,0.0,0.0,0.0,...,0.0,0.371546,0.0,0.0,0.0,0.0,0.194501,0.0,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.149238,0.0,0.0,0.059742


#### 5. Векторное представление пользователей

In [401]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [402]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))
doc_dict

{6: array([0.        , 0.02576408, 0.        , 0.63121313, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26465583, 0.        , 0.        , 0.        ,
        0.        , 0.07065655, 0.        , 0.        , 0.        ]),
 4896: array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46880031, 0.        , 0.09373312, 0.06802675,
        0.        , 0.26951405, 0.        , 0.0799059 , 0.        ]),
 4897: array([0.        , 0.        , 0.02572438, 0.        , 0.34090018,
        0.        , 0.        , 0.        , 0.05059827, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.09220961,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.4727362

Функция, возвращающая вектор тем одного пользователя

In [403]:
def get_user_embedding(user_articles_list, method):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = method(user_vector, 0)
    return user_vector

In [404]:
#вектор тем для конкретного пользователя и с конкретным методом
user_articles_list = users['articles'].iloc[5]
get_user_embedding(user_articles_list, np.mean)

array([0.02404356, 0.02241845, 0.        , 0.17787779, 0.11477107,
       0.        , 0.        , 0.        , 0.08984884, 0.        ,
       0.18698725, 0.0304109 , 0.03021234, 0.10628097, 0.08465122,
       0.02178458, 0.08543151, 0.        , 0.        , 0.        ,
       0.00275091, 0.        , 0.        , 0.00963382, 0.        ])

Функция, возвращающая вектора тем всех пользователей

In [405]:
def all_user_embeddings(df_users, method):
    user_embeddings = pd.DataFrame([i for i in df_users['articles'].apply(lambda x: get_user_embedding(x, method), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = df_users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    return user_embeddings

In [406]:
#пример работы функции для конкретного метода
all_user_embeddings(users, np.mean).head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.130645,0.0,0.002056,0.330833,0.030376,0.0,0.0,0.0,0.076299,...,0.0,0.011618,0.027364,0.00251,0.00289,0.005176,0.019625,0.005961,0.0,0.0
1,u108690,0.00576,0.033206,0.0,0.211939,0.198386,0.002393,0.0,0.0,0.026144,...,0.020411,0.070931,0.0,0.021765,0.0,0.0,0.003866,0.0,0.022795,0.001799
2,u108339,0.061817,0.002611,0.0,0.167127,0.100577,0.0,0.0,0.0,0.06661,...,0.017843,0.043473,0.0,0.026885,0.0,0.0,0.012526,0.0,0.0,0.003877


#### 6. Прогнозирование оттока пользователей

In [407]:
#Загрузка датасета с целевой переменной
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


Функция прогнозирования оттока пользователей

In [408]:
def outflow_forecasting(df_users, df_target, method):
    #создание векторов тем для каждого пользователя
    user_embeddings = all_user_embeddings(df_users, method)
    
    #Создание датасета для модели
    X = pd.merge(user_embeddings, df_target, 'left')

    #Разделение датасета X на test и train
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

    #Построение и обучение модели линейной регрессии
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    
    #Прогнозирование вероятности целевого класса на test
    preds = logreg.predict_proba(X_test)[:, 1]
    
    #рассчет метрик качества
    roc_auc = roc_auc_score(y_test, preds)
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)

    return {'roc_auc_score': roc_auc, 'precision_score': precision[ix], 'recall_score': recall[ix], 'f_score': fscore[ix]}

Проверка

In [409]:
#Пример работы функции для конкретного метода
outflow_forecasting(users, target, np.mean)

{'roc_auc_score': 0.9525577068434212,
 'precision_score': 0.6354515050167224,
 'recall_score': 0.7755102040816326,
 'f_score': 0.6985294117647058}

In [410]:
methods = [np.mean, np.median, np.max]
methods_col = ['mean', 'median', 'max']
roc_auc_col = []
precision_score_col = []
recall_score_col = [] 
f_score_col = []

for method in methods:
    metrics = outflow_forecasting(users, target, method)
    roc_auc_col.append(metrics['roc_auc_score'])
    precision_score_col.append(metrics['precision_score'])
    recall_score_col.append(metrics['recall_score'])
    f_score_col.append(metrics['f_score'])

df = pd.DataFrame({'method': methods_col, 'roc_auc_score': roc_auc_col, 'precision_score': precision_score_col, 'recall_score': recall_score_col, 'f_score': f_score_col})
df

Unnamed: 0,method,roc_auc_score,precision_score,recall_score,f_score
0,mean,0.952558,0.635452,0.77551,0.698529
1,median,0.973782,0.723183,0.853061,0.782772
2,max,0.975957,0.762963,0.840816,0.8


**Вывод:** Лучшие показатели метрик у метода max, благодаря тому, что при составлении векторов тем пользователей выбирается наиболее вероятная тема.