## Task2

Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

**Импорт библиотек и инструментов**

In [271]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
np.seterr(invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [272]:
import re
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

In [273]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

%matplotlib inline

In [274]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

**Загрузка данных**

In [275]:
# Load the news articles; the preview shows the doc_id and title columns.
news = pd.read_csv('materials.csv')
news.head(3)

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [276]:
# Load per-user reading history; `articles` holds a stringified list of doc ids.
users = pd.read_csv("users_articles.csv")
users.head(10)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"
3,u101138,"[5933, 6186, 5055, 6977, 5206, 488389]"
4,u108248,"[707, 1144, 2532, 2928, 3133, 324592]"
5,u106662,"[323868, 323426, 324267, 322426, 324104, 1550]"
6,u105949,"[293138, 294471, 295012, 294736, 293949, 3544]"
7,u102457,"[6928, 5009, 6940, 7629, 7644, 512736]"
8,u104124,"[322838, 324699, 322991, 322120, 324327, 472331]"
9,u101386,"[7827, 6427, 7394, 7151, 6335, 487254]"


**Дополняем стоп-слова, очищаем текст от стоп-слов, символов, лемматизируем**

In [277]:
# Base Russian stop-word list from NLTK.
stopwords_ru = stopwords.words('russian')

In [278]:
# Extend the stop-word list with the extra words from stopwords.txt (one per line).
with open('stopwords.txt', encoding='UTF-8') as f:
    # A line read from a file is never empty (it still carries its '\n'),
    # so the stripped value is what has to be tested to drop blank lines.
    add_stopwords = [word for word in (line.strip() for line in f) if word]
stopwords_ru += add_stopwords

In [279]:
# Morphological analyzer used by lemmatization() to obtain normal forms.
morph = pymorphy2.MorphAnalyzer()

In [280]:
def clean_text(text):
    """Normalize raw article text before tokenization.

    Lower-cases the text, removes digits and punctuation, and turns line
    breaks and other whitespace into single spaces.

    Parameters
    ----------
    text : Any
        Raw text; non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    # Raw literal: the same pattern written as a normal string relied on the
    # invalid escape sequences `\s` and `\|`, which newer Python versions warn about.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # Drop digits and punctuation.
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    # Real line breaks and literal backslash sequences ("\s", "\n") become spaces.
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # Soft hyphens, whitespace characters and '+' become spaces.
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    # NOTE(review): this replaces EVERY Latin 'n'. It presumably targets the stray
    # 'n' left where newlines used to be in the dataset — confirm before reusing
    # the function on text that contains Latin words.
    text = text.replace('n', ' ')

    return text

# Memo of token -> normal form, filled by lemmatization() so each distinct token is parsed once.
cache = {}

def lemmatization(text):
    """Tokenize ``text``, lemmatize the tokens and drop stop words.

    Parameters
    ----------
    text : Any
        Cleaned text; non-string values are converted with ``str``.

    Returns
    -------
    list of str
        Normal forms of all tokens longer than one character (after removing
        a leading '-') that are not in ``stopwords_ru``.
    """
    if not isinstance(text, str):
        text = str(text)

    # Set membership is O(1); scanning the stop-word list for every lemma is not.
    stop_words = set(stopwords_ru)

    lemmas = []
    for token in tokenize(text):
        word = token.text
        if word.startswith('-'):
            word = word[1:]
        if len(word) <= 1:
            continue
        lemma = cache.get(word)
        if lemma is None:
            # First time this token is seen: parse it and remember the result.
            lemma = cache[word] = morph.parse(word)[0].normal_form
        lemmas.append(lemma)

    return [lemma for lemma in lemmas if lemma not in stop_words]

In [281]:
%%time
#Очистка текста
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: total: 1min 3s
Wall time: 1min 5s


In [282]:
%%time
#Лемматизация текста
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: total: 15min 42s
Wall time: 16min 9s


**Doc2Bow**

In [283]:
# Tokenized documents -> gensim dictionary -> bag-of-words corpus.
all_texts = list(news['title'].values)

common_dictionary = Dictionary(all_texts)
common_corpus = list(map(common_dictionary.doc2bow, all_texts))

In [284]:
%%time
lda = LdaModel(common_corpus, num_topics = 20, id2word=common_dictionary)

CPU times: total: 58.8 s
Wall time: 1min 30s


In [285]:
# Persist the trained model to disk.
# NOTE(review): datapath() is imported from gensim.test.utils, so the file presumably
# ends up inside gensim's own test-data directory — confirm that location is intended.
temp_file = datapath('model.lda')
lda.save(temp_file)

In [286]:
# Reload the model from the path stored in temp_file.
lda = LdaModel.load(temp_file)

**Разбиваем тексты по темам**

In [287]:
def get_lda_vector(text):
    """Return the 20-element topic-weight vector of a tokenized document.

    Topics the model does not report for the document get weight 0.
    """
    bow = common_dictionary.doc2bow(text)
    topic_weights = {topic_id: weight for topic_id, weight in lda[bow]}
    return np.array([topic_weights.get(topic_id, 0) for topic_id in range(20)])

In [288]:
# One row per article: doc_id followed by its 20 topic weights.
topic_matrix = pd.DataFrame(
    [get_lda_vector(text) for text in news['title'].values],
    columns=['topic_{}'.format(i) for i in range(20)],
)
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head()

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,0.0,0.040939,0.0,0.860255,0.0,0.0,0.0,0.078457,0.0,...,0.0,0.0,0.0,0.0,0.012478,0.0,0.0,0.0,0.0,0.0
1,4896,0.508708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.392924,0.0,0.0,0.0,0.0,0.0,0.0,0.075989,0.0
2,4897,0.071372,0.0,0.0,0.0,0.0,0.0,0.0,0.14146,0.0,...,0.312906,0.0,0.0,0.0,0.0,0.0,0.0,0.452655,0.0,0.0
3,4898,0.143635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338997,...,0.445054,0.0,0.0,0.0,0.0,0.0,0.0,0.06202,0.0,0.0
4,4899,0.067217,0.106137,0.0,0.703793,0.0,0.0,0.099292,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [289]:
# Lookup table: doc_id -> that article's row of 20 topic weights from topic_matrix.
doc_dict = dict(zip(topic_matrix['doc_id'].values,topic_matrix[['topic_{}'.format(i) for i in range(20)]].values))

**Характеризуем пользователей интересующими их темами**<br>
*с разными усреднениями(среднее, медиана, макс)*

In [290]:
def get_user_embedding_mean(user_articles_list):
    """Return a user's topic vector as the per-topic mean over the articles they read.

    Parameters
    ----------
    user_articles_list : str
        String form of a list of doc ids, e.g. ``"[293672, 293328]"``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a plain Python literal.
    KeyError
        If a doc id is missing from ``doc_dict``.
    """
    import ast  # local import keeps the cell self-contained

    # literal_eval only parses literals; eval() would execute whatever code the CSV contains.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.mean(article_vectors, 0)

In [291]:
def get_user_embedding_med(user_articles_list):
    """Return a user's topic vector as the per-topic median over the articles they read.

    Parameters
    ----------
    user_articles_list : str
        String form of a list of doc ids, e.g. ``"[293672, 293328]"``.

    Returns
    -------
    numpy.ndarray
        Element-wise median of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a plain Python literal.
    KeyError
        If a doc id is missing from ``doc_dict``.
    """
    import ast  # local import keeps the cell self-contained

    # literal_eval only parses literals; eval() would execute whatever code the CSV contains.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.median(article_vectors, 0)

In [292]:
def get_user_embedding_max(user_articles_list):
    """Return a user's topic vector as the per-topic maximum over the articles they read.

    Parameters
    ----------
    user_articles_list : str
        String form of a list of doc ids, e.g. ``"[293672, 293328]"``.

    Returns
    -------
    numpy.ndarray
        Element-wise maximum of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a plain Python literal.
    KeyError
        If a doc id is missing from ``doc_dict``.
    """
    import ast  # local import keeps the cell self-contained

    # literal_eval only parses literals; eval() would execute whatever code the CSV contains.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.max(article_vectors, 0)

In [293]:
# One row per user: uid followed by the mean-aggregated topic vector.
# The vectors are built with a plain comprehension: the previous
# `Series.apply(..., 1)` passed a stray `1` that Series.apply does not treat as an axis.
user_embeddings_mean = pd.DataFrame(
    [get_user_embedding_mean(articles) for articles in users['articles']],
    columns=['topic_{}'.format(i) for i in range(20)],
)
user_embeddings_mean.insert(0, 'uid', users['uid'].values)

In [294]:
# One row per user: uid followed by the median-aggregated topic vector.
# The vectors are built with a plain comprehension: the previous
# `Series.apply(..., 1)` passed a stray `1` that Series.apply does not treat as an axis.
user_embeddings_med = pd.DataFrame(
    [get_user_embedding_med(articles) for articles in users['articles']],
    columns=['topic_{}'.format(i) for i in range(20)],
)
user_embeddings_med.insert(0, 'uid', users['uid'].values)

In [295]:
# One row per user: uid followed by the max-aggregated topic vector.
# The vectors are built with a plain comprehension: the previous
# `Series.apply(..., 1)` passed a stray `1` that Series.apply does not treat as an axis.
user_embeddings_max = pd.DataFrame(
    [get_user_embedding_max(articles) for articles in users['articles']],
    columns=['topic_{}'.format(i) for i in range(20)],
)
user_embeddings_max.insert(0, 'uid', users['uid'].values)

In [296]:
# Attach the churn labels to each embedding table with a left join.
target = pd.read_csv("users_churn.csv")
X_mean, X_med, X_max = (
    pd.merge(embeddings, target, how='left')
    for embeddings in (user_embeddings_mean, user_embeddings_med, user_embeddings_max)
)

**Расчёт метрик для среднего**

In [297]:
# Train/test split on the mean-based embeddings; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_mean[['topic_{}'.format(i) for i in range(20)]], 
                                                    X_mean['churn'], random_state=26)

In [298]:
# Logistic regression with default hyper-parameters, fitted on the training part.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [299]:
# Keep the second predict_proba column as the score for each test user.
preds = logreg.predict_proba(X_test)[:,1]

In [300]:
# Precision/recall along the curve, then the point with the highest F-score (mean embeddings).
precision_mean, recall_mean, thresholds_mean = precision_recall_curve(y_test, preds)
fscore_mean = (2 * precision_mean * recall_mean) / (precision_mean + recall_mean)
# Points where precision + recall == 0 give NaN; nan_to_num turns them into 0.
fscore_mean = np.nan_to_num(fscore_mean)
ix = np.argmax(fscore_mean)
best_fscore_mean =  round(fscore_mean[ix],3)
best_precision_mean =  round(precision_mean[ix],3)
best_recall_mean =  round(recall_mean[ix],3)
rocauc_mean =  round(roc_auc_score(y_test, preds),3)

**Расчёт метрик для медианы**

In [301]:
# Same split seed, model and scoring as above, but on the median-based embeddings.
X_train, X_test, y_train, y_test = train_test_split(X_med[['topic_{}'.format(i) for i in range(20)]], 
                                                    X_med['churn'], random_state=26)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
preds = logreg.predict_proba(X_test)[:,1]

In [302]:
# Precision/recall along the curve, then the point with the highest F-score (median embeddings).
precision_med, recall_med, thresholds_med = precision_recall_curve(y_test, preds)
fscore_med = (2 * precision_med * recall_med) / (precision_med + recall_med)
# Points where precision + recall == 0 give NaN; nan_to_num turns them into 0.
fscore_med = np.nan_to_num(fscore_med)
ix = np.argmax(fscore_med)
best_fscore_med =  round(fscore_med[ix],3)
best_precision_med =  round(precision_med[ix],3)
best_recall_med =  round(recall_med[ix],3)
rocauc_med =  round(roc_auc_score(y_test, preds),3)

**Расчёт метрик для максимума**

In [303]:
# Same split seed, model and scoring as above, but on the max-based embeddings.
X_train, X_test, y_train, y_test = train_test_split(X_max[['topic_{}'.format(i) for i in range(20)]], 
                                                    X_max['churn'], random_state=26)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
preds = logreg.predict_proba(X_test)[:,1]

In [304]:
# Precision/recall along the curve, then the point with the highest F-score (max embeddings).
precision_max, recall_max, thresholds_max = precision_recall_curve(y_test, preds)
fscore_max = (2 * precision_max * recall_max) / (precision_max + recall_max)
# Points where precision + recall == 0 give NaN; nan_to_num turns them into 0.
fscore_max = np.nan_to_num(fscore_max)
ix = np.argmax(fscore_max)
best_fscore_max = round(fscore_max[ix],3)
best_precision_max =  round(precision_max[ix],3)
best_recall_max =  round(recall_max[ix],3)
rocauc_max =  round(roc_auc_score(y_test, preds),3)

**Объединяем результаты в одну таблицу**

In [305]:
# Best-F-score metrics per aggregation; the value order follows the 'Metrics' labels.
table = {
    'Metrics': ['F-Score', 'Precision', 'Recall', 'Roc Auc'],
    'Mean': [best_fscore_mean, best_precision_mean, best_recall_mean, rocauc_mean],
    'Median': [best_fscore_med, best_precision_med, best_recall_med, rocauc_med],
    'Max': [best_fscore_max, best_precision_max, best_recall_max, rocauc_max],
}

In [306]:
# Show the collected metrics as a table indexed by metric name.
pd.pivot_table(pd.DataFrame(table),
               index=["Metrics"])

Unnamed: 0_level_0,Max,Mean,Median
Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F-Score,0.758,0.56,0.629
Precision,0.745,0.425,0.571
Recall,0.771,0.821,0.7
Roc Auc,0.966,0.909,0.943


*Лучшие результаты(F-score) - для расчётов по максимуму. Возможное объяснение: учитываются ВСЕ интересующие пользователя темы с достаточным весом. При использовании среднего/медианы вес темы очень занизится, даже если он достаточно высок в 1-2 статьях. Модель может посчитать, что тема не интересна пользователю, раз он прочитал только 1 статью, хотя это может быть не так.*<br><br>
*Медиана показывает лучшие результаты, чем среднее, т.к. данных не так много, следовательно распределение будет далеко от нормального. А значит и среднее слишком смещено от медианы*<br><br>
*Если смотреть отдельно по метрикам, у среднего полнота лучшая, т.к. из усреднения берется всё подряд, ничего лишнего не упускается, зато из-за этого сильно страдает точность, тут max работает аккуратнее* 

**Посчитаем вес каждого документа по словам, встречающимся в этом документе: просуммируем tf-idf веса слов документа — так получим вес документа (в коде он хранится в столбце `idf`); умножим полученный ранее вектор тем документа на этот вес (его «уникальность»).**<br>
*Для вектора пользователя будем использовать максимум как самый успешный показатель из прошлых испытаний*

In [307]:
# TF-IDF vectorizer, fitted further down on the re-joined article texts.
tf_idf = TfidfVectorizer(encoding='utf-8')

In [308]:
#Очищенный и лемматизированный текст в виде отдельных слов соберем обратно в текст
news_tfidf = news.copy(deep=True)
temp = ''
for  i in range(news_tfidf.shape[0]):
    for k in range(len(news_tfidf['title'][i])):
        temp += news_tfidf['title'][i][k] + ' '
    news_tfidf['title'][i] = temp
    temp = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_tfidf['title'][i] = temp


In [309]:
# Векторизуем все тексты
texts_tfidf = [t for t in news_tfidf['title']]
X = tf_idf.fit_transform(texts_tfidf)

In [310]:
# Добавим к датафрейму значение idf для каждого документа (сумма idf его слов)
news_tfidf['idf'] = 0
for i in range(X.shape[0]):
    news_tfidf['idf'][i] = round(X[i].toarray().sum(), 5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_tfidf['idf'][i] = round(X[i].toarray().sum(), 5)


In [311]:
# Добавим значение idf в матрицу тем
topic_matrix['idf'] = news_tfidf['idf'].values
topic_matrix = topic_matrix[['doc_id']+['idf']+['topic_{}'.format(i) for i in range(20)]]
topic_matrix.head(3)

Unnamed: 0,doc_id,idf,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,8,0.0,0.040939,0.0,0.860255,0.0,0.0,0.0,0.078457,...,0.0,0.0,0.0,0.0,0.012478,0.0,0.0,0.0,0.0,0.0
1,4896,5,0.508708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.392924,0.0,0.0,0.0,0.0,0.0,0.0,0.075989,0.0
2,4897,5,0.071372,0.0,0.0,0.0,0.0,0.0,0.0,0.14146,...,0.312906,0.0,0.0,0.0,0.0,0.0,0.0,0.452655,0.0,0.0


In [312]:
# Умножим полученные вероятности тем на idf 
for i in doc_dict.keys():
    tempidf = topic_matrix.loc[topic_matrix['doc_id']==i, 'idf']
    doc_dict[i] *= float(tempidf)

In [313]:
def get_user_embedding_max_idf(user_articles_list):
    """Return a user's topic vector as the per-topic maximum over the articles they read.

    Reads the vectors from the module-level ``doc_dict`` as it is at call time.

    Parameters
    ----------
    user_articles_list : str
        String form of a list of doc ids, e.g. ``"[293672, 293328]"``.

    Returns
    -------
    numpy.ndarray
        Element-wise maximum of the ``doc_dict`` vectors of the listed articles.

    Raises
    ------
    ValueError, SyntaxError
        If the string is not a plain Python literal.
    KeyError
        If a doc id is missing from ``doc_dict``.
    """
    import ast  # local import keeps the cell self-contained

    # literal_eval only parses literals; eval() would execute whatever code the CSV contains.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.max(article_vectors, 0)

In [314]:
# Получим вектор пользователя (уже с учетом idf)
user_embeddings_max_idf = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_max_idf(x),1)])
user_embeddings_max_idf.columns = ['topic_{}'.format(i) for i in range(20)]
user_embeddings_max_idf['uid'] = users['uid'].values
user_embeddings_max_idf = user_embeddings_max_idf[['uid']+['topic_{}'.format(i) for i in range(20)]]

In [315]:
# Attach the churn labels to the weighted embeddings with a left join.
X_max_idf = pd.merge(user_embeddings_max_idf, target, 'left')

In [316]:
# Same split seed and scoring as the earlier runs, on the weighted max embeddings.
X_train, X_test, y_train, y_test = train_test_split(X_max_idf[['topic_{}'.format(i) for i in range(20)]], 
                                                    X_max_idf['churn'], random_state=26)
# NOTE(review): max_iter is raised to 1000 only for this model, presumably because the
# scaled features need more iterations to converge — confirm.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
preds = logreg.predict_proba(X_test)[:,1]

In [317]:
# Precision/recall along the curve, then the point with the highest F-score (weighted max embeddings).
precision_max_idf, recall_max_idf, thresholds_max_idf = precision_recall_curve(y_test, preds)
fscore_max_idf = (2 * precision_max_idf * recall_max_idf) / (precision_max_idf + recall_max_idf)
# Points where precision + recall == 0 give NaN; nan_to_num turns them into 0.
fscore_max_idf = np.nan_to_num(fscore_max_idf)
ix = np.argmax(fscore_max_idf)
best_fscore_max_idf = round(fscore_max_idf[ix],3)
best_precision_max_idf =  round(precision_max_idf[ix],3)
best_recall_max_idf =  round(recall_max_idf[ix],3)
rocauc_max_idf =  round(roc_auc_score(y_test, preds),3)

In [318]:
# Вносим полученные метрики в таблицу
# Add the metrics of the weighted-max variant to the summary table.
table['Max_idf'] = [best_fscore_max_idf,best_precision_max_idf,best_recall_max_idf,rocauc_max_idf]

In [319]:
# Show the summary table again, now including the Max_idf column.
pd.pivot_table(pd.DataFrame(table),
               index=["Metrics"])

Unnamed: 0_level_0,Max,Max_idf,Mean,Median
Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F-Score,0.758,0.817,0.56,0.629
Precision,0.745,0.846,0.425,0.571
Recall,0.771,0.789,0.821,0.7
Roc Auc,0.966,0.983,0.909,0.943


**Учет уникальности статьи заметно увеличил значения метрик, это самые лучшие результаты**<br>
*Возможная причина - отсеяли "мусор", очень популярные статьи, которые в любом случае читают многие, т.е. они не влияют на отписку*