### Домашнее задание

1. Самостоятельно разобраться с тем, что такое tfidf (документация https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html и еще - https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)

In [1]:
%matplotlib inline

import ast
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

from razdel import tokenize

import pymorphy2

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

from ipykernel import kernelapp as app

In [2]:
# nltk.download()

In [3]:
# Load the article table and the table of per-user article-id lists.
news = pd.read_csv('articles.csv')
users = pd.read_csv('users_articles.csv')

In [4]:
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [5]:
print(users.shape)
users.head(3)

(8000, 2)


Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [6]:
stopwords_ru = stopwords.words('russian')

In [7]:
# Extend the NLTK stop-word list with extra words stored one per line.
# NOTE(review): the file is opened with the platform default encoding —
# confirm the encoding of stopwords.txt and pass it explicitly if needed.
with open('stopwords.txt') as f_sw:
    # Strip BEFORE filtering: raw lines keep their trailing newline, so a bare
    # truthiness check never drops blank lines and would add '' as a stop word.
    additional_stopwords = [w.strip() for w in f_sw if w.strip()]

stopwords_ru += additional_stopwords

In [8]:
morph = pymorphy2.MorphAnalyzer()

In [9]:
def clean_text(text):
    """
    Normalise raw text before tokenisation.

    The input is coerced to ``str``, lower-cased, stripped of digits and
    punctuation, and every run of whitespace is collapsed to a single space.

    Args:
        text: the value to clean; non-string values are converted with ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')

    # Drop Windows line breaks.
    # NOTE(review): `\|` matches a literal pipe, so the first alternative only
    # fires on "-<ws>CRLF|-<ws>CRLF"; it looks like an unescaped `|` was
    # intended (re-joining words hyphenated across lines) — confirm.
    text = re.sub(r'-\s\r\n\|-\s\r\n|\r\n', '', text)
    # Remove digits and punctuation (single character class, brackets escaped).
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)
    # Replace real newlines and the literal two-character sequences "\s" / "\n"
    # (backslash followed by a letter) with a space.
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # Soft hyphens become a space and whitespace runs collapse to one space.
    # The previous pattern `[\s+]` was a character class and replaced each
    # whitespace character separately, so runs were never collapsed.
    text = re.sub(r'\xad|\s+', ' ', text.strip())

    return text

In [10]:
# Memo of token -> lemma shared across calls, so each distinct word form is
# analysed by the morphological parser only once.
cache = {}

def lemmatization(text):
    """
    Tokenise text, lemmatise every token and drop stop words.

    Tokens come from ``tokenize``; a leading '-' is removed from a token and
    tokens of length <= 1 are skipped. Lemmas are the ``normal_form`` of the
    first parse returned by ``morph.parse`` and are memoised in ``cache``.

    Args:
        text: the text to process; non-string values are converted with ``str``.

    Returns:
        List of lemmas, in token order, that are not in ``stopwords_ru``.
    """
    if not isinstance(text, str):
        text = str(text)

    # Set membership is O(1); testing every lemma against the stop-word list
    # scanned the whole list for each token.
    stop_set = set(stopwords_ru)

    words_lem = []
    for token in tokenize(text):
        w = token.text
        if w.startswith('-'):
            w = w[1:]

        if len(w) > 1:
            lemma = cache.get(w)
            if lemma is None:
                lemma = cache[w] = morph.parse(w)[0].normal_form
            words_lem.append(lemma)

    return [lemma for lemma in words_lem if lemma not in stop_set]

In [11]:
%%time
# Clean every article text in place. The function is passed directly: the
# former trailing positional `1` was not an axis (Series.apply has none) but
# bound to the second positional parameter of Series.apply.
news['title'] = news['title'].apply(clean_text)

  


Wall time: 32.9 s


In [12]:
%%time
# Replace every cleaned text with its list of lemmas. The function is passed
# directly: the former trailing positional `1` was not an axis (Series.apply
# has none) but bound to the second positional parameter of Series.apply.
news['title'] = news['title'].apply(lemmatization)

Wall time: 4min 23s


In [13]:
# Token lists for all articles, the gensim dictionary built from them, and the
# bag-of-words representation of every article.
texts = list(news['title'].values)

common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [14]:
%%time
# Fit a 25-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so the learned topics presumably
# differ between runs — confirm whether reproducibility is required.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)

Wall time: 1min 6s


In [15]:
# Persist the trained model and load it back from the same path.
# NOTE(review): datapath is imported from gensim.test.utils; presumably it
# resolves inside gensim's own test-data folder — confirm this is the intended
# location for the saved model.
temp_file = datapath('model.lda')
lda.save(temp_file)

lda = LdaModel.load(temp_file)

In [16]:
# Take the first three articles, convert them to bag-of-words and keep the
# third one as a sample document for topic inference.
other_texts = list(news['title'].iloc[:3])
other_corpus = list(map(common_dictionary.doc2bow, other_texts))

unseen_doc = other_corpus[2]

In [17]:
lda[unseen_doc]

[(0, 0.23747388), (10, 0.06304575), (20, 0.080527), (22, 0.6000956)]

In [18]:
# For each of the 25 topics keep its id and its 7 top words (weights dropped).
x = lda.show_topics(num_topics=25, num_words=7, formatted=False)
topics_words = [
    (topic_id, [word for word, _weight in word_weights])
    for topic_id, word_weights in x
]

In [19]:
for topic, words in topics_words:
    print(f'topic_{topic}: ' + ' '.join(words))

topic_0: восток фестиваль активность доклад германия северный рейс
topic_1: поверхность теория фирма эволюция вицепремьер установка ремонт
topic_2: проект строительство квартира планироваться область московский год
topic_3: год рост цена это ставка век тыс
topic_4: nn день час это вода nnn экипаж
topic_5: наука памятник фильм канал испытание исторический свердловский
topic_6: год это который россия страна сша новый
topic_7: это год который свой мочь человек говорить
topic_8: год продукция дональд соглашение сократиться млн сделка
topic_9: статья рак автор приложение стенка прага смит
topic_10: научный наука женщина игра искусство открытие хороший
topic_11: военный американский сша год великобритания восточный боевой
topic_12: снижение смерть земля звезда жизнь планета женщина
topic_13: ракета обнаружить который участок территория аэропорт год
topic_14: украина это рф российский который земля россия
topic_15: фонд суд взрыв год статья россия nn
topic_16: рубль год млрд станция россия ты

In [20]:
def get_lda_vector(text, num_topics=25):
    """
    Return the dense topic vector of a tokenised document.

    The document is converted to bag-of-words with ``common_dictionary`` and
    passed through ``lda``; topics that the model does not report for the
    document are left at 0.

    Args:
        text: list of tokens (lemmas) of one document.
        num_topics: length of the output vector; topic ids outside
            ``range(num_topics)`` are ignored. Defaults to 25.

    Returns:
        ``numpy.ndarray`` of shape ``(num_topics,)`` and dtype float64.
    """
    bow = common_dictionary.doc2bow(text)

    # Pre-filled zeros give a consistent float dtype regardless of how many
    # topics the model reports for this document.
    output_vector = np.zeros(num_topics)
    for topic_id, weight in lda[bow]:
        if 0 <= topic_id < num_topics:
            output_vector[topic_id] = weight
    return output_vector

In [21]:
# One row per article: doc_id first, followed by the 25 topic weights.
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(25)]
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.029452,0.0,0.11887,0.018273,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.422398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.471117,0.0,0.0,0.0,0.0,0.0,0.084472,0.0,0.0,0.0
2,4897,0.237507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.080547,0.0,0.600109,0.0,0.0
3,4898,0.133827,0.0,0.0,0.0,0.0,0.0,0.0,0.11474,0.0,...,0.0,0.0,0.0,0.071505,0.0,0.0,0.059127,0.382995,0.0,0.228787
4,4899,0.081824,0.0,0.082914,0.0,0.0,0.0,0.21384,0.0,0.061473,...,0.045453,0.0,0.0,0.0,0.0,0.0,0.204058,0.0,0.0,0.0


In [22]:
# Lookup table doc_id -> array of the 25 topic weights of that article,
# used when aggregating the articles of a user.
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [23]:
def get_user_embedding(user_articles_list):
    """
    Build a user embedding as the element-wise mean of article topic vectors.

    Args:
        user_articles_list: the user's article ids, either as a string holding
            a Python list literal (as read from the CSV) or as an
            already-parsed sequence of ids.

    Returns:
        ``numpy.ndarray`` with the per-topic mean over the user's articles.

    Raises:
        KeyError: if an article id is missing from ``doc_dict``.
        ValueError, SyntaxError: if the string is not a valid Python literal.
    """
    # literal_eval only parses literals; eval() would execute arbitrary code
    # that happens to be stored in the data file.
    if isinstance(user_articles_list, str):
        user_articles_list = ast.literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    return np.mean(user_vector, 0)

In [24]:
# One row per user: uid first, followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former trailing positional `1`
# was not an axis (Series.apply has none) but bound to the second positional
# parameter of Series.apply.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.080118,0.0,0.073572,0.03208,0.070169,0.03153,0.104424,0.204336,0.0,...,0.023761,0.025899,0.004683,0.006864,0.11759,0.05253,0.011774,0.034094,0.0,0.042064
1,u108690,0.012329,0.0,0.005613,0.013965,0.029276,0.005342,0.204973,0.247929,0.002408,...,0.051743,0.018055,0.023851,0.009545,0.051041,0.013104,0.037525,0.0478,0.0,0.080044
2,u108339,0.0,0.0,0.041086,0.003987,0.094729,0.005857,0.168417,0.036021,0.00323,...,0.056697,0.064029,0.029612,0.017537,0.054931,0.045105,0.0396,0.056878,0.017871,0.09212


In [25]:
# Load the churn label (uid, churn) for every user.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [26]:
# Attach the churn label to each user embedding (left join on the shared
# columns), keeping every row of user_embeddings.
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.080118,0.0,0.073572,0.03208,0.070169,0.03153,0.104424,0.204336,0.0,...,0.025899,0.004683,0.006864,0.11759,0.05253,0.011774,0.034094,0.0,0.042064,0
1,u108690,0.012329,0.0,0.005613,0.013965,0.029276,0.005342,0.204973,0.247929,0.002408,...,0.018055,0.023851,0.009545,0.051041,0.013104,0.037525,0.0478,0.0,0.080044,1
2,u108339,0.0,0.0,0.041086,0.003987,0.094729,0.005857,0.168417,0.036021,0.00323,...,0.064029,0.029612,0.017537,0.054931,0.045105,0.0396,0.056878,0.017871,0.09212,1


In [27]:
# Split the 25 topic features and the churn label into train and test parts;
# random_state=0 keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [28]:
# Logistic regression with default hyper-parameters as the churn classifier.
logreg = LogisticRegression()
# fit the model
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
preds = logreg.predict_proba(X_test)[:, 1]

In [30]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score per point of the curve. Where precision + recall == 0 the score is
# defined as 0 instead of 0/0 = NaN, which np.argmax would otherwise select.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score; precision/recall carry one extra
# trailing point that has no threshold, so search only indices valid for
# `thresholds`
ix = np.argmax(fscore[:len(thresholds)])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.236349, F-Score=0.741, Precision=0.655, Recall=0.853


In [31]:
roc_auc = roc_auc_score(y_test, preds)

In [32]:
# Accumulates one (roc_auc, f-score, precision, recall) tuple per embedding
# aggregation method; this first entry belongs to the mean-based embedding.
metrics = []
metrics.append((roc_auc, fscore[ix], precision[ix], recall[ix]))

### 2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

In [33]:
def get_user_embedding(user_articles_list):
    """
    Build a user embedding as the element-wise median of article topic vectors.

    Args:
        user_articles_list: the user's article ids, either as a string holding
            a Python list literal (as read from the CSV) or as an
            already-parsed sequence of ids.

    Returns:
        ``numpy.ndarray`` with the per-topic median over the user's articles.

    Raises:
        KeyError: if an article id is missing from ``doc_dict``.
        ValueError, SyntaxError: if the string is not a valid Python literal.
    """
    # literal_eval only parses literals; eval() would execute arbitrary code
    # that happens to be stored in the data file.
    if isinstance(user_articles_list, str):
        user_articles_list = ast.literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    return np.median(user_vector, 0)

In [34]:
# One row per user: uid first, followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former trailing positional `1`
# was not an axis (Series.apply has none) but bound to the second positional
# parameter of Series.apply.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.0,0.0,0.058506,0.02368,0.079041,0.168658,0.0,...,0.0,0.0,0.0,0.0,0.033468,0.0,0.0,0.022508,0.0,0.0
1,u108690,0.0,0.0,0.0,0.008158,0.011573,0.0,0.174486,0.237005,0.0,...,0.02454,0.0,0.0,0.0,0.046701,0.0,0.02469,0.035845,0.0,0.041038
2,u108339,0.0,0.0,0.005779,0.0,0.034392,0.0,0.149267,0.01683,0.0,...,0.057897,0.059519,0.032306,0.012566,0.047687,0.0553,0.028781,0.053856,0.0,0.073995


In [35]:
# Load the churn label (uid, churn) for every user.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [36]:
# Attach the churn label to each user embedding (left join on the shared
# columns), keeping every row of user_embeddings.
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.0,0.0,0.0,0.0,0.058506,0.02368,0.079041,0.168658,0.0,...,0.0,0.0,0.0,0.033468,0.0,0.0,0.022508,0.0,0.0,0
1,u108690,0.0,0.0,0.0,0.008158,0.011573,0.0,0.174486,0.237005,0.0,...,0.0,0.0,0.0,0.046701,0.0,0.02469,0.035845,0.0,0.041038,1
2,u108339,0.0,0.0,0.005779,0.0,0.034392,0.0,0.149267,0.01683,0.0,...,0.059519,0.032306,0.012566,0.047687,0.0553,0.028781,0.053856,0.0,0.073995,1


In [37]:
# Split the 25 topic features and the churn label into train and test parts;
# random_state=0 keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [38]:
# Logistic regression with default hyper-parameters as the churn classifier.
logreg = LogisticRegression()
# fit the model
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
preds = logreg.predict_proba(X_test)[:, 1]

In [40]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score per point of the curve. Where precision + recall == 0 the score is
# defined as 0 instead of 0/0 = NaN, which np.argmax would otherwise select.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score; precision/recall carry one extra
# trailing point that has no threshold, so search only indices valid for
# `thresholds`
ix = np.argmax(fscore[:len(thresholds)])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.306329, F-Score=0.785, Precision=0.812, Recall=0.759


In [41]:
roc_auc = roc_auc_score(y_test, preds)

In [42]:
metrics.append((roc_auc, fscore[ix], precision[ix], recall[ix]))

### 3. Повторить п.2, но используя уже не медиану, а max

In [43]:
def get_user_embedding(user_articles_list):
    """
    Build a user embedding as the element-wise maximum of article topic vectors.

    Args:
        user_articles_list: the user's article ids, either as a string holding
            a Python list literal (as read from the CSV) or as an
            already-parsed sequence of ids.

    Returns:
        ``numpy.ndarray`` with the per-topic maximum over the user's articles.

    Raises:
        KeyError: if an article id is missing from ``doc_dict``.
        ValueError, SyntaxError: if the string is not a valid Python literal.
    """
    # literal_eval only parses literals; eval() would execute arbitrary code
    # that happens to be stored in the data file.
    if isinstance(user_articles_list, str):
        user_articles_list = ast.literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    return np.max(user_vector, 0)

In [44]:
# One row per user: uid first, followed by the 25 aggregated topic weights.
# get_user_embedding is passed directly; the former trailing positional `1`
# was not an axis (Series.apply has none) but bound to the second positional
# parameter of Series.apply.
user_embeddings = pd.DataFrame(list(users['articles'].apply(get_user_embedding)))
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.480709,0.0,0.247295,0.119483,0.181759,0.079717,0.255364,0.533465,0.0,...,0.080575,0.084263,0.028099,0.041181,0.387437,0.274785,0.050942,0.099895,0.0,0.191294
1,u108690,0.073972,0.0,0.021875,0.033814,0.100396,0.032051,0.495772,0.397161,0.014448,...,0.13974,0.059542,0.143108,0.031682,0.130529,0.078623,0.102312,0.127716,0.0,0.315795
2,u108339,0.0,0.0,0.1564,0.02392,0.331827,0.035142,0.376975,0.142101,0.019377,...,0.11306,0.148765,0.070566,0.052314,0.145951,0.070442,0.146748,0.113737,0.094371,0.2211


In [45]:
# Load the churn label (uid, churn) for every user.
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [46]:
# Attach the churn label to each user embedding (left join on the shared
# columns), keeping every row of user_embeddings.
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.480709,0.0,0.247295,0.119483,0.181759,0.079717,0.255364,0.533465,0.0,...,0.084263,0.028099,0.041181,0.387437,0.274785,0.050942,0.099895,0.0,0.191294,0
1,u108690,0.073972,0.0,0.021875,0.033814,0.100396,0.032051,0.495772,0.397161,0.014448,...,0.059542,0.143108,0.031682,0.130529,0.078623,0.102312,0.127716,0.0,0.315795,1
2,u108339,0.0,0.0,0.1564,0.02392,0.331827,0.035142,0.376975,0.142101,0.019377,...,0.148765,0.070566,0.052314,0.145951,0.070442,0.146748,0.113737,0.094371,0.2211,1


In [47]:
# Split the 25 topic features and the churn label into train and test parts;
# random_state=0 keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [48]:
# Logistic regression with default hyper-parameters as the churn classifier.
logreg = LogisticRegression()
# fit the model
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
preds = logreg.predict_proba(X_test)[:, 1]

In [50]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F-score per point of the curve. Where precision + recall == 0 the score is
# defined as 0 instead of 0/0 = NaN, which np.argmax would otherwise select.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score; precision/recall carry one extra
# trailing point that has no threshold, so search only indices valid for
# `thresholds`
ix = np.argmax(fscore[:len(thresholds)])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.382366, F-Score=0.819, Precision=0.844, Recall=0.796


In [51]:
roc_auc = roc_auc_score(y_test, preds)

In [52]:
metrics.append((roc_auc, fscore[ix], precision[ix], recall[ix]))

4. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.

5. Сформировать на выходе единую таблицу, сравнивающую качество 4 разных методов получения эмбеддингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [53]:
# Comparison table: one row per aggregation method, in the order the tuples
# were appended to `metrics`; columns follow the order of each tuple.
df_metrics = pd.DataFrame(metrics, columns=['roc_auc_score', 'f-score', 'precision', 'recall'], index=['mean', 'median', 'max'])

In [54]:
df_metrics

Unnamed: 0,roc_auc_score,f-score,precision,recall
mean,0.965191,0.741135,0.655172,0.853061
median,0.976836,0.78481,0.812227,0.759184
max,0.979103,0.819328,0.844156,0.795918


### 6. Сделать самостоятельные выводы и предположения о том, почему тот или иной способ оказался эффективнее остальных