### Импорты

In [69]:
import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import precision_score, classification_report
from sklearn.metrics import precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt

%matplotlib inline

### Открытие датасета

In [4]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [7]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [36]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


### Подготовка

In [21]:
# import nltk
# nltk.download('stopwords')

In [22]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [24]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [25]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


CPU times: user 12.9 s, sys: 336 ms, total: 13.2 s
Wall time: 13.3 s


In [26]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 1min 12s, sys: 823 ms, total: 1min 13s
Wall time: 1min 51s


In [27]:
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [28]:
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

In [29]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [30]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.066596,0.0,0.0,0.0,0.0,0.0,...,0.797097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128634,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.093197,0.062551,0.02466,0.398822,0.0,0.114165,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165319,0.0,0.0
3,4898,0.425515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.10069,0.0,0.0,0.0,0.0,0.0,0.0,0.28818,0.0,0.017398
4,4899,0.0,0.0,0.0,0.171076,0.0,0.0,0.0,0.0,0.0,...,0.0,0.315893,0.0,0.0,0.0,0.0,0.087725,0.0,0.401949,0.0


In [174]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))
doc_dict[33]

array([0.        , 0.39799124, 0.        , 0.        , 0.        ,
       0.03602912, 0.        , 0.        , 0.05673615, 0.        ,
       0.11880378, 0.15736015, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.10814004, 0.        , 0.        , 0.11449413, 0.        ])

### Задания 2-5

In [172]:
def get_user_embedding(user_articles_list, func='mean', idf=False):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    if func == 'median':
        user_vector = np.median(user_vector, 0)
    elif func == 'max':
        user_vector = np.max(user_vector, 0)
    else:
        user_vector = np.mean(user_vector, 0)
    
    if idf:
        # Вот тут я уже ничего не понимаю
        user_vector = np.log((np.array([i for i in range(user_vector.shape[0])]) + 1) / (user_vector + 1)) + 1
    return user_vector

In [82]:
user_articles_list = users['articles'].iloc[33]
get_user_embedding(user_articles_list)

array([0.17296669, 0.45980501, 0.        , 0.13916734, 0.03140742,
       0.13879293, 0.01513906, 0.        , 0.27159163, 0.02232163,
       0.32300961, 0.05382033, 0.        , 0.04332254, 0.        ,
       0.11115185, 0.        , 0.34048802, 0.        , 0.        ,
       0.        , 0.        , 0.08279449, 0.54246598, 0.        ])

In [178]:
def fit_predict(agg_func, idf=False):
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, agg_func, idf), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    X = pd.merge(user_embeddings, target, 'left')
    X_train, X_test, y_train, y_test = train_test_split(X.drop(['churn', 'uid'], axis=1),
                                                        X['churn'], random_state=0)
    
    logreg = LogisticRegression(solver='liblinear')
    logreg.fit(X_train, y_train)
    
    y_pred_test = logreg.predict_proba(X_test)[:, 1]
    
    metrics = pd.DataFrame()
    metrics['Function'] = pd.Series([agg_func])
    metrics['Weighted'] = pd.Series(['Yes' if idf else 'No'])
    
    metrics['ROC-AUC'] = pd.Series([roc_auc_score(y_test, y_pred_test)])
    
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_test)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    
    metrics['F-score'] = pd.Series([fscore[ix]])
    metrics['Precision'] = pd.Series([precision[ix]])
    metrics['Recall'] = pd.Series([recall[ix]])
    return metrics

In [123]:
mean_metrics = fit_predict('mean')
mean_metrics

Unnamed: 0,Function,ROC-AUC,F-score,Precision,Recall
0,mean,0.946639,0.684887,0.564987,0.869388


In [179]:
metrics_table = pd.concat([fit_predict('mean'), fit_predict('median'), fit_predict('max'), fit_predict('mean', idf=True)], ignore_index=True)
metrics_table

Unnamed: 0,Function,Weighted,ROC-AUC,F-score,Precision,Recall
0,mean,No,0.946569,0.679612,0.563003,0.857143
1,median,No,0.9532,0.704797,0.643098,0.779592
2,max,No,0.965414,0.761062,0.830918,0.702041
3,mean,Yes,0.950034,0.691558,0.574124,0.869388


### Задание 6

Я считаю, что __max__ оказался эффективней, потому что... честно говоря, у меня нет предположений на этот счет.

__TODO:__ Посмотреть еще раз тот момент в вебинаре, где преподаватель рассказывает про topic_matrix, чтобы лучше разобраться, как работает эмбединг.