In [1]:
base_path = 'nlp_files'

## Text Preprocessing

In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tauru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from ast import literal_eval
import pandas as pd
import numpy as np
import re

In [4]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)  # str to list
    return data

In [5]:
train = read_data(base_path+'/train.tsv')
validation = read_data(base_path+'/validation.tsv')
test = pd.read_csv(base_path+'/test.tsv', sep='\t')

In [6]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [7]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

#### Task 1 (TextPrepare).
#### Implement the function text_prepare following the instructions.text_prepare

In [8]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    # lowercase text
    text = text.lower()
    # replace REPLACE_BY_SPACE_RE symbols by space in text 
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text)
    # delete stopwords from 
    text = ' '.join(s for s in text.split() if s not in STOPWORDS)
    return text

In [9]:
def test_text_prepare():
    examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
                "How to free c++ memory vector<int> * arr?"]
    answers = ["sql server equivalent excels choose function", 
               "free c++ memory vectorint arr"]
    for ex, ans in zip(examples, answers):
        if text_prepare(ex) != ans:
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [10]:
print(test_text_prepare())

Basic tests are passed.


#### Task 2 (WordsTagsCount)
#### Find 3 most popular tags and 3 most popular words in the train data 

In [11]:
# Dictionary of all tags from train corpus with their counts.
tags_counts = {}
# Dictionary of all words from train corpus with their counts.
words_counts = {}

#words_counts are dictionaries like {'some_word_or_tag': frequency}. 

for x in X_train:
    x = text_prepare(x)
    x_list = x.split()
    for word in x_list:
        if word not in words_counts:
            words_counts[word] = 0
        else:
            words_counts[word] += 1
for y in y_train:
    for tag in y:
        if tag not in tags_counts:
            tags_counts[tag] = 0
        else:
            tags_counts[tag] += 1


In [12]:
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_tags)
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_words)

[('javascript', 19077), ('c#', 19076), ('java', 18660)]
[('using', 8277), ('php', 5613), ('java', 5500)]


## Transforming text to a vector
#### Machine Learning algorithms work with numeric data and we cannot use the provided text data "as is". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.

### Bag of words
#### One of the well-known approaches is a bag-of-words representation. To create this transformation, follow the steps:



1.   Find N most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.
2.  For each title in the corpora create a zero vector with the dimension equals to N.
3.  For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.



In [13]:
# Implement the described encoding in the function my_bag_of_words with the size of the dictionary equals to 5000. 
DICT_SIZE = 5000
WORDS_TO_INDEX = {word: index for word, index in zip([key for key, value in sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]], [i for i in range(DICT_SIZE)])}
INDEX_TO_WORDS = {index: word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for x in text.split(' '):
        if x in words_to_index.keys():
            result_vector[words_to_index[x]] += 1
    return result_vector

In [14]:
def test_my_bag_of_words():
    words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}
    examples = ['hi how are you']
    answers = [[1, 1, 0, 1]]
    for ex, ans in zip(examples, answers):
        if (my_bag_of_words(ex, words_to_index, 4) != ans).any():
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [15]:
print(test_my_bag_of_words())

Basic tests are passed.


In [16]:
from scipy import sparse as sp_sparse

In [17]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


### Task 3 (BagOfWords). 
#### For the 11th row in X_train_mybag find how many non-zero elements it has.

In [18]:
num = 0
for i in X_train_mybag.toarray()[10]:
    if i != 0:
        num += 1
print(num)

6


### TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(stop_words = STOPWORDS, max_features = 5000, token_pattern = r"([a-z0-9#+_]{1,})", min_df = 0.00007 )   
    
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [21]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:

In [22]:
if 'c++' in tfidf_vocab.keys():
    print('c++')
if 'c#' in tfidf_vocab.keys():
    print('c#')

c++
c#


If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\S+)' regexp as a token_pattern in the constructor of the vectorizer.

## MultiLabel classifier


*   compare the quality of the bag-of-words and TF-IDF approaches 




In [23]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

y_train_matrix = MultiLabelBinarizer().fit_transform(y_train)
y_val_matrix = MultiLabelBinarizer().fit_transform(y_val)

classifier_mybag = OneVsRestClassifier(LogisticRegression(penalty = 'l2', C = 4.0, max_iter=10000)).fit(X_train_mybag, y_train_matrix)
classifier_tfidf = OneVsRestClassifier(LogisticRegression(penalty = 'l2', C = 4.0, max_iter=10000)).fit(X_train_tfidf, y_train_matrix)

predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)
predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

#### Evaluation


*   accuracy
*   F1-score macro/micro
* Precision macro/micro



In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))

print('LogisticRegression:\n')
print('Bag-of-words\n')
print_evaluation_scores(y_val_matrix, predicted_labels_mybag)
print('\nTfidf\n')
print_evaluation_scores(y_val_matrix, predicted_labels_tfidf)

LogisticRegression:

Bag-of-words

Accuracy:  3222
F1-score macro:  0.21602547872639005
F1-score micro:  0.3136007181897394
Precision macro:  0.10720629514044663
Precision micro:  0.15542153307810525

Tfidf

Accuracy:  11164
F1-score macro:  0.5028549895745037
F1-score micro:  0.6802764662733876
Precision macro:  0.3432655061405315
Precision micro:  0.4924369337510154


## Word2Vec

#### ex: mean(word embeddings) --> MLP
#### ex: word embeddings --> LSTM

In [25]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.models import Sequential
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_val)
word_index = tokenizer.word_index
data_train = pad_sequences(sequences_train)
data_val = pad_sequences(sequences_val)
MLP_val = np.c_[data_val, np.array([0 for i in range(data_val.shape[0])]).T]
EMBEDDING_DIM = 500

sentence_train = [[word for word in text_prepare(words).split()] for words in X_train]
sentence_val = [[word for word in text_prepare(words).split()] for words in X_val]
w2v_model = Word2Vec(sentence_train, min_count = 1, size = EMBEDDING_DIM, workers = 3, window = 5)
embedding_layer = Embedding(w2v_model.wv.vectors.shape[0], w2v_model.wv.vectors.shape[1], weights=[w2v_model.wv.vectors])

Using TensorFlow backend.


In [26]:
#LSTM
model_LSTM = Sequential()
model_LSTM.add(embedding_layer)
model_LSTM.add(LSTM(w2v_model.wv.vectors.shape[1]))
model_LSTM.add(Dense(y_train_matrix.shape[1], activation = 'softmax'))  
model_LSTM.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 500)         15748500  
_________________________________________________________________
lstm_1 (LSTM)                (None, 500)               2002000   
_________________________________________________________________
dense_1 (Dense)              (None, 100)               50100     
Total params: 17,800,600
Trainable params: 17,800,600
Non-trainable params: 0
_________________________________________________________________


In [27]:
model_LSTM.compile(loss = 'categorical_crossentropy', optimizer = 'sgd', metrics = ['acc'])
model_LSTM.fit(data_train, y_train_matrix, epochs = 1, batch_size = 256)
loss_LSTM, acc_LSTM = model_LSTM.evaluate(data_val, y_val_matrix)
print('accuracy =', acc_LSTM)


Epoch 1/1
accuracy = 0.1152999997138977


In [28]:
#MLP
model_MLP = Sequential()
model_MLP.add(Dense(512, activation='relu'))
model_MLP.add(Dense(y_train_matrix.shape[1], activation = 'softmax'))

In [29]:
model_MLP.compile(loss = 'categorical_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])
model_MLP.fit(data_train, y_train_matrix, epochs = 5, batch_size = 256)
loss_MLP, acc_MLP = model_MLP.evaluate(MLP_val, y_val_matrix)
print('accuracy =', acc_MLP)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
accuracy = 0.09399999678134918
