In [None]:
#importing libraries
import pandas as pd
import numpy as np
import nltk, re
nltk.download('stopwords') # load english stopwords
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")
warnings.warn("deprecated", DeprecationWarning)
warnings.simplefilter("ignore")

In [2]:
#dataset is available at https://www.kaggle.com/badalgupta/stack-overflow-tag-prediction
dataset = pd.read_csv('/Users/Vikram/Desktop/train.csv')
print(dataset.shape)

# 70-30% random split of dataset
X_train, X_test, y_train, y_test = train_test_split(dataset['title'].values, dataset['tags'].values, test_size=0.3, random_state=42)
dataset.head()

(100000, 2)


Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"
2,How to terminate windows phone 8.1 app,['c#']
3,get current time in a specific country via jquery,"['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,['java']


In [3]:
#We preprocess the data using this function that removes weird tokens that may have formed if we use the dataset as is
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = list((stopwords.words('english')))

def text_prepare(text,join_sumbol):
    """
        text: a string
        
        return: modified initial string
    """
    # lowercase text
    text = text.lower() 

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub(REPLACE_BY_SPACE_RE," ",text,)

    # delete symbols which are in BAD_SYMBOLS_RE from text
    text = re.sub(BAD_SYMBOLS_RE,"",text)
    text = re.sub(r'\s+'," ",text)

    # delete stopwords from text
    text = f'{join_sumbol}'.join([i for i in text.split() if i not in STOPWORDS])
    
    return text

tests = ["SQL Server - any equivalent of Excel's CHOOSE function?",
        "How to free c++ memory vector<int> * arr?"]
for test in tests: print(text_prepare(test,' '))

sql server equivalent excels choose function
free c++ memory vectorint arr


In [4]:
#we preprocess the data now
X_train = [text_prepare(x,' ') for x in X_train]
X_test = [text_prepare(x,' ') for x in X_test]
y_train = [text_prepare(x,',') for x in y_train]
y_test = [text_prepare(x,',') for x in y_test]

In [29]:
X_train[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [28]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
#we find the most popular tags and words
from collections import Counter
from itertools import chain

# Dictionary of all tags from train corpus with their counts.
tags_counts = Counter(chain.from_iterable([i.split(",") for i in y_train]))

# Dictionary of all words from train corpus with their counts.
words_counts = Counter(chain.from_iterable([i.split(" ") for i in X_train]))

top_10_most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:10]
top_10_most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]

print(f"Top three most popular tags are: {','.join(tag for tag, _ in top_10_most_common_tags)}")
print(f"Top three most popular words are: {','.join(tag for tag, _ in top_10_most_common_words)}")

Top three most popular tags are: javascript,c#,java,php,python,jquery,c++,html,objectivec,aspnet
Top three most popular words are: using,php,java,file,javascript,get,error,c#,python,string


In [8]:
#we are using the bag-of-words representation here
#we enumerate n most popular words
# We considered only the top 5,000 words
DICT_SIZE = 5000
WORDS_TO_INDEX = {j[0]:i for i,j in enumerate(sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE])}
INDEX_TO_WORDS = {i:j[0] for i,j in enumerate(sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE])}
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    keys= [words_to_index[i] for i in text.split(" ") if i in words_to_index.keys()]
    result_vector[keys]=1
    return result_vector

In [9]:
#now we implement the above function
from scipy import sparse as sp_sparse
#scipy is used to create csr matrix which is allowed for sklearn
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (70000, 5000)
X_test shape  (30000, 5000)


In [10]:
#we remove the outliers in our dataset using term frequency-inverse document frequency
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test):
    """
        X_train, X_val, X_test — samples        
        return bag-of-words representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(X_train,ngram_range=(1,2),max_df=0.9,min_df=5,token_pattern=r'(\S+)' )
    tfidf_vectorizer.fit(X_train)
    X_train = tfidf_vectorizer.transform(X_train)
    X_test = tfidf_vectorizer.transform(X_test)
    
    return X_train, X_test, tfidf_vectorizer.vocabulary_

X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [11]:
#we check if the snippet is from c# or c++ since the two have different terms 
print("c#" in set(tfidf_reversed_vocab.values()))
print("c++" in set(tfidf_reversed_vocab.values()))

True
True


In [27]:
y_train[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
#we see that each example has multiple tags
#therefore we convert it to binary
#first transform to dictionary
y_train = [set(i.split(',')) for i in y_train]
y_test = [set(i.split(',')) for i in y_test]

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.fit_transform(y_test)

In [15]:
# For multiclass classification
from sklearn.multiclass import OneVsRestClassifier

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier

def train_classifier(X_train, y_train, X_valid=None, y_valid=None, C=1.0, model='lr'):
    """
      X_train, y_train — training data
      
      return: trained classifier
      
    """
    
    if model=='lr':
        model = LogisticRegression(C=C, penalty='l1', dual=False, solver='liblinear')
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
    
    elif model=='svm':
        model = LinearSVC(C=C, penalty='l1', dual=False, loss='squared_hinge')
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
    
    elif model=='nbayes':
        model = MultinomialNB(alpha=1.0)
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
        
    elif model=='lda':
        model = LinearDiscriminantAnalysis(solver='svd')
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)

    return model

# Train the classifiers for different data transformations: bag-of-words and tf-idf.

# Linear NLP model using bag of words approach
%time classifier_mybag = train_classifier(X_train_mybag, y_train, C=1.0, model='lr')

# Linear NLP model using TF-IDF approach
%time classifier_tfidf = train_classifier(X_train_tfidf, y_train, C=1.0, model='lr')

Wall time: 53.1 s
Wall time: 1min 1s


In [16]:
#we make predictions for data
y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)

y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)

In [17]:
#we test the above predictions
y_test_pred_inversed = mlb.inverse_transform(y_test_predicted_labels_tfidf)
y_test_inversed = mlb.inverse_transform(y_test)
for i in range(3):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

Title:	making auto timeout stdcin statement
True labels:	c++,linux,multithreading
Predicted labels:	


Title:	penalty using char variables cuda kernels
True labels:	c,c++,performance
Predicted labels:	c,c++


Title:	sqlalchemy trying eager loading attribute error
True labels:	python
Predicted labels:	python




In [18]:
#we evaluate our model on accuracy, F1 score and precision
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

from functools import partial
def print_evaluation_scores(y_val, predicted):
    f1_score_macro = partial(f1_score,average="macro")
    f1_score_micro = partial(f1_score,average="micro")
    f1_score_weighted = partial(f1_score,average="weighted")
    
    average_precision_score_macro = partial(average_precision_score,average="macro")
    average_precision_score_micro = partial(average_precision_score,average="micro")
    average_precision_score_weighted = partial(average_precision_score,average="weighted")
    
    scores = [accuracy_score,f1_score_macro,f1_score_micro,f1_score_weighted,average_precision_score_macro,
             average_precision_score_micro,average_precision_score_weighted]
    for score in scores:
        print(score,score(y_val,predicted))

print('Bag-of-words')
print_evaluation_scores(y_test, y_test_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_test, y_test_predicted_labels_tfidf)

Bag-of-words
<function accuracy_score at 0x00000191ED421730> 0.35963333333333336
functools.partial(<function f1_score at 0x00000191ED421B70>, average='macro') 0.5175703397059582
functools.partial(<function f1_score at 0x00000191ED421B70>, average='micro') 0.6737210482175953
functools.partial(<function f1_score at 0x00000191ED421B70>, average='weighted') 0.6519454861228964
functools.partial(<function average_precision_score at 0x00000191ED3FCD08>, average='macro') 0.3564444300835289
functools.partial(<function average_precision_score at 0x00000191ED3FCD08>, average='micro') 0.4827362533826876
functools.partial(<function average_precision_score at 0x00000191ED3FCD08>, average='weighted') 0.512347381109696
Tfidf
<function accuracy_score at 0x00000191ED421730> 0.3526
functools.partial(<function f1_score at 0x00000191ED421B70>, average='macro') 0.5071887613085979
functools.partial(<function f1_score at 0x00000191ED421B70>, average='micro') 0.6657462348365882
functools.partial(<function f1_s

In [19]:
#we use weighted F1 and also L1 and L2 regularisation in logistic regression with different coeff
import matplotlib.pyplot as plt

hypers = np.arange(0.1, 1.1, 0.1)
res = []

for h in hypers:
    temp_model = train_classifier(X_train_tfidf, y_train, C=h, model='lr')
    temp_pred = f1_score(y_test, temp_model.predict(X_test_tfidf), average='weighted')
    res.append(temp_pred)

plt.figure(figsize=(7,5))
plt.plot(hypers, res, color='blue', marker='o')
plt.grid(True)
plt.xlabel('Parameter $C$')
plt.ylabel('Weighted F1 score')
plt.show()

<Figure size 700x500 with 1 Axes>

In [22]:
#we see that for c=1, we have the best fit model
# Final model
C = 1.0
classifier = train_classifier(X_train_tfidf, y_train, C=C, model='lr')

# Results
test_predictions =  classifier.predict(X_test_tfidf)
test_pred_inversed = mlb.inverse_transform(test_predictions)

#print out the tags for our test data
test_pred_inversed

[(),
 ('c', 'c++'),
 ('python',),
 ('javascript', 'twitterbootstrap'),
 ('javascript', 'jquery'),
 ('mysql', 'php'),
 (),
 ('javascript', 'jquery'),
 ('php',),
 (),
 ('laravel', 'php'),
 (),
 ('javascript',),
 ('javascript',),
 ('c#', 'java'),
 ('javascript',),
 ('c#',),
 ('java',),
 (),
 ('java',),
 (),
 ('ruby', 'rubyonrails', 'validation'),
 ('css', 'javascript', 'twitterbootstrap'),
 ('html', 'javascript'),
 (),
 ('c',),
 ('c#', 'linq'),
 ('javascript',),
 ('ios', 'objectivec', 'xcode'),
 ('python',),
 ('java',),
 ('php',),
 (),
 ('database', 'mysql', 'php'),
 ('laravel', 'php'),
 ('mysql', 'php'),
 ('rubyonrails',),
 ('css', 'regex'),
 ('c#', 'json'),
 ('c++', 'windows'),
 ('objectivec',),
 ('mysql', 'php'),
 ('database', 'django', 'python'),
 ('ios', 'swift'),
 ('javascript',),
 (),
 (),
 (),
 ('html', 'java', 'javascript'),
 ('string',),
 (),
 ('c++',),
 (),
 ('xml',),
 ('php',),
 ('sockets',),
 (),
 ('javascript', 'jquery'),
 ('c#', 'visualstudio2010'),
 ('javascript', 'jquery'

In [35]:
test_pred_inversed[3]

('javascript', 'twitterbootstrap')

In [38]:
X_test[3]

'bootstrap dropdown wrong place'

In [39]:
#after completion, we just check the top words associated with some common tags
def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):
    """
        classifier: trained classifier
        tag: particular tag
        tags_classes: a list of classes names from MultiLabelBinarizer
        index_to_words: index_to_words transformation
        all_words: all words in the dictionary
        
        return nothing, just print top 8 positive and top 8 negative words for current tag
    """
    print('Tag:\t{}'.format(tag))
    
    tag_n = np.where(tags_classes==tag)[0][0]
    
    model = classifier.estimators_[tag_n]
    top_positive_words = [index_to_words[x] for x in model.coef_.argsort().tolist()[0][-8:]]
    top_negative_words = [index_to_words[x] for x in model.coef_.argsort().tolist()[0][:8]]
    
    print('Top positive words:\t{}'.format(', '.join(top_positive_words)))
    print('Top negative words:\t{}\n'.format(', '.join(top_negative_words)))


print_words_for_tag(classifier, 'c', mlb.classes_, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier, 'c++', mlb.classes_, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier, 'linux', mlb.classes_, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier, 'python', mlb.classes_, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier, 'r', mlb.classes_, tfidf_reversed_vocab, ALL_WORDS)
print_words_for_tag(classifier, 'java', mlb.classes_, tfidf_reversed_vocab, ALL_WORDS)

Tag:	c
Top positive words:	fork, gtk, fscanf, kernel, printf, scanf, malloc, c
Top negative words:	php, python, javascript, java, objective c, c#, jquery, swift

Tag:	c++
Top positive words:	c++11, boostasio, mfc, stl, stdstring, boost, qt, c++
Top negative words:	php, java, javascript, python, c#, jquery, objectivec, swift

Tag:	linux
Top positive words:	address, shared, ubuntu, centos, killed, cron, kernel space, linux
Top negative words:	javascript, c#, jquery, array, method, windows, value, string

Tag:	python
Top positive words:	beautifulsoup, flask, sqlalchemy, tkinter, matplotlib, numpy, pandas, python
Top negative words:	php, c#, java, django python, javascript, jquery, r, c++

Tag:	r
Top positive words:	rs, data frame, dplyr, shiny, rstudio, ggplot, ggplot2, r
Top negative words:	python, android, php, java, javascript, pandas, c#, n

Tag:	java
Top positive words:	tomcat, jtable, javafx, android, jar, hibernate, spring, java
Top negative words:	php, python, c#, rails, django, r