## Import requires libraries

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', 200)

## Read the dataset

In [2]:
data=pd.read_hdf('auto_tagging_data_v2.h5')

In [3]:
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,6,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",[machine-learning]
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,[forecasting]
2,22,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,[bayesian]
3,31,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....","[hypothesis-testing, t-test, p-value, interpretation]"
4,36,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",[correlation]


In [4]:
#combine title and body
data['text']=data['Title']+ " " +data['Body']

In [5]:
data.head()

Unnamed: 0,Id,Title,Body,Tags,text
0,6,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",[machine-learning],"The Two Cultures: statistics vs. machine learning? <p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statisti..."
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,[forecasting],Forecasting demographic census <p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census ...
2,22,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,[bayesian],Bayesian and frequentist reasoning in plain English <p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n
3,31,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....","[hypothesis-testing, t-test, p-value, interpretation]","What is the meaning of p values and t values in statistical tests? <p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk b..."
4,36,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",[correlation],"Examples for teaching: Correlation does not mean causation <p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustra..."


## Clean and preprocess the data

In [6]:
def clean_text(text):
    text=text.lower()
    # remove html tags and url links
    text = re.sub(r'<.*?>', '', text)
    # remove everything other than alphabets
    text = re.sub("[^a-zA-Z]"," ",text)
    # remove whitespaces
    text = ' '.join(text.split())
    
    return text

In [7]:
data['Cleaned Text']=data['text'].apply(clean_text)

In [8]:
data.head()

Unnamed: 0,Id,Title,Body,Tags,text,Cleaned Text
0,6,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",[machine-learning],"The Two Cultures: statistics vs. machine learning? <p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statisti...",the two cultures statistics vs machine learning last year i read a blog post from brendan o connor entitled statistics vs machine learning fight that discussed some of the differences between the ...
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,[forecasting],Forecasting demographic census <p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census ...,forecasting demographic census what are some of the ways to forecast demographic census with some validation and calibration techniques some of the concerns census blocks vary in sizes as rural ar...
2,22,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,[bayesian],Bayesian and frequentist reasoning in plain English <p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,bayesian and frequentist reasoning in plain english how would you describe in plain english the characteristics that distinguish bayesian from frequentist reasoning
3,31,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....","[hypothesis-testing, t-test, p-value, interpretation]","What is the meaning of p values and t values in statistical tests? <p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk b...",what is the meaning of p values and t values in statistical tests after taking a statistics course and then trying to help fellow students i noticed one subject that inspires much head desk bangin...
4,36,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",[correlation],"Examples for teaching: Correlation does not mean causation <p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustra...",examples for teaching correlation does not mean causation there is an old saying correlation does not mean causation when i teach i tend to use the following standard examples to illustrate this p...


In [9]:
data[['Id', 'Cleaned Text', 'Tags']].sample(5)

Unnamed: 0,Id,Cleaned Text,Tags
22191,5727,james stein estimator how did efron and morris calculate sigma in shrinkage factor for their baseball example i have a question on calculating james stein shrinkage factor in the scientific americ...,[estimation]
33743,115058,best method of estimating the output rate of a group i have a sample dataset consisting of fields output volume and input time in hours for different persons performing an identical task several t...,[mean]
75675,177002,how are the control limit table constants calculated for reference https controls engin umich edu wiki images control chart constants jpg in a given set of manufacturing data you can determine the...,[standard-deviation]
37135,33278,dummy coding categorical variables regression mediation i have more than one question if you can help i have a regression model with control variables each with categories i am wondering if there ...,[regression]
11572,212987,residuals as indication of transformation of data in r i have data and i want to make a regression analysis finding a function that can fit the data so head data gives promotion new users to find ...,"[regression, residuals]"


## Removing stopwords

In [10]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [11]:
def strip_stopwords(text):
    clean_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(clean_text)

In [12]:
data['Cleaned Text']=data['Cleaned Text'].apply(strip_stopwords)

## Reshape target variable

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

In [14]:
multilabel_binarizer=MultiLabelBinarizer()
multilabel_binarizer.fit(data['Tags'])

#converting the target label
y=multilabel_binarizer.transform(data['Tags'])

In [15]:
y.shape

(76365, 100)

## Feature extraction

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
vect=TfidfVectorizer(max_df=0.8,max_features=10000)
X_tfidf =vect.fit_transform(data['Cleaned Text'])

## Train Test split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# split dataset into training and validation set
x_train_tfidf, x_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=9)

## Model building

In [20]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [21]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [22]:
# fit model on train data
clf.fit(x_train_tfidf, y_train_tfidf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

OneVsRestClassifier(estimator=LogisticRegression())

## Predictions and performance metrics

In [23]:
#making predictions on validation set
y_pred = clf.predict(x_val_tfidf)

In [53]:
y_pred[:3]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,

In [51]:
multilabel_binarizer.inverse_transform(y_pred)[:3] #predicted tags

[('prediction',), ('distributions', 'mean', 'variance'), ('r',)]

In [52]:
multilabel_binarizer.inverse_transform(y_val_tfidf)[:3] #actual tags

[('confidence-interval', 'regression'),
 ('distributions', 'mean', 'variance'),
 ('bayesian', 'r')]

In [26]:
#evaluate performance
f1_score(y_val_tfidf, y_pred, average=None)

array([0.04040404, 0.62266501, 0.63430421, 0.41269841, 0.57627119,
       0.15151515, 0.359375  , 0.688     , 0.5326087 , 0.23913043,
       0.51530612, 0.43896976, 0.72303207, 0.18181818, 0.57769653,
       0.59310345, 0.33513514, 0.49367089, 0.64065708, 0.08910891,
       0.30662021, 0.42592593, 0.01298701, 0.40740741, 0.29063509,
       0.05494505, 0.17142857, 0.27272727, 0.26506024, 0.5388601 ,
       0.34558824, 0.5158371 , 0.41395349, 0.31944444, 0.35056968,
       0.05594406, 0.48201439, 0.08465608, 0.28193833, 0.03125   ,
       0.65042174, 0.37931034, 0.01104972, 0.44029851, 0.56047198,
       0.36619718, 0.17346939, 0.43333333, 0.54253612, 0.        ,
       0.1875    , 0.        , 0.39370079, 0.26506024, 0.32900433,
       0.18837675, 0.17910448, 0.7617689 , 0.10344828, 0.28310502,
       0.38511327, 0.24752475, 0.47560976, 0.52325581, 0.14117647,
       0.52918288, 0.7761807 , 0.29032258, 0.49781659, 0.06451613,
       0.06617647, 0.40333333, 0.27692308, 0.36666667, 0.56498

In [27]:
np.mean(f1_score(y_val_tfidf, y_pred, average=None))

0.35117052776248087

In [28]:
# evaluate performance
f1_score(y_val_tfidf, y_pred, average="micro")

0.434766545051494

In [29]:
# evaluate performance
f1_score(y_val_tfidf, y_pred, average="macro")

0.35117052776248087

In [30]:
# predict probabilities
y_pred_prob = clf.predict_proba(x_val_tfidf)

In [31]:
# set threshold value
t = 0.45

# convert to integers
y = (y_pred_prob >= t).astype(int)
f1_score(y_val_tfidf, y, average="micro")

0.4594438006952491

## Inference

In [32]:
def infer_tags(q):
    q = clean_text(q)
    q = q.lower()
    q = strip_stopwords(q)
    q_vec = vect.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)

In [37]:
# give new question
new_q = "meaning p values values statistical tests taking statistics course trying help fellow students noticed one subject inspires much head desk banging interpreting results statistical hypothesis tests seems students easily learn perform calculations required given test get hung interpreting results many computerized tools report test results terms p values values would explain following points college students taking first course statistics p value mean relation hypothesis tested cases one looking high p value low p value relationship p value value?"

# get tags
infer_tags(new_q)

[('hypothesis-testing', 'p-value')]