In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df_questions = pd.read_hdf('auto_tagging_data_v2.h5')
df_questions.head()

Unnamed: 0,Id,Title,Body,Tags
0,6,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=...",[machine-learning]
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demog...,[forecasting]
2,22,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...,[bayesian]
3,31,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...,"[hypothesis-testing, t-test, p-value, interpre..."
4,36,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n...",[correlation]


In [3]:
df_questions['Text'] = df_questions["Title"] + " " + df_questions["Body"]

In [4]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    
    return text

In [7]:
df_questions['Text'] = df_questions['Text'].apply(lambda x: clean_text(x))
df_questions['Text'] = df_questions['Text'].str.lower()
df_questions.head()

Unnamed: 0,Id,Title,Body,Tags,Text
0,6,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=...",[machine-learning],the two cultures statistics vs machine learnin...
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demog...,[forecasting],forecasting demographic census what are some o...
2,22,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...,[bayesian],bayesian and frequentist reasoning in plain en...
3,31,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...,"[hypothesis-testing, t-test, p-value, interpre...",what is the meaning of p values and t values i...
4,36,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n...",[correlation],examples for teaching correlation does not mea...


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_questions['Text'])

In [9]:
len(tokenizer.word_index)

81956

In [10]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

81957

In [12]:
sequences = tokenizer.texts_to_sequences(df_questions['Text'])
print(df_questions['Text'][0])
print(sequences[0])

the two cultures statistics vs machine learning last year i read a blog post from brendan o connor entitled statistics vs machine learning fight that discussed some of the differences between the two fields andrew gelman responded favorably to this simon blomberg from r s fortunes package to paraphrase provocatively machine learning is statistics minus any checking of models and assumptions brian d ripley about the difference between machine learning and statistics user vienna may season s greetings andrew gelman in that case maybe we should get rid of checking of models and assumptions more often then maybe we d be able to solve some of the problems that the machine learning people can solve but we can t there was also the statistical modeling the two cultures paper by leo breiman in which argued that statisticians rely too heavily on data modeling and that machine learning techniques are making progress by instead relying on the predictive accuracy of models has the statistics field 

In [13]:
max_length = 125
padded_seq = pad_sequences(sequences, maxlen=max_length)

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions['Tags'])
y = multilabel_binarizer.transform(df_questions['Tags'])

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(padded_seq,y,test_size=0.2,random_state=9)

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, MaxPooling1D, Dropout, Conv1D
from keras.callbacks import EarlyStopping

In [22]:
model = Sequential()
model.add(Embedding(vocab_size +1, 128, input_length = max_length))
model.add(Dropout(0.15))
model.add(Conv1D(300, 5, padding = 'valid', activation = "relu"))
model.add(MaxPooling1D())
model.add(Dense(100, activation = "sigmoid"))

In [23]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 125, 128)          10490624  
_________________________________________________________________
dropout_2 (Dropout)          (None, 125, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 123, 300)          115500    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               30100     
Total params: 10,636,224
Trainable params: 10,636,224
Non-trainable params: 0
_________________________________________________________________


In [39]:
callbacks=[EarlyStopping(patience=3)]
history = model.fit(x_train, y_train,epochs=15,batch_size=128,validation_split=0.1,callbacks=callbacks)

Train on 54982 samples, validate on 6110 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15


In [40]:
model.save('autotag.h5')

In [18]:
from keras.models import load_model
model=load_model('autotag.h5')

In [19]:
def infer_tags(q):
    q = clean_text(q)
    q = q.lower()
    q_seq = tokenizer.texts_to_sequences([q])
    q_seq_padded = pad_sequences(q_seq, maxlen=125)
    q_pred = model.predict(q_seq_padded)
    q_pred = (q_pred >= 0.3).astype(int)
    
    return multilabel_binarizer.inverse_transform(q_pred)

In [21]:
new_q = "Regression line doesn't match computed regression Im using R and created a chart using ggplot2. I then create a regression so I can make some predicitions I pass my data frame of to the predict function predict(regression, Measures) I'd expect the predictions to be the same as if I used the regression line on the chart, but they aren't the same. Why would this be the case? Is there a setting in ggplot or is my expectation incorrect?"
infer_tags(new_q)

[('data-visualization', 'r', 'regression')]