In [1]:
# imports
import tomotopy as tp
import pandas as pd
import re
import numpy as np

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
# load in dataset
post_words = pd.read_csv('data/train.csv')
post_words.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [3]:
# separates code from body and puts into new column
def separate_code(df):
    df['Code'] = df['Body'].map(lambda x: re.findall(r'(?s)(?<=<code>)(.*?)(?=<\/code>)', x) if re.findall(r'(?s)(?<=<code>)(.*?)(?=<\/code>)', x) != [] else [""])
    
# cleans body by removing code, tags, extra space, and change all case to lower, removes stopwords
def clean_df(df):
    df['Body_processed'] = df['Body'].map(lambda x: re.sub(r'(?s)(?<=<code>)(.*?)(?=<\/code>)','', x))
    df['Body_processed'] = df['Body_processed'].map(lambda x: re.sub('<.*?>', '', x))
    df['Body_processed'] = df['Body_processed'].map(lambda x: re.sub('[^\w\s]', '', x))
    df['Body_processed'] = df['Body_processed'].map(lambda x: re.sub('\\n+', ' ', x))
    df['Body_processed'] = df['Body_processed'].map(lambda x: re.sub('\/', ' ', x))
    df['Body_processed'] = df['Body_processed'].map(lambda x: re.sub(' [ ]+', ' ', x))
    df['Body_processed'] = df['Body_processed'].map(lambda x: x.lower())
    
    stop = set(stopwords.words('english'))
    df['Body_processed'] = df['Body_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    
    df['Title_processed'] = df['Title'].map(lambda x: re.sub(r'(?s)(?<=<code>)(.*?)(?=<\/code>)','', x))
    df['Title_processed'] = df['Title_processed'].map(lambda x: re.sub('<.*?>', '', x))
    df['Title_processed'] = df['Title_processed'].map(lambda x: re.sub('[^\w\s]', '', x))
    df['Title_processed'] = df['Title_processed'].map(lambda x: re.sub('\\n+', ' ', x))
    df['Title_processed'] = df['Title_processed'].map(lambda x: re.sub('\/', ' ', x))
    df['Title_processed'] = df['Title_processed'].map(lambda x: re.sub(' [ ]+', ' ', x))
    df['Title_processed'] = df['Title_processed'].map(lambda x: x.lower())
    
    df['Title_processed'] = df['Title_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


# converts Tags column to functional list
def to_lst(tag):
    pattern = r"<([^<>]*)>"
    return re.findall(pattern, tag)

# apply data cleaning functions
separate_code(post_words)
clean_df(post_words)
post_words['Tags'] = post_words["Tags"].apply(to_lst)

post_words.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y,Code,Body_processed,Title_processed
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,"[java, repeat]",2016-01-01 00:21:59,LQ_CLOSE,[],im already familiar repeating tasks every n se...,java repeat task every random seconds
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,"[java, optional]",2016-01-01 02:03:20,HQ,[],id like understand java 8 optionals designed i...,java optionals immutable
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,"[javascript, image, overlay, react-native, opa...",2016-01-01 02:48:24,HQ,[//component for article preview touchable ima...,attempting overlay title image image darkened ...,text overlay image darkened opacity react native
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...","[swift, operators, whitespace, ternary-operato...",2016-01-01 03:30:17,HQ,"[return x == 0? """" : ""Hello""\n, return x == 0 ...",question simple could find answer doesnt compi...,ternary operator swift picky
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,"[android, material-design, floating-action-but...",2016-01-01 05:21:48,HQ,[],im using custom floatingactionmenu need implem...,hideshow fab scale animation


In [4]:
# creates dataframes with only posts that contain code and no code
codeidx = post_words["Code"].str[0] != ''
has_code = post_words[codeidx]
no_code = post_words[-codeidx]

In [5]:
# creates dataframes with only posts that are labeled as high quality, low quality closed, or low quality edited
hq = post_words[post_words["Y"] == "HQ"]
lq_close = post_words[post_words["Y"] == "LQ_CLOSE"]
lq_edit = post_words[post_words["Y"] == "LQ_EDIT"]

In [6]:
# PLDA model, uses Title, Body_processed, and Code as features
def model(df, test_on, name):
    
    # combines features into one, converts to list of words
    def formatting(df):
        df['Input'] = df['Title_processed'].str.lower() + ' ' + df['Body_processed'] + ' ' + df['Code'].str[0]
        df['Input'] = df['Input'].str.split()

    tomo_df = df.copy()
    formatting(tomo_df)

    fin = tomo_df[['Input', 'Tags']]
    
    train, test = train_test_split(fin, train_size = 0.12, shuffle=True)
    plda_mdl = tp.PLDAModel()

    # trains model
    for index, row in train.iterrows():
        plda_mdl.add_doc(row['Input'], row['Tags'])
    for _ in range(10):
        plda_mdl.train(10)
#         print('Iteration: {}\tLog-likelihood: {}'.format(i, plda_mdl.ll_per_word))
    
    # calculates train accuracy
    count = 0
    total = 0
    for i in train.index[:test_on]:
        total += 1
        new_doc = fin['Input'][i]
        infer_doc = plda_mdl.make_doc(new_doc)
        infer_topic_dist = plda_mdl.infer(infer_doc)
        np.where(infer_topic_dist[0] == max(infer_topic_dist[0]))
        if plda_mdl.topic_label_dict[np.where(infer_topic_dist[0] == max(infer_topic_dist[0]))[0][0]] in fin['Tags'][i]:
            count += 1
#         print("accuracy: ", count/total)
    train_acc = count/total
    
    # calculates test accuracy and saves prediction to a csv
    count = 0
    total = 0
    predictions_test = []
    test_preds = test[:test_on].copy()
    for i in test.index[:test_on]:
        total += 1
        new_doc = fin['Input'][i]
        infer_doc = plda_mdl.make_doc(new_doc)
        infer_topic_dist = plda_mdl.infer(infer_doc)
        np.where(infer_topic_dist[0] == max(infer_topic_dist[0]))
        predictions_test.append(plda_mdl.topic_label_dict\
                                [np.where(infer_topic_dist[0] == \
                                          max(infer_topic_dist[0]))[0][0]])

        if predictions_test[-1] in fin['Tags'][i]:
            count += 1
    test_preds['Predictions'] = predictions_test
    test_preds.to_csv(name)
    test_acc = count/total
#     print(train_acc, test_acc)
    return train_acc, test_acc, plda_mdl

In [7]:
# tests model on dataset as a whole
tr1, te1, pwm = model(post_words, 2000, "pwm.csv")
print("post_words: ", tr1, te1)

post_words:  0.8355 0.4835


In [8]:
# tests model on the following subsets of the data:
# posts that have code
tr2, te2, hcm = model(has_code, 2000, "hcm.csv")
print("has_code: ", tr2, te2)

has_code:  0.8755 0.454


In [9]:
# posts that have no code
tr3, te3, ncm = model(no_code, 2000, "ncm.csv")
print("no_code: ", tr3, te3)

no_code:  0.828 0.416


In [10]:
# posts labeled as high quality
tr4, te4, hqm = model(hq, 2000, "hqm.csv")
print("hq: ", tr4, te4)

hq:  0.8483333333333334 0.315


In [11]:
# posts labeled as low quality close
tr5, te5, lcm = model(lq_close, 2000, "lcm.csv")
print("lq_close: ", tr5, te5)

lq_close:  0.8983333333333333 0.4785


In [12]:
# posts labeled as low quality edit
tr6, te6, lem = model(lq_edit, 2000, "lem.csv")
print("lq_edit: ", tr6, te6)

lq_edit:  0.8438888888888889 0.4475
