# Prep for TF-IDF

In [1]:
# Import packages.
from io import BytesIO
from tensorflow.python.lib.io import file_io
import msgpack
import numpy as np
import pandas as pd

In [25]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 4000

In [4]:
# Read in data.
f = BytesIO(file_io.read_file_to_string('WM-PA-Binary-Data-min-6-votes.bin', binary_mode=True))
data = msgpack.unpack(f, raw=False)

In [5]:
df = pd.DataFrame(data)

In [6]:
# Pandas reads the data in with the cols and rows opposite of what we want.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115831,115832,115833,115834,115835,115836,115837,115838,115839,115840
label,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
text,"[[this, is, not, creative, .], [those, are, th...","[[the, term, standard, model, is, itself, less...","[[true, or, false, the, situation, as, of, mar...","[[next, maybe, you, could, work, on, being, le...","[[this, page, will, need, disambiguation, .]]","[[important, note, for, all, sysops, there, is...","[[i, removed, the, following, all, names, of, ...","[[if, you, ever, claimed, in, a, judaic, studi...","[[my, apologies, im, english, i, watch, cricke...","[[someone, wrote, more, recognizable, perhaps,...",...,"[[the, lead, itself, is, original, research, ....","[[well, done, thanks, !]]","[[im, talking, about, you, making, unjustified...","[[yes, from, the, word, guci, or, puci, meanin...","[[comment, .], [gentlemen, this, article, prov...","[[these, sources, dont, exactly, exude, a, sen...","[[the, institute, for, historical, review, is,...","[[the, way, youre, trying, to, describe, it, i...","[[warning, there, is, clearly, a, protectionis...","[[alternate, option, is, there, perhaps, enoug..."
idx,"[[14, 9, 16, 3450, 1], [139, 21, 2, 2165, 2892...","[[2, 373, 846, 2004, 9, 558, 452, 635, 97, 8, ...","[[350, 28, 505, 2, 903, 20, 5, 1073, 2331, 25,...","[[512, 360, 7, 108, 153, 18, 93, 452, 5839, 24...","[[14, 34, 46, 128, 1732, 1]]","[[383, 249, 15, 42, 9059, 44, 9, 6, 4994, 12, ...","[[8, 205, 2, 445, 42, 572, 5, 929, 1653, 6405,...","[[26, 7, 323, 1240, 12, 6, 20738, 1678, 1529, ...","[[30, 2250, 77, 271, 8, 893, 5331, 8, 74, 224,...","[[151, 498, 64, 14320, 386, 9, 6, 795, 5, 41, ...",...,"[[2, 588, 558, 9, 346, 397, 1], [127, 9, 2, 85...","[[100, 227, 112, 10]]","[[77, 544, 39, 7, 266, 5210, 648, 448, 1], [12...","[[336, 36, 2, 283, 36994, 28, 36994, 805, 5804...","[[215, 1], [10120, 14, 27, 2073, 31, 4804, 5, ...","[[113, 117, 55, 601, 36994, 6, 458, 5, 9090, 1...","[[2, 3105, 15, 849, 443, 9, 6, 1504, 1670, 187...","[[2, 110, 165, 244, 3, 1322, 13, 12, 14, 27, 9...","[[485, 44, 9, 335, 6, 36994, 3651, 148, 18, 68...","[[3845, 2317, 9, 44, 386, 327, 14698, 109, 3, ..."
rev_id,37675,44816,49851,89320,93890,102817,103624,111032,120283,128532,...,699756185,699780538,699813325,699820699,699822249,699848324,699851288,699857133,699891012,699897151


In [12]:
# Transpose df.
transposed_df = df.T
transposed_df.head()

Unnamed: 0,label,text,idx,rev_id
0,0,"[[this, is, not, creative, .], [those, are, th...","[[14, 9, 16, 3450, 1], [139, 21, 2, 2165, 2892...",37675
1,0,"[[the, term, standard, model, is, itself, less...","[[2, 373, 846, 2004, 9, 558, 452, 635, 97, 8, ...",44816
2,0,"[[true, or, false, the, situation, as, of, mar...","[[350, 28, 505, 2, 903, 20, 5, 1073, 2331, 25,...",49851
3,0,"[[next, maybe, you, could, work, on, being, le...","[[512, 360, 7, 108, 153, 18, 93, 452, 5839, 24...",89320
4,0,"[[this, page, will, need, disambiguation, .]]","[[14, 34, 46, 128, 1732, 1]]",93890


In [32]:
rev_id = transposed_df[transposed_df['rev_id'] =='5108359']

[print(x) for x in rev_id.text]
[print(x) for x in rev_id.idx] 

[['and', 'what', 'is', 'this', 'crew', 'member', 'nonsense', 'youre', 'throwing', 'out', 'on', 'page', 'after', 'page', '?']]


[None]

[[4, 41, 9, 14, 4931, 912, 798, 165, 3543, 84, 18, 34, 149, 34, 17]]


[None]

## Detokenize Docs

In [110]:
# Define function to apply to text column to detokenize the comments.
def detokenize_doc(lists_of_tokens):
    detokenized_doc = []
    for list_of_tokens in lists_of_tokens:
        sentences = ' '.join(list_of_tokens).replace(' , ',',').replace(' .','.').replace(' !','!')
        sentences = sentences.replace(' ?','?').replace(' : ',': ').replace(' \'', '\'')
        detokenized_doc.append(sentences)

    # Flatten. 
    doc = ''
    for sent in detokenized_doc:
        doc += sent + ' '
        
    return doc

In [111]:
# Concatenate tokens back together into documents for TfidVectorizer.
transposed_df.text = transposed_df.text.apply(detokenize_doc)
transposed_df.head()

Unnamed: 0,idx,label,rev_id,text
0,"[[14, 9, 16, 3450, 1], [139, 21, 2, 2165, 2892...",0,37675,this is not creative. those are the dictionary...
1,"[[2, 373, 846, 2004, 9, 558, 452, 635, 97, 8, ...",0,44816,the term standard model is itself less npov th...
2,"[[350, 28, 505, 2, 903, 20, 5, 1073, 2331, 25,...",0,49851,true or false the situation as of march 2002 w...
3,"[[512, 360, 7, 108, 153, 18, 93, 452, 5839, 24...",0,89320,next maybe you could work on being less condes...
4,"[[14, 34, 46, 128, 1732, 1]]",0,93890,this page will need disambiguation.


In [112]:
print(transposed_df.text[0])

this is not creative. those are the dictionary definitions of the terms insurance and ensurance as properly applied to destruction. if you dont understand that fine legitimate criticism ill write up three man cell and bounty hunter and then it will be easy to understand why ensured and insured are different and why both differ from assured. the sentence you quote is absolutely neutral. you just arent familiar with the underlying theory of strike back e. g. submarines as employed in nuclear warfare guiding the insurance nor likely the three man cell structure that kept the ira from being broken by the british. if thats my fault fine i can fix that to explain. but theres nothing personal or creative about it. im tired of arguing with you. re the other article multi party turns up plenty and there is more use of mutually than mutual. if i were to apply your standard id be moving mutual assured destruction to talk for not appealing to a reagan voters biases about its effectiveness and for 

In [113]:
csv_path = 'WM-PA-Binary-Data-min-6-votes-TF-IDF.csv'
transposed_df.to_csv(csv_path)