In [116]:
import numpy as np 
import pandas as pd
import quora_vocab as qv
from sklearn import linear_model
import utilities as ut
from importlib import reload
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
output_notebook()

In [117]:
# Location of the Quora training data. DATA_PATH keeps its trailing slash so
# that the file name can be appended to it directly.
DATA_PATH = '~/google_drive/data/quora/'
DATA_FILE = DATA_PATH + 'train.csv'

### Data Loading

In [118]:
# Read the training CSV at DATA_FILE into a DataFrame and preview its first rows.
data = pd.read_csv(DATA_FILE)
data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


### Tokenizing Each Comment

In [119]:
# Split the rows by label (column 2: 1 = inappropriate, 0 = appropriate) and
# tokenize the question text (column 1) with ut.canon_token_sentence. Each
# output entry keeps the row layout [column 0, tokenized text, label].
#
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same array and works on old and new versions alike.
# The array is materialised once here instead of once per comprehension.
_rows = data.values
data_pos_tokenized = [[vec[0], ut.canon_token_sentence(vec[1]), vec[2]]
                      for vec in _rows if vec[2] == 1]
data_neg_tokenized = [[vec[0], ut.canon_token_sentence(vec[1]), vec[2]]
                      for vec in _rows if vec[2] == 0]

  
  after removing the cwd from sys.path.


#### Tokenized Inappropriate Question Example

In [354]:
# Re-join the tokens of one positive-class question (index 104, presumably hand-picked) for display.
' '.join(data_pos_tokenized[104][1])

"was russia's invasion on ukraine a part of polish revenge plan for what happened in wolyn when ukrainians brutally murdered tens thousands of polish to settle down there"

#### Tokenized Appropriate Question Example

In [355]:
# Re-join the tokens of one negative-class question (index 11, presumably hand-picked) for display.
' '.join(data_neg_tokenized[11][1])

'how were the calgary flames founded'

### Train Test Split

In [123]:
# Hold out the first NVALID examples of each class as the validation set and
# use everything else for training. Both lists are shuffled so that the two
# classes are interleaved.
from random import Random, shuffle  # `shuffle` stays imported for any other cell that uses it

NVALID = 5000
SPLIT_SEED = 0  # fixed seed so that Restart & Run All reproduces the same ordering

# A dedicated generator keeps the split reproducible without touching the
# global `random` state that other cells may rely on.
_split_rng = Random(SPLIT_SEED)

train_data = data_pos_tokenized[NVALID:] + data_neg_tokenized[NVALID:]
_split_rng.shuffle(train_data)
valid_data = data_pos_tokenized[:NVALID] + data_neg_tokenized[:NVALID]
_split_rng.shuffle(valid_data)

### Class Construction

In [361]:
# Re-import quora_vocab so that edits to the module are picked up without
# restarting the kernel.
reload(qv)
# NOTE: the vocabulary is built from valid_data only (NVALID examples per
# class), not from the full data set.
comments = qv.CommentVocab(valid_data)

Processing Comments: 100%|██████████| 10000/10000 [00:00<00:00, 56073.66it/s]


### Comparison of Question Length by Class

In [362]:
# Number of tokens in every question, collected separately for each class.
pos_lengths = [len(row[1]) for row in data_pos_tokenized]
neg_lengths = [len(row[1]) for row in data_neg_tokenized]

# Header, summary table, then (for the first report only) a blank separator line.
print('----Length of Positive Examples-----', pd.Series(pos_lengths).describe(), '', sep='\n')
print('----Length of Negative Examples-----', pd.Series(neg_lengths).describe(), sep='\n')

----Length of Positive Examples-----
count    80810.000000
mean        17.462888
std          9.686994
min          1.000000
25%         10.000000
50%         15.000000
75%         23.000000
max         90.000000
dtype: float64

----Length of Negative Examples-----
count    1.225312e+06
mean     1.263137e+01
std      6.843014e+00
min      2.000000e+00
25%      8.000000e+00
50%      1.100000e+01
75%      1.500000e+01
max      1.330000e+02
dtype: float64


In [363]:
# Draw the comment-length graph for `comments` (built from valid_data);
# presumably the per-class length distribution discussed below — confirm in quora_vocab.
comments.comment_length_graph()

#### Comments

* We can see from the summary statistics and the empirical distribution that the distribution of inappropriate questions has far fatter tails than the distribution of appropriate questions, demonstrating that inappropriate questions tend to be somewhat longer. This matches what one would expect from the description of the inappropriate class, which includes questions that are really statements of some position that the user holds, or just generalized trolling.

* On the other hand, one can interpret the relative shortness of appropriate questions as reflecting more concise and clearly stated questions.

### Comparison of Word Frequency By Class

In [359]:
# Word-frequency graphs limited by min_rank=3 and max_rank=50; presumably the
# rank is a word's position in the frequency ordering — confirm in quora_vocab.
comments.word_frequency_graphs(min_rank=3,max_rank=50)

In [360]:
# Word-count difference graph between the classes; the argument 20 is
# presumably the number of words shown — confirm in quora_vocab.
comments.word_count_difference_graph(20)

#### Comments

* The preceding graphs show the differences in word frequency between classes.

* One thing that the last graph shows, which one should expect, is the higher frequency, in the inappropriate class, of words that are either polarizing, e.g. 'trump', or name groups who are often subject to claims of supremacy or inferiority depending on the prejudices of the asker, e.g. 'white', 'men', 'women', 'muslims'.

* One interesting, unexpected result is the difference in the frequency of the interrogative words used in the two classes of questions: 'what', 'how', and 'which' are more frequent in the appropriate class, while the interrogative word 'why' is far more likely in the inappropriate class.

* Perhaps the increased use of 'why' in the inappropriate class is due to the ease with which one can disguise a statement of a dubious, non-factual nature as a why question. For example, the question 'why are aliens managing my local dairy queen?' presents the premise, that aliens are managing a dairy queen somewhere, as fact and implicitly requires the reader to accept the premise in order to respond directly to it. Note that questions like 'which of my local dairy queens is managed by aliens', 'how can my local dairy queen be managed by aliens', and 'is/can my local dairy queen be managed by aliens' do not require the answerer to accept the premise in order to respond.

In [397]:
def flatten_bigram_counts(nested_counts):
    """Flatten nested bigram counts into a list sorted by descending count.

    Parameters
    ----------
    nested_counts : mapping
        ``{word1: {word2: count}}``, as iterated from
        ``comments.bigram_counts[label]``.

    Returns
    -------
    list of (str, count)
        One ``('word1_word2', count)`` pair per bigram, most frequent first.
        Ties keep the iteration order of the input (the sort is stable).
    """
    flat = [('{}_{}'.format(word1, word2), count)
            for word1, followers in nested_counts.items()
            for word2, count in followers.items()]
    return sorted(flat, key=lambda pair: -pair[1])


# Index 0 holds the negative-class counts and index 1 the positive-class counts.
neg_bigram_counts_reformat = flatten_bigram_counts(comments.bigram_counts[0])
pos_bigram_counts_reformat = flatten_bigram_counts(comments.bigram_counts[1])

# Built with comprehensions rather than `a, _ = zip(*pairs)`: that form raises
# ValueError when a class has no bigrams and also overwrites the notebook's `_`.
neg_bigram_vocab = tuple(bigram for bigram, _count in neg_bigram_counts_reformat)
pos_bigram_vocab = tuple(bigram for bigram, _count in pos_bigram_counts_reformat)

bigram_vocab = set(neg_bigram_vocab) | set(pos_bigram_vocab)

# Diagnostic: a 'word1_word2' key can only be split back into its two words
# when neither token contains '_'. Print every key for which that fails.
for bigram in bigram_vocab:
    split = bigram.split('_')
    if len(split) > 2:
        print(split)

# NOTE(review): unfinished draft kept for reference. It is not safe to enable
# as written: `bigram.split('_')` fails to unpack for the keys printed above,
# and `.get(word2)` yields None (not 0) when word2 is missing for a class.
# for bigram in bigram_vocab:
#     word1,word2 = bigram.split('_')
#     neg_count_dict = comments.bigram_counts[0].get(word1,0)
#     pos_count_dict = comments.bigram_counts[1].get(word1,0)
#     if neg_count_dict != 0:
#         neg_count = neg_count_dict.get(word2)
#     else:
#         neg_count = 0
#     if pos_count_dict != 0:
#         pos_count = pos_count_dict.get(word2)
#     else:
#         pos_count = 0
        

['a', '{x}<b', '{x}<c', '{x}<d', '{x}\\right]\\in', 'x', '{x}\\mapsto']
['*xx', '360,420,666', 'xx*', 'why']
['of', '*xx', '360,420,666', 'xx*']
['d', '{x}', '1\\right']
['x', '{x}\\mapsto', 'x', '{x+1}']
['math]x\\to\\infty\\displaystyle\\lim', '{x=1}\\frac{d', '{x}}{\\left', 'x+1\\right']
['math]\\left', 'd', '{x}']
['math]\\left', 'a', '{x}<b', '{x}<c', '{x}<d', '{x}\\right]\\in']
['x', '{x+1}', '']
['\\left', 'a', '{x}']
['as', 'math]x\\to\\infty\\displaystyle\\lim', '{x=1}\\frac{d', '{x}}{\\left']
['a', '{x}', '1\\right']


In [390]:
# Ten most frequent bigrams of the positive class (the list is sorted by descending count).
pos_bigram_counts_reformat[:10]

[('<s>_why', 1872),
 ('why_do', 759),
 ('<s>_how', 402),
 ('why_are', 401),
 ('<s>_is', 388),
 ('is_it', 345),
 ('<s>_what', 312),
 ('in_the', 297),
 ('<s>_do', 291),
 ('why_is', 281)]

In [395]:
# Sanity check: a key whose tokens contain no underscore splits back into exactly two words.
'word1_word2'.split('_')

['word1', 'word2']