In [1]:
import numpy as np 
import pandas as pd
import quora_vocab as qv
from sklearn import linear_model
import utilities as ut
from importlib import reload
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
# Configure Bokeh to render figures inline in this notebook's cell outputs.
output_notebook()

In [2]:
# Directory holding the Quora dataset (trailing slash is required because the
# file name is appended directly) and the full path of the training CSV.
DATA_PATH = '~/google_drive/data/quora/'
DATA_FILE = DATA_PATH + 'train.csv'

### Data Loading

In [3]:
# Read the raw training CSV into a DataFrame and preview the first rows
# (columns shown below: qid, question_text, target).
data = pd.read_csv(DATA_FILE)
data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


### Tokenizing Each Comment

In [4]:
# Tokenize every question and split the rows by label.
#
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in pandas 1.0
# (presumably the source of the warning fragment recorded under this cell);
# `.values` yields the same row-wise ndarray and works on old and new pandas.
# The frame is converted once and reused for both passes.
#
# Row layout, per the `data.head()` preview: vec[0] = qid, vec[1] = question_text,
# vec[2] = target. Rows with target == 1 are the "positive" (inappropriate) class,
# rows with target == 0 the "negative" (appropriate) class.
data_rows = data.values
data_pos_tokenized = [[vec[0], ut.canon_token_sentence(vec[1]), vec[2]]
                      for vec in data_rows if vec[2] == 1]
data_neg_tokenized = [[vec[0], ut.canon_token_sentence(vec[1]), vec[2]]
                      for vec in data_rows if vec[2] == 0]

  
  after removing the cwd from sys.path.


#### Tokenized Inappropriate Question Example

In [5]:
# Display the token list of positive (inappropriate) example #103, re-joined with spaces.
' '.join(data_pos_tokenized[103][1])

'what do democrats think of the fact that i am homophobic and was born this way will this make you rethink your rhetoric'

#### Tokenized Appropriate Question Example

In [6]:
# Display the token list of negative (appropriate) example #11, re-joined with spaces.
' '.join(data_neg_tokenized[11][1])

'how were the calgary flames founded'

### Train Test Split

In [7]:
# `shuffle` is still imported so the name stays available to any later cell that uses it.
from random import Random, shuffle

NVALID = 5000      # number of examples held out per class for validation
SPLIT_SEED = 42    # fixed seed so Restart & Run All reproduces the same ordering

# A private RNG keeps the shuffles reproducible without touching the global
# `random` state used elsewhere.
_split_rng = Random(SPLIT_SEED)

# The first NVALID rows of each class form the validation set; everything else
# is training data.
# NOTE: the validation set is therefore class-balanced (NVALID positives +
# NVALID negatives), while the full data set is heavily skewed toward the
# negative class, so validation metrics do not reflect the natural class prior.
train_data = data_pos_tokenized[NVALID:] + data_neg_tokenized[NVALID:]
_split_rng.shuffle(train_data)
valid_data = data_pos_tokenized[:NVALID] + data_neg_tokenized[:NVALID]
_split_rng.shuffle(valid_data)

### Class Construction

In [8]:
# Re-import quora_vocab so edits made to the module are picked up without
# restarting the kernel (development convenience).
reload(qv)
# Build the vocabulary/EDA helper. NOTE: it is constructed from `valid_data`
# only (the 10,000 held-out comments), not from the full training set.
comments = qv.CommentVocab(valid_data)

Processing Comments: 100%|██████████| 10000/10000 [00:00<00:00, 60172.38it/s]


### Comparison of Question Length by Class

In [9]:
# Token counts per question for each class (every record is [qid, tokens, target]).
pos_lengths = [len(tokens) for _qid, tokens, _target in data_pos_tokenized]
neg_lengths = [len(tokens) for _qid, tokens, _target in data_neg_tokenized]

# Print summary statistics per class; the empty trailing field reproduces the
# blank line that separates the two reports.
pos_summary = pd.Series(pos_lengths).describe()
neg_summary = pd.Series(neg_lengths).describe()
print('----Length of Positive Examples-----', pos_summary, '', sep='\n')
print('----Length of Negative Examples-----', neg_summary, sep='\n')

----Length of Positive Examples-----
count    80810.000000
mean        17.425282
std          9.641367
min          1.000000
25%         10.000000
50%         15.000000
75%         23.000000
max         83.000000
dtype: float64

----Length of Negative Examples-----
count    1.225312e+06
mean     1.260752e+01
std      6.811622e+00
min      2.000000e+00
25%      8.000000e+00
50%      1.100000e+01
75%      1.500000e+01
max      1.320000e+02
dtype: float64


In [10]:
# Plot the comment-length comparison for the two classes
# (presumably rendered with Bokeh by CommentVocab; see quora_vocab to confirm).
comments.comment_length_graph()

#### Comments

* We can see from the summary statistics and the empirical distribution that the distribution of inappropriate questions has far fatter tails than the distribution of appropriate questions, demonstrating that inappropriate questions tend to be somewhat longer. This matches what one would expect from the description of the inappropriate class, which includes questions that are really statements of some position that the user holds, or just generalized trolling.

* On the other hand, one can interpret the relative shortness of appropriate questions as reflecting more concise and clearly stated questions.

### Comparison of Word Frequency By Class

In [11]:
# Word (unigram) frequency graphs, restricted to ranks 3 through 50
# (presumably skipping the very top ranks to keep the most common words from dominating; confirm in quora_vocab).
comments.word_frequency_graphs(min_rank=3,max_rank=50)

In [12]:
# Per-class difference in unigram counts for 20 words.
# `num` is passed by keyword, matching the bigram call later in the notebook,
# so the value cannot accidentally bind to a different leading parameter.
comments.word_count_difference_graph(num=20)

#### Comments

* The preceding graphs show the differences in word frequency between classes.

* One thing that the last graph shows, which one should expect, is the higher frequency, in the inappropriate class, of words that are either polarizing, e.g. 'trump', or that name groups who are often subject to claims of supremacy or inferiority depending on the prejudices of the asker, e.g. 'white', 'men', 'women', 'muslims'.

* One interesting, unexpected result is the difference in the frequency of the interrogative words used in the two classes of questions: 'what', 'how', and 'which' are more frequent in the appropriate class, while the interrogative word 'why' is far more likely in the inappropriate class.

* Perhaps the increased use of 'why' in the inappropriate class is due to the ease with which one can disguise a statement of a dubious, non-factual nature as a 'why' question. For example, the question 'why are aliens managing my local dairy queen?' presents the premise, that aliens are managing a dairy queen somewhere, as fact and implicitly requires the reader to accept the premise in order to respond directly to it. Note that questions like 'which of my local dairy queens is managed by aliens', 'how can my local dairy queen be managed by aliens', and 'is/can my local dairy queen be managed by aliens' do not require the answerer to accept the premise in order to respond.

In [13]:
# Same frequency graphs as the earlier unigram call, with unigram=False
# (bigrams, per the commentary below), again restricted to ranks 3 through 50.
comments.word_frequency_graphs(unigram=False,min_rank=3,max_rank=50)

In [14]:
# Per-class count-difference graph with unigram=False (bigrams, per the commentary below), num=20.
comments.word_count_difference_graph(unigram=False,num=20)

#### Comments

* When we examine bigrams, we find the same trends that we found in the analysis of unigrams.

* In the inappropriate examples we find a comparatively high frequency of 'why' interrogative words and phrases, which can be used to formulate a statement of dubious fact that implicitly requires the answerer to assume its premise in order to answer it.

In [15]:
import quora_ngram as ng

In [16]:
# Re-import quora_ngram so edits made to the module are picked up without
# restarting the kernel (development convenience).
reload(ng)
# Build the n-gram model from the training split
# (about 1.3M comments, per the progress bar recorded below).
ng_model = ng.NgramModel(train_data)

Processing Comments: 100%|██████████| 1296122/1296122 [00:39<00:00, 32955.87it/s]


In [17]:
# Train the classifier on bigrams (gram_length=2) with 'jelinek-mercer' smoothing
# and an 'additive' secondary smoother.
# NOTE(review): cnvx_param is presumably the interpolation weight and smth_param
# the additive constant; confirm against quora_ngram.
# WARNING: the recorded run of this cell took roughly 3h56m (see the progress bar
# below); consider caching the trained model before relying on Restart & Run All.
ng_model.train_classifier(gram_length=2,smoothing='jelinek-mercer',scnd_smoother='additive',
                          cnvx_param=0.5,smth_param=0.7)
# Debugging leftover; uncomment to inspect the model's `gram_frequency` attribute.
# ng_model.gram_frequency

Processing Gram: 100%|██████████| 2899764/2899764 [3:56:01<00:00, 204.77it/s]    


In [18]:
# Split the validation records ([qid, tokens, target]) into parallel lists of
# token sequences and labels, then score the trained classifier on them.
valid_comments = [tokens for _qid, tokens, _target in valid_data]
valid_labels = [target for _qid, _tokens, target in valid_data]
ng_model.evaluate_classifier(valid_comments, valid_labels)

------Confusion Matrix------
 [[4736  264]
 [1576 3424]]
------Report------
              precision    recall  f1-score   support

           0       0.75      0.95      0.84      5000
           1       0.93      0.68      0.79      5000

    accuracy                           0.82     10000
   macro avg       0.84      0.82      0.81     10000
weighted avg       0.84      0.82      0.81     10000

