In [1]:
import spacy
import re
import numpy as np
import pandas as pd
from collections import Counter
#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, recall_score, precision_score

from lib.processor import *

# Module-level spaCy pipeline, loaded once and reused by the helpers below.
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases (v3+) appear
# to have dropped shortcut names in favour of full package names such as
# 'en_core_web_sm' -- confirm against the installed spaCy version.
nlp = spacy.load('en')

In [2]:
data = pd.read_csv("dataset.csv")
data.head()

Unnamed: 0,text,label
0,In an interview with congressional investigato...,0
1,Getty - John Shearer / Staff \nComedian Patton...,1
2,"By Jameson Parker Election 2016 , Politics Nov...",1
3,Last updated at 16:53 GMT A helmet for cyclis...,0
4,13 Herbal Teas With Highest Antioxidants http:...,1


In [21]:
# data.text[151]

In [3]:
data_text = data.text[151]

def pre_process(data_text):
    """Lightly normalise one raw document and parse it with the spaCy pipeline.

    Args:
        data_text: the raw text of a single document.

    Returns:
        Whatever the module-level ``nlp`` pipeline returns for the cleaned
        string (a parsed spaCy document).
    """
    # Every optional transformation of preprocess_text is switched off here, so
    # only its baseline cleaning runs (presumably provided by the wildcard
    # import from lib.processor -- verify).
    normalised = preprocess_text(
        data_text,
        remove_special=False,
        stem=False,
        lemmatize=False,
        remove_stops=False,
    )

    # Drop line-break characters before parsing.
    # NOTE(review): replacing with "" (not " ") glues together words that were
    # separated only by a line break -- confirm this is intended.
    for line_break in ("\n", "\r"):
        normalised = normalised.replace(line_break, "")

    return nlp(normalised)

cleaned_text = pre_process(data_text)
cleaned_text

Email Republican presidential candidate Donald Trump has been criticized for his response as to whether hed accept 2016 election results. His exact words: Ill keep you in suspense. Several pundits think that statement cost him the election. But read more broadly, the way the subliminal mind does, and his response was brilliant, revealing his astute instincts emanating from the newly discovered unconscious super intelligence (which we all possess). His word suspense implies an unfolding drama. Indeed, the depth of this drama is immeasurable. But Trumps telling America a secret story. His story is told through his super intelligence which quick reads situations before telling what it perceives between the lines. Trumps super-intel is naturally attuned to other persons subconscious confessions. They do so in the symbolic language of protests too much denial and log in your eye projection along with other key imagery. His super-intel quick reads Hillary Clinton and the media and conveys hi

In [74]:
# data['text'] = data['text'].apply(pre_process)

In [7]:
# data

In [5]:
doc = cleaned_text

### Get POS tags

In [4]:
def get_pos_and_tag(text):
    """Collect the coarse POS label and fine-grained tag of every token.

    Args:
        text: an iterable of tokens, each exposing ``pos_`` and ``tag_``
            string attributes (e.g. a parsed spaCy document).

    Returns:
        A ``(pos_arr, tag_arr)`` tuple of 1-D NumPy string arrays, one entry
        per token, in token order. Both arrays are empty string arrays when
        ``text`` contains no tokens.
    """
    pos_labels = []
    tag_labels = []
    # Accumulate in plain lists and convert once at the end: calling np.append
    # inside the loop copies the whole array for every token (quadratic time).
    for token in text:
        pos_labels.append(token.pos_)
        tag_labels.append(token.tag_)

    # dtype=str keeps the result a string array even when there are no tokens.
    return (np.array(pos_labels, dtype=str), np.array(tag_labels, dtype=str))

In [5]:
def get_features(raw_text):
    """Compute the 12 stylometric features of one document.

    Args:
        raw_text: the original, unprocessed text of the document.

    Returns:
        A list, in this order:
        ``[total_word_count, avg_word_length, lexical_diversity,
        repetition_top, repetition_all, NNP_percent, NNPS_percent,
        noun_percent, verb_percent, part_percent, det_percent,
        unknown_or_foreign_percent]``.
        Word-level features are 0 when the document has no non-punctuation
        tokens; tag/POS shares are 0 when it has no tokens at all.
    """
    parsed = pre_process(raw_text)

    # --- word-level statistics (punctuation tokens are excluded) ---
    words = [token.text for token in parsed if not token.is_punct]
    word_counter = Counter(words)
    counts_desc = sorted(word_counter.values(), reverse=True)  # most frequent first
    total_word_count = sum(word_counter.values())

    if total_word_count == 0:
        avg_word_length = lexical_diversity = repetition_top = repetition_all = 0
    else:
        avg_word_length = sum(map(len, words)) / total_word_count
        lexical_diversity = len(word_counter) / total_word_count
        # share of all words taken up by the 20 most frequent distinct words
        repetition_top = sum(counts_desc[:20]) / total_word_count
        # every count is weighted by 1/rank (rank 1 = most frequent word)
        weighted_counts = sum(
            count / rank for rank, count in enumerate(counts_desc, start=1)
        )
        repetition_all = weighted_counts / total_word_count

    # --- tag / POS shares (denominator is every token, punctuation included) ---
    pos_arr, tag_arr = get_pos_and_tag(parsed)
    pos_counter = Counter(pos_arr)
    tag_counter = Counter(tag_arr)
    total_count = sum(tag_counter.values())  # same total for tags and POS

    def share(counter, label):
        # Fraction of tokens carrying `label`; 0 for a document with no tokens.
        if total_count == 0:
            return 0
        return counter.get(label, 0) / total_count

    tag_shares = [share(tag_counter, label) for label in ("NNP", "NNPS")]
    pos_shares = [
        share(pos_counter, label) for label in ("NOUN", "VERB", "PART", "DET", "X")
    ]

    return [
        total_word_count,
        avg_word_length,
        lexical_diversity,
        repetition_top,
        repetition_all,
        *tag_shares,
        *pos_shares,
    ]

In [14]:
# Column names, listed in exactly the order get_features returns its values.
feature_names = [
    'total_word_count', 'avg_word_length', 'lexical_diversity', 'repetition_top',
    'repetition_all', 'NNP_percent', 'NNPS_percent', 'noun_percent',
    'verb_percent', 'part_percent', 'det_percent', 'unknown_or_foreign_percent',
]

# Featurize rows 10000 onward; each element of the result is one 12-item list.
feature_rows = data.iloc[10000:].text.apply(get_features)

# Transpose the per-document lists into one column per feature. Building the
# columns by name (instead of a 12-way positional unpack) keeps every column
# present even when the slice is empty, where the unpack would raise ValueError.
dct = {
    name: [row[position] for row in feature_rows]
    for position, name in enumerate(feature_names)
}

In [15]:
new_df = pd.DataFrame(dct)

In [16]:
new_df.shape

(5679, 12)

In [17]:
new_df.head()

Unnamed: 0,total_word_count,avg_word_length,lexical_diversity,repetition_top,repetition_all,NNP_percent,NNPS_percent,noun_percent,verb_percent,part_percent,det_percent,unknown_or_foreign_percent
0,10,5.2,1.0,1.0,0.292897,0.545455,0.0,0.090909,0.0,0.0,0.090909,0.0
1,2465,4.956592,0.382556,0.345233,0.139596,0.099549,0.006386,0.182569,0.126972,0.019534,0.131104,0.000751
2,345,4.162319,0.565217,0.382609,0.119176,0.049351,0.0,0.166234,0.207792,0.031169,0.098701,0.0
3,932,5.101931,0.443133,0.335837,0.136687,0.12,0.002927,0.201951,0.149268,0.018537,0.126829,0.0
4,569,4.889279,0.56239,0.328647,0.110294,0.135937,0.009375,0.151562,0.151562,0.025,0.11875,0.0


In [18]:
new_df.to_csv("postag_output_10000_end.csv", index = False)

# Scratch work: everything below can be ignored

### Proper-noun (NNP / NNPS) percentages

In [14]:
def NNP_percent(postag_count):
    """Return the fraction of counted tags that are "NNP".

    Args:
        postag_count: mapping of tag string -> occurrence count.

    Returns:
        ``count("NNP") / total count``; 0 when the tag is absent or the
        mapping is empty (instead of raising TypeError / ZeroDivisionError).
    """
    total = sum(postag_count.values())
    if total == 0:
        return 0
    # Default to 0 so a document without this tag does not yield None / total.
    return postag_count.get("NNP", 0) / total

def NNPS_percent(postag_count):
    """Return the fraction of counted tags that are "NNPS".

    Args:
        postag_count: mapping of tag string -> occurrence count.

    Returns:
        ``count("NNPS") / total count``; 0 when the tag is absent or the
        mapping is empty (instead of raising TypeError / ZeroDivisionError).
    """
    total = sum(postag_count.values())
    if total == 0:
        return 0
    # Default to 0 so a document without this tag does not yield None / total.
    return postag_count.get("NNPS", 0) / total

144
1606
0.0896637608966376


### General version (postag)

In [88]:
def postag_percent(postag_count, postag_string):
    """Return the fraction of counted tags equal to ``postag_string``.

    Args:
        postag_count: mapping of tag string -> occurrence count.
        postag_string: the tag whose share is wanted (e.g. "NNP").

    Returns:
        ``count(postag_string) / total count``; 0 when the tag is absent or
        the mapping is empty (instead of raising TypeError / ZeroDivisionError).
    """
    total = sum(postag_count.values())
    if total == 0:
        return 0
    # Default to 0 so an unseen tag does not yield None / total.
    return postag_count.get(postag_string, 0) / total
    

### Verb count

In [114]:
def get_postag(doc):
    """Return the coarse POS label of every token in ``doc``.

    Args:
        doc: an iterable of tokens, each exposing a ``pos_`` string attribute.

    Returns:
        1-D NumPy string array with one POS label per token, in token order
        (an empty string array when ``doc`` has no tokens).
    """
    # Build the array from this function's own tokens only. The earlier loop
    # appended to `postag_arr`, a name not defined in this function, so the
    # result depended on unrelated notebook state.
    return np.array([token.pos_ for token in doc], dtype=str)
    

def verb_count(pos_arr):
    """Return the fraction of POS labels in ``pos_arr`` that are "VERB".

    Args:
        pos_arr: iterable of POS label strings, one per token.

    Returns:
        ``count("VERB") / number of labels``; 0 when there are no verbs or
        no labels at all (instead of raising TypeError / ZeroDivisionError).
    """
    label_counts = Counter(pos_arr)
    total = sum(label_counts.values())
    if total == 0:
        return 0
    # Default to 0 so input without any verb does not yield None / total.
    return label_counts.get("VERB", 0) / total

#(ver)

In [23]:
pos_arr

array(['NOUN', 'NOUN', 'PROPN', ..., 'ADJ', 'NOUN', 'PUNCT'], dtype='<U32')

### General version (pos) (less powerful version of postag)

In [24]:
def pos_count(pos_arr, pos_string):
    """Return the fraction of POS labels in ``pos_arr`` equal to ``pos_string``.

    Args:
        pos_arr: iterable of POS label strings, one per token.
        pos_string: the POS label whose share is wanted (e.g. "VERB").

    Returns:
        ``count(pos_string) / number of labels``; 0 when the label is absent
        or ``pos_arr`` is empty (instead of raising TypeError /
        ZeroDivisionError).
    """
    # Local name differs from the function name to avoid shadowing it.
    label_counts = Counter(pos_arr)
    total = sum(label_counts.values())
    if total == 0:
        return 0
    return label_counts.get(pos_string, 0) / total

### Average Word Length

In [90]:
# token.is_stop != True
words =  [token.text for token in doc if token.is_punct != True]
# print(words)

def average_word_length(words):
    """Return the mean number of characters per word.

    Args:
        words: a sized sequence of word strings.

    Returns:
        Total characters divided by the number of words; 0 for an empty
        sequence (instead of raising ZeroDivisionError).
    """
    if len(words) == 0:
        return 0
    return sum(len(word) for word in words) / len(words)

# print(words)
print(average_word_length(words))

5.136521136521137


### Average word count (mean occurrences per unique word)

In [116]:
def average_word_count(word_counter):
    """Return the mean number of occurrences per distinct word.

    Args:
        word_counter: mapping of word -> occurrence count.

    Returns:
        The NumPy mean of the counts.
    """
    occurrences = list(word_counter.values())
    return np.mean(occurrences)

# print(words)
print(average_word_count(word_counter))

2.137777777777778


### Word count 

In [117]:
def word_count(word_counter):
    """Return the total number of words, i.e. the sum of all counts.

    Args:
        word_counter: mapping of word -> occurrence count.
    """
    total = 0
    for occurrences in word_counter.values():
        total += occurrences
    return total

# print(word_count(words))
print(sum(word_counter.values()))

1443


### Number of unique words

In [112]:
def unique_words(word_counter):
    """Return the number of distinct words, i.e. the number of entries.

    Args:
        word_counter: mapping of word -> occurrence count.
    """
    distinct = len(word_counter)
    return distinct

print(unique_words(counts))

686


### Lexical Diversity

In [None]:
def lexical_diversity(word_counter):
    """Return the ratio of distinct words to total words.

    Args:
        word_counter: mapping of word -> occurrence count.

    Returns:
        ``number of distinct words / total number of words``; 0 when the
        counter holds no words (instead of raising ZeroDivisionError).
    """
    # Computed directly from the counter so this cell does not depend on
    # helper cells having been run first.
    total_words = sum(word_counter.values())
    if total_words == 0:
        return 0
    return len(word_counter) / total_words

### Sentence Count

In [26]:
list(doc.sents)

def sentence_count(doc):
    """Return how many sentences ``doc.sents`` yields.

    Args:
        doc: an object whose ``sents`` attribute is an iterable of sentences.
    """
    # Count while iterating instead of materialising every sentence in a list.
    return sum(1 for _ in doc.sents)

sentence_count(doc)

65

In [6]:
doc.sents

NameError: name 'doc' is not defined

## Split to train and validation data

In [28]:
# Features are the raw article text; the target is the label column.
x, y = data.text, data.label

# Hold out 10% of the rows for validation (test_size=0.1). The split is
# stratified on the label, and random_state is fixed so that re-running the
# notebook reproduces the same split.
x_train, x_val, y_train, y_val = train_test_split(x, y, 
                                                  stratify=y, 
                                                  random_state=0, 
                                                  test_size=0.1, shuffle=True)



In [29]:
x_train.shape

(27208,)

In [30]:
x.shape

(30232,)

In [31]:
doc = nlp(x[0])

In [32]:
# Fine-grained tag (token.tag_) of every token in `doc`, in token order.
# Built in one pass: np.append inside a loop copies the whole array on every
# iteration, and seeding with np.zeros(0) leaves a float array when `doc` is empty.
postag_arr = np.array([token.tag_ for token in doc], dtype=str)

In [33]:
postag_arr

array(['NN', 'NN', 'NNP', ..., 'JJ', 'NN', '.'], dtype='<U32')