In [2]:
import spacy
import re
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, recall_score, precision_score

nlp = spacy.load('en')

In [3]:
data = pd.read_csv("combined_data.csv", sep = ";")
data.head()


Unnamed: 0,text,label
0,Image copyright Getty Images\nOn Sunday mornin...,1
1,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,The feud broke into public view last week when...,1
3,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,"Country singer Jason Aldean, who was performin...",1


### WJ processed version (Note that combined_data.csv still needed for extracting type of words)

#### In my notebook, this is not used.

In [4]:
data2 = pd.read_csv("processed.csv", sep = ";")
data2.head()

Unnamed: 0,text,label
0,imag copyright getti imag On sunday morn donal...,1
1,london reuter last flag fli a comedydrama abou...,1
2,the feud break into public view last week when...,1
3,mexico citi reuter egypt cheiron hold limit wi...,1
4,In 2012 kansa lawmak lead by gov sam brownback...,1


In [5]:
data_text = data.text[0]

def pre_process1(data_text):
    ex = data_text
    ex = ex.replace("\n", " ")
    doc = nlp(ex)
    
    return doc

# pre_process1(data_text)

In [6]:
# data['text'] = data['text'].apply(pre_process1)

In [7]:
# data

In [8]:
doc = pre_process1(data_text)

# doc = data[0]
doc

Image copyright Getty Images On Sunday morning, Donald Trump went off on a Twitter tirade against a member of his own party. This, in itself, isn't exactly huge news. It's far from the first time the president has turned his rhetorical cannons on his own ranks. This time, however, his attacks were particularly biting and personal. He essentially called Tennessee Senator Bob Corker, the chair of the powerful Senate Foreign Relations Committee, a coward for not running for re-election. He said Mr Corker "begged" for the president's endorsement, which he refused to give. He wrongly claimed that Mr Corker's support of the Iranian nuclear agreement was his only political accomplishment. Unlike some of his colleagues, Mr Corker - free from having to worry about his immediate political future - didn't hold his tongue. Skip Twitter post by @SenBobCorker It's a shame the White House has become an adult day care center. Someone obviously missed their shift this morning. — Senator Bob Corker (@Se

### Get Post-tag

In [9]:
# Get Post-tag

postag_arr = np.zeros(0)
for i in doc:
    postag_arr = np.append(postag_arr, i.tag_)

In [10]:
postag_arr

array(['NN', 'NN', 'NNP', ..., 'JJ', 'NN', '.'], dtype='<U32')

In [40]:
doc

Image copyright Getty Images
On Sunday morning, Donald Trump went off on a Twitter tirade against a member of his own party.
This, in itself, isn't exactly huge news. It's far from the first time the president has turned his rhetorical cannons on his own ranks.
This time, however, his attacks were particularly biting and personal. He essentially called Tennessee Senator Bob Corker, the chair of the powerful Senate Foreign Relations Committee, a coward for not running for re-election.
He said Mr Corker "begged" for the president's endorsement, which he refused to give. He wrongly claimed that Mr Corker's support of the Iranian nuclear agreement was his only political accomplishment.
Unlike some of his colleagues, Mr Corker - free from having to worry about his immediate political future - didn't hold his tongue.
Skip Twitter post by @SenBobCorker It's a shame the White House has become an adult day care center. Someone obviously missed their shift this morning. — Senator Bob Corker (@Se

In [11]:
from collections import Counter
postag_count = Counter(([postag for postag in postag_arr]))
postag_count

Counter({"''": 16,
         ',': 47,
         '-LRB-': 1,
         '-RRB-': 1,
         '.': 51,
         ':': 9,
         'ADD': 1,
         'CC': 30,
         'CD': 8,
         'DT': 102,
         'EX': 3,
         'HYPH': 12,
         'IN': 125,
         'JJ': 65,
         'JJR': 4,
         'JJS': 1,
         'MD': 10,
         'NFP': 1,
         'NN': 182,
         'NNP': 134,
         'NNPS': 6,
         'NNS': 40,
         'PDT': 2,
         'POS': 18,
         'PRP': 42,
         'PRP$': 22,
         'RB': 70,
         'RBR': 1,
         'RBS': 1,
         'RP': 9,
         'TO': 24,
         'VB': 47,
         'VBD': 17,
         'VBG': 27,
         'VBN': 19,
         'VBP': 17,
         'VBZ': 59,
         'WDT': 7,
         'WP': 5,
         'WRB': 5,
         '``': 2})

### NNP_percent

In [12]:
def NNP_percent(postag_count):
    return postag_count.get("NNP") / sum(postag_count.values())


print(postag_count.get("NNP"))
print(sum(postag_count.values()))
print(NNP_percent(postag_count))

134
1243
0.10780370072405471


### NNPS_percent

In [13]:
def NNPS_percent(postag_count):
    return postag_count.get("NNPS") / sum(postag_count.values())


print(postag_count.get("NNPS"))
print(sum(postag_count.values()))
print(NNPS_percent(postag_count))

6
1243
0.004827031375703942


### General version (postag)

In [14]:
def postag_percent(postag_count, postag_string):
    return postag_count.get(postag_string) / sum(postag_count.values())
    

### Average Word Length

In [15]:
# token.is_stop != True
words =  [token.text for token in doc if token.is_punct != True]
print(words)

def average_word_length(words):
    count = 0
    
    for word in words:     
        count += len(word)
        
    return count / len(words)

# print(words)
print(average_word_length(words))

['Image', 'copyright', 'Getty', 'Images', 'On', 'Sunday', 'morning', 'Donald', 'Trump', 'went', 'off', 'on', 'a', 'Twitter', 'tirade', 'against', 'a', 'member', 'of', 'his', 'own', 'party', 'This', 'in', 'itself', 'is', "n't", 'exactly', 'huge', 'news', 'It', "'s", 'far', 'from', 'the', 'first', 'time', 'the', 'president', 'has', 'turned', 'his', 'rhetorical', 'cannons', 'on', 'his', 'own', 'ranks', 'This', 'time', 'however', 'his', 'attacks', 'were', 'particularly', 'biting', 'and', 'personal', 'He', 'essentially', 'called', 'Tennessee', 'Senator', 'Bob', 'Corker', 'the', 'chair', 'of', 'the', 'powerful', 'Senate', 'Foreign', 'Relations', 'Committee', 'a', 'coward', 'for', 'not', 'running', 'for', 're', 'election', 'He', 'said', 'Mr', 'Corker', 'begged', 'for', 'the', 'president', "'s", 'endorsement', 'which', 'he', 'refused', 'to', 'give', 'He', 'wrongly', 'claimed', 'that', 'Mr', 'Corker', "'s", 'support', 'of', 'the', 'Iranian', 'nuclear', 'agreement', 'was', 'his', 'only', 'politi


4.735454545454545


### Sorted Word Distribution / Word Count (sorted by most common) (useful when combined with actual pre processed text by WJ)

In [16]:
word_counter = Counter(words)
word_counter.most_common()


[('the', 50),
 ('to', 31),
 ("'s", 29),
 ('of', 24),
 ('a', 21),
 ('Mr', 21),
 ('president', 20),
 ('and', 19),
 ('his', 18),
 ('in', 18),
 ('is', 15),
 ('Corker', 14),
 ('that', 12),
 ('it', 12),
 ('Trump', 11),
 ('for', 11),
 ('he', 11),
 ('on', 9),
 ("n't", 8),
 ('has', 8),
 ('Image', 7),
 ('with', 7),
 ('The', 6),
 ('at', 6),
 ('or', 6),
 ('not', 5),
 ('have', 5),
 ('are', 5),
 ('out', 5),
 ('be', 5),
 ('been', 5),
 ('Kelly', 5),
 ('copyright', 4),
 ('off', 4),
 ('Twitter', 4),
 ('He', 4),
 ('Bob', 4),
 ('was', 4),
 ('White', 4),
 ('House', 4),
 ('an', 4),
 ('this', 4),
 ('does', 4),
 ('but', 4),
 ('now', 4),
 ('what', 4),
 ('will', 4),
 ('Getty', 3),
 ('Images', 3),
 ('own', 3),
 ('from', 3),
 ('Senate', 3),
 ('election', 3),
 ('which', 3),
 ('only', 3),
 ('some', 3),
 ('That', 3),
 ('why', 3),
 ('they', 3),
 ('know', 3),
 ('everyone', 3),
 ('caption', 3),
 ('campaign', 3),
 ('rally', 3),
 ('until', 3),
 ('can', 3),
 ('all', 3),
 ('during', 3),
 ('administration', 3),
 ('as', 3),


In [17]:
?doc.count_by

### Average word count

In [18]:
def average_word_count(word_counter):
    print(word_counter.values())
    return np.array(list(word_counter.values())).mean()

# print(words)
print(average_word_count(word_counter))

dict_values([7, 4, 3, 3, 1, 1, 2, 2, 11, 1, 4, 9, 21, 4, 1, 2, 1, 24, 18, 3, 2, 2, 18, 1, 15, 8, 1, 1, 2, 2, 29, 1, 3, 50, 1, 2, 20, 8, 1, 1, 1, 1, 1, 1, 1, 2, 1, 19, 1, 4, 1, 1, 2, 2, 4, 14, 1, 2, 3, 1, 1, 1, 1, 11, 5, 2, 2, 3, 2, 21, 1, 2, 3, 11, 1, 31, 2, 1, 1, 12, 1, 1, 2, 2, 4, 3, 2, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 4, 4, 1, 4, 1, 2, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 3, 1, 12, 2, 2, 1, 7, 1, 1, 2, 2, 1, 5, 1, 5, 1, 1, 1, 2, 1, 3, 3, 1, 2, 2, 3, 1, 5, 2, 2, 2, 4, 3, 2, 4, 1, 1, 1, 1, 1, 1, 6, 1, 2, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 6, 3, 3, 2, 1, 1, 1, 2, 1, 2, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 4, 1, 1, 2, 1, 2, 5, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1

### Word count 

In [19]:
def word_count(words):
    return len(words)

word_count(words)


1100

### Number of unique words

In [20]:
from spacy.attrs import ORTH
counts = doc.count_by(ORTH)
counts


def unique_words(counts):
    return len(counts)

print(unique_words(counts))




542


In [21]:
word_counter

Counter({"'ll": 1,
         "'re": 1,
         "'s": 29,
         "'ve": 1,
         '2016': 1,
         '2017': 1,
         '2019': 1,
         '52': 1,
         '8': 1,
         '@SenBobCorker': 2,
         'A': 1,
         'Afghanistan': 1,
         'Although': 1,
         'And': 1,
         'Bob': 4,
         'Charlottesville': 1,
         'Chief': 1,
         'Committee': 1,
         'Congress': 2,
         'Corker': 14,
         'Democrats': 1,
         'Doing': 1,
         'Donald': 2,
         'Even': 1,
         'For': 1,
         'Foreign': 1,
         'From': 1,
         'Frustration': 1,
         'Getting': 1,
         'Getty': 3,
         'Gulf': 1,
         'He': 4,
         'Here': 1,
         'House': 4,
         'I': 2,
         'If': 2,
         'Image': 7,
         'Images': 3,
         'Iran': 1,
         'Iranian': 1,
         'It': 2,
         'January': 1,
         'John': 2,
         'July': 2,
         'Kelly': 5,
         'Korea': 1,
         'Korean': 1,
    

### Verb count

In [22]:
pos_arr = np.zeros(0)
for i in doc:
    pos_arr = np.append(pos_arr, i.pos_)
    
pos_arr

def verb_count(pos_arr):
    pos_count = Counter(([pos for pos in pos_arr]))
    verb_count = pos_count.get("VERB") / sum(pos_count.values())
    # print(pos_count)
    # print(pos_count.get("VERB"))
    # print(sum(pos_count.values()))
    # print(verb_count)
    
    return verb_count

In [23]:
pos_arr

array(['NOUN', 'NOUN', 'PROPN', ..., 'ADJ', 'NOUN', 'PUNCT'], dtype='<U32')

### General version (pos) (less powerful version of postag)

In [24]:
def pos_count(pos_arr, pos_string):
    pos_count = Counter(([pos for pos in pos_arr]))
    result = pos_count.get(pos_string) / sum(pos_count.values())
    # print(pos_count)
    # print(pos_count.get("VERB"))
    # print(sum(pos_count.values()))
    # print(verb_count)
    
    return result

### Might be of importance!!! (Similarity of 2 texts function) https://spacy.io/api/doc

In [38]:
nlp1 = nlp(u"An apple a day keeps the doctor away.")
nlp2 = nlp(u"How many doctors eat apples everyday?")

nlp1.similarity(nlp2)

# How to deal with model warning?

nlp3 = nlp(u"123")
nlp4 = nlp(u"1234")

nlp3.similarity(nlp4)

  "__main__", mod_spec)
  "__main__", mod_spec)


0.8520246756820952

### Sentence Count

In [26]:
list(doc.sents)

def sentence_count(doc):
    return len(list(doc.sents))

sentence_count(doc)

65

### What is skipped for now: sensory ratio, spatial and temporal ratio, and imagery

In [27]:
### Sentence 

## Split to train and validation data

In [28]:
x, y = data.text, data.label

x_train, x_val, y_train, y_val = train_test_split(x, y, 
                                                  stratify=y, 
                                                  random_state=0, 
                                                  test_size=0.1, shuffle=True)



In [29]:
x_train.shape

(27208,)

In [30]:
x.shape

(30232,)

In [31]:
doc = nlp(x[0])

In [32]:
postag_arr = np.zeros(0)
for i in doc:
    postag_arr = np.append(postag_arr, i.tag_)

In [33]:
postag_arr

array(['NN', 'NN', 'NNP', ..., 'JJ', 'NN', '.'], dtype='<U32')

In [34]:
def NN_percent(x):
    return sum([k == "NN" for k in x])/ len(x)

In [35]:
words

['Image',
 'copyright',
 'Getty',
 'Images',
 'On',
 'Sunday',
 'morning',
 'Donald',
 'Trump',
 'went',
 'off',
 'on',
 'a',
 'Twitter',
 'tirade',
 'against',
 'a',
 'member',
 'of',
 'his',
 'own',
 'party',
 'This',
 'in',
 'itself',
 'is',
 "n't",
 'exactly',
 'huge',
 'news',
 'It',
 "'s",
 'far',
 'from',
 'the',
 'first',
 'time',
 'the',
 'president',
 'has',
 'turned',
 'his',
 'rhetorical',
 'cannons',
 'on',
 'his',
 'own',
 'ranks',
 'This',
 'time',
 'however',
 'his',
 'attacks',
 'were',
 'particularly',
 'biting',
 'and',
 'personal',
 'He',
 'essentially',
 'called',
 'Tennessee',
 'Senator',
 'Bob',
 'Corker',
 'the',
 'chair',
 'of',
 'the',
 'powerful',
 'Senate',
 'Foreign',
 'Relations',
 'Committee',
 'a',
 'coward',
 'for',
 'not',
 'running',
 'for',
 're',
 'election',
 'He',
 'said',
 'Mr',
 'Corker',
 'begged',
 'for',
 'the',
 'president',
 "'s",
 'endorsement',
 'which',
 'he',
 'refused',
 'to',
 'give',
 'He',
 'wrongly',
 'claimed',
 'that',
 'Mr',
 'C

## Tokenize and remove stop words

In [36]:


doc = nlp(ex)

words =  [token.text for token in doc if token.is_stop != True and token.is_punct != True]

NameError: name 'ex' is not defined

In [None]:
words

## Tokenize, remove stop words and lemmatize

In [None]:
temp = str()
for token in doc:
    temp += " " + token.lemma_

temp.strip()

## Tokenize, remove stop words and stem

In [None]:
import nltk
from nltk.stem.porter import *

In [None]:
stemmer = PorterStemmer()  

counter = 0
stem_words = []
for token in words: 
    stem_words.append(stemmer.stem(token))
    counter += 1
    
res_stem = str()
for i in stem_words:
    res_stem += " " + i
    
nlp(res_stem.strip())

### Get PoS Tag

In [None]:
postag_arr = np.zeros(0)
for i in doc:
    postag_arr = np.append(postag_arr, i.tag_)

### Get Common Words

In [None]:
from collections import Counter

word_freq = Counter(words)
common_words = word_freq.most_common(5)
print(common_words)

### Get NER

In [None]:
labels = set([w.label_ for w in doc.ents]) 
for label in labels: 
    entities = [e.string for e in doc.ents if label==e.label_] 
    entities = list(set(entities)) 
    print(label,entities)

## TF-IDF basic model

tf-idf on the lemmatized text.

In [None]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(x2_train))
x2_train_tfv =  tfv.transform(x2_train) 
x2_val_tfv = tfv.transform(x2_val)

In [None]:
clf = LogisticRegression(C=1.0, solver = 'liblinear')
clf.fit(x2_train_tfv, y2_train)

In [None]:
x_val_tfv = tfv.transform(x_val)

In [None]:
clf.score(x2_val_tfv, y2_val)

In [None]:
y2_pred = clf.predict(x2_val_tfv)

print("Precision: " + str(precision_score(y2_val, y2_pred)))
print("Recall: " + str(recall_score(y2_val, y2_pred)))

HAVEN'T TRY BELOW!!

In [None]:
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel

docs_dict = Dictionary(docs)
docs_dict.filter_extremes(no_below=20, no_above=0.2)
docs_dict.compactify()

docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf  = model_tfidf[docs_corpus]
docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])