In [129]:
import spacy
import re
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, recall_score, precision_score

# Load the English pipeline by its full package name. The 'en' shortcut link
# only works on spaCy 2.x (it was removed in spaCy 3), whereas the package
# name resolves on both.
# NOTE(review): assumes 'en' pointed at the default small model
# (en_core_web_sm) — confirm against the environment.
nlp = spacy.load('en_core_web_sm')

In [216]:
# Read the dataset; a comma is already the default field separator.
df = pd.read_csv("dataset.csv")

## Split into training and validation data

In [219]:
# Inputs come from the `text` column, targets from the `label` column.
x = df.text
y = df.label

# Hold out 20% of the rows for validation, stratified on the label,
# with a fixed seed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)

## Tokenize and remove stop words

In [27]:
# NOTE(review): `ex` is not defined in any visible cell — presumably a single
# document string taken from the dataset; confirm before a fresh-kernel run.
doc = nlp(ex)

# Keep the surface text of every token that is neither a stop word nor punctuation.
words = [tok.text for tok in doc if not (tok.is_stop or tok.is_punct)]

In [28]:
# Bare expression: the cell output is the full list of retained tokens.
words

['Image',
 'copyright',
 'Getty',
 'Images',
 'Sunday',
 'morning',
 'Donald',
 'Trump',
 'went',
 'Twitter',
 'tirade',
 'member',
 'party',
 'exactly',
 'huge',
 'news',
 'far',
 'time',
 'president',
 'turned',
 'rhetorical',
 'cannons',
 'ranks',
 'time',
 'attacks',
 'particularly',
 'biting',
 'personal',
 'essentially',
 'called',
 'Tennessee',
 'Senator',
 'Bob',
 'Corker',
 'chair',
 'powerful',
 'Senate',
 'Foreign',
 'Relations',
 'Committee',
 'coward',
 'running',
 'election',
 'said',
 'Mr',
 'Corker',
 'begged',
 'president',
 'endorsement',
 'refused',
 'wrongly',
 'claimed',
 'Mr',
 'Corker',
 'support',
 'Iranian',
 'nuclear',
 'agreement',
 'political',
 'accomplishment',
 'Unlike',
 'colleagues',
 'Mr',
 'Corker',
 'free',
 'having',
 'worry',
 'immediate',
 'political',
 'future',
 'hold',
 'tongue',
 'Skip',
 'Twitter',
 'post',
 '@SenBobCorker',
 'shame',
 'White',
 'House',
 'adult',
 'day',
 'care',
 'center',
 'obviously',
 'missed',
 'shift',
 'morning',
 'Se

## Tokenize and lemmatize (stop words and punctuation are kept)

In [29]:
# Concatenate the lemma of every token in `doc`, each preceded by one space.
temp = "".join(" " + tok.lemma_ for tok in doc)

# Display the lemmatized text without surrounding whitespace
# (`temp` itself keeps its leading space).
temp.strip()

'image copyright Getty Images On Sunday morning , Donald Trump go off on a Twitter tirade against a member of -PRON- own party . This , in -PRON- , be not exactly huge news . -PRON- be far from the first time the president have turn -PRON- rhetorical cannon on -PRON- own rank . This time , however , -PRON- attack be particularly biting and personal . -PRON- essentially call Tennessee Senator Bob Corker , the chair of the powerful Senate Foreign Relations Committee , a coward for not run for re - election . -PRON- say Mr Corker " beg " for the president \'s endorsement , which -PRON- refuse to give . -PRON- wrongly claim that Mr Corker \'s support of the iranian nuclear agreement be -PRON- only political accomplishment . unlike some of -PRON- colleague , Mr Corker - free from have to worry about -PRON- immediate political future - do not hold -PRON- tongue . Skip Twitter post by @SenBobCorker -PRON- be a shame the White House have become an adult day care center . Someone obviously miss

## Tokenize, remove stop words and stem

In [34]:
import nltk
from nltk.stem.porter import *

In [35]:
# Porter-stem every entry of `words` (the stop-word/punctuation-filtered tokens).
stemmer = PorterStemmer()
stem_words = [stemmer.stem(word) for word in words]

# Number of tokens that went through the stemmer.
counter = len(stem_words)

# Space-prefixed concatenation of the stems.
res_stem = "".join(" " + stem for stem in stem_words)

# Re-parse the stemmed text with spaCy; the resulting Doc is the cell output.
nlp(res_stem.strip())

imag copyright getti imag sunday morn donald trump went twitter tirad member parti exactli huge news far time presid turn rhetor cannon rank time attack particularli bite person essenti call tennesse senat bob corker chair power senat foreign relat committe coward run elect said Mr corker beg presid endors refus wrongli claim Mr corker support iranian nuclear agreement polit accomplish unlik colleagu Mr corker free have worri immedi polit futur hold tongu skip twitter post @senbobcork shame white hous adult day care center obvious miss shift morn senat bob corker @senbobcork octob 8 2017 report end spoke new york time let presid choic quot tennesse senat interview time particularli damn know presid tweet thing true know know sugarcoat Mr corker flat say presid liar know senat particular challeng Mr trump insist unsuccess plead endors accus broader Mr corker presid akin alli tennessean Mr trump short list vice presid secretari state imag copyright getti imag imag caption bob corker trum

### Get PoS Tag

In [37]:
# Collect the `tag_` (part-of-speech tag string) of every token in `doc`.
# The array is built once from a list: calling `np.append` inside the loop
# copies the whole array on every iteration (quadratic), and seeding it with
# a float array of zeros is unrelated to the string tags stored in it.
postag_arr = np.array([token.tag_ for token in doc])

### Get Common Words

In [39]:
from collections import Counter

# Tally how often each retained token occurs, then print the five most frequent
# as (token, count) pairs.
word_freq = Counter()
word_freq.update(words)
common_words = word_freq.most_common(5)
print(common_words)

[('Mr', 21), ('president', 20), ('Corker', 14), ('Trump', 11), ('Image', 7)]


### Get NER

In [233]:
# Print the distinct entity strings found in `doc`, grouped by entity label.
# `Span.string` was removed in spaCy 3; `Span.text_with_ws` yields the same
# text (trailing whitespace included) and is available on spaCy 2 as well.
labels = {ent.label_ for ent in doc.ents}
for label in labels:
    # De-duplicate the mentions carrying this label.
    entities = list({ent.text_with_ws for ent in doc.ents if ent.label_ == label})
    print(label, entities)

NORP ['Republicans ', 'Democrats ']
LOC ['Persian Gulf ']
CARDINAL ['52 ']
ORG ['Senate Foreign Relations Committee ', 'Obamacare ', 'White House ', 'Reuters Image ', 'Senate ', 'Taliban ', 'Times ', 'Skip ', 'Getty Images Image ', 'Mr Trump ', 'New York Times ', 'Congress ']
PERSON ['Nixon ', 'United Nations Mr Kelly ', 'Tennessean Mr Trump ', 'Mr Kelly ', 'Bob Corker ', 'Image ', 'Corker ', 'Time Mr Trump ', 'John Kelly ', 'Donald Trump ', 'Mr McConnell ', 'Charlottesville Mr Kelly ', 'Getty Images Image ', 'Rex Tillerson ', 'Mr Corker ', 'Getty Images ', 'Mr Trump ']
PRODUCT ['Mr Corker ']
GPE ['Qatar ', 'Tennessee ', 'Twitter ', 'Iran ', 'West Wing ', 'Venezuela ', 'Afghanistan ', 'North Korea ']
DATE ['January 2019 ', 'October 8 2017 ', 'year ', 'Sunday ', 'July 2016 day ', 'July ']
TIME ['morning ']


## TF-IDF basic model

TF-IDF on the lemmatized text.

In [175]:
# Word-level TF-IDF over 1- to 3-grams; a term must occur in at least 3 documents.
# The on/off options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject integer stand-ins such as `use_idf=1`.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# NOTE(review): x2_train / x2_val are not defined in any visible cell —
# presumably the lemmatized train/validation texts; confirm.
# The vocabulary and IDF weights are learned from the training split only;
# the validation split is transformed with that fitted vocabulary.
x2_train_tfv = tfv.fit_transform(x2_train)
x2_val_tfv = tfv.transform(x2_val)

In [176]:
# Logistic regression (liblinear solver) trained on the TF-IDF training matrix.
# The fit call stays a bare expression so the estimator repr is the cell output.
clf = LogisticRegression(solver='liblinear', C=1.0)
clf.fit(x2_train_tfv, y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [172]:
# Vectorize the `x_val` split of `df.text` with the fitted TF-IDF vocabulary.
# NOTE(review): `x_val_tfv` is not used by any visible cell — the classifier is
# scored on `x2_val_tfv`; confirm whether this cell is still needed.
x_val_tfv = tfv.transform(x_val)

In [177]:
# Score the classifier on the vectorized validation split.
# NOTE(review): the recorded output is a ValueError (187212 features per sample
# vs. the 95156 the model expects). The execution counts are out of order, so
# `tfv` appears to have been refit after `clf` was trained; re-run the TF-IDF
# and training cells in order before scoring.
clf.score(x2_val_tfv, y2_val)

ValueError: X has 187212 features per sample; expecting 95156

In [170]:
# Class predictions for the validation split.
y2_pred = clf.predict(x2_val_tfv)

# Report precision and recall of the predictions against the true labels.
print(f"Precision: {precision_score(y2_val, y2_pred)!s}")
print(f"Recall: {recall_score(y2_val, y2_pred)!s}")

Precision: 0.9444444444444444
Recall: 0.9116719242902208


NOT YET TRIED: the cells below have not been run or verified.

In [236]:
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full
from gensim.models.tfidfmodel import TfidfModel

# NOTE(review): `docs` is not defined in any visible cell — presumably a list
# of token lists, one per document; confirm before running.
docs_dict = Dictionary(docs)
# Drop tokens that appear in fewer than 20 documents or in more than 20% of
# them, then reassign ids so they are contiguous again.
docs_dict.filter_extremes(no_below=20, no_above=0.2)
docs_dict.compactify()

# Bag-of-words per document, re-weighted with TF-IDF.
docs_corpus = [docs_dict.doc2bow(tokens) for tokens in docs]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]

# Densify each sparse TF-IDF vector to vocabulary length and stack the rows
# into one (n_documents, n_terms) array. `sparse2full` must be imported from
# gensim.matutils; without that import this line raises NameError.
docs_vecs = np.vstack([sparse2full(vec, len(docs_dict)) for vec in docs_tfidf])

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress
