In [71]:
import os
import string

import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import gensim
import nltk
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from textblob import TextBlob

from ydata_profiling import ProfileReport
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk import tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from IPython.display import Image
from spacy import displacy
from transformers import pipeline
%matplotlib inline

In [39]:
import warnings

warnings.filterwarnings('ignore')

In [40]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,text,clean_title,date,label,clean_text,hard_clean_text
0,The head of a conservative Republican faction ...,us budget fight looms republicans flip fiscal...,2017-12-31,1,the head of a conservative republican faction ...,head conservative republican faction us congr...
1,Transgender people will be allowed for the fir...,us military accept transgender recruits monday...,2017-12-29,1,transgender people will be allowed for the fir...,transgender people allowed first time enlist u...
2,The special counsel investigation of links bet...,senior us republican senator 'let mr mueller job',2017-12-31,1,the special counsel investigation of links bet...,special counsel investigation links russia pr...
3,Trump campaign adviser George Papadopoulos tol...,fbi russia probe helped australian diplomat ti...,2017-12-30,1,trump campaign adviser george papadopoulos tol...,trump campaign adviser george papadopoulos tol...
4,President Donald Trump called on the U.S. Post...,trump wants postal service charge 'much ' amaz...,2017-12-29,1,president donald trump called on the us postal...,president donald trump called us postal servic...
...,...,...,...,...,...,...
44599,21st Century Wire says As 21WIRE reported earl...,mcpain john mccain furious iran treated us sai...,2016-01-16,0,21st century wire says as 21wire reported earl...,21st century wire says 21wire reported earlier...
44600,21st Century Wire says It s a familiar theme. ...,justice yahoo settles email privacy classactio...,2016-01-16,0,21st century wire says it s a familiar theme w...,21st century wire says familiar theme whenever...
44601,Patrick Henningsen 21st Century WireRemember ...,sunnistan us allied safe zone plan take territ...,2016-01-15,0,patrick henningsen 21st century wireremember w...,patrick henningsen 21st century wireremember o...
44602,21st Century Wire says Al Jazeera America will...,blow 700 million al jazeera america finally c...,2016-01-14,0,21st century wire says al jazeera america will...,21st century wire says al jazeera america go h...


In [41]:
train, test = train_test_split(df, test_size=.2)

In [42]:
print(train.shape)
train.head()

(35683, 6)


Unnamed: 0,text,clean_title,date,label,clean_text,hard_clean_text
19325,The United Nations launched a roadmap on Wedne...,unveiling new libya plan un sees opportunity p...,2017-09-20,1,the united nations launched a roadmap on wedne...,united nations launched roadmap wednesday ren...
36476,Remember the promise of a fence on our souther...,heck happened 2006 secure border fence act,2015-08-16,0,remember the promise of a fence on our souther...,remember promise fence southern border yes pla...
16650,Czech billionaire businessman Andrej Babis s A...,billionaire' ano party holding big lead czech ...,2017-10-21,1,czech billionaire businessman andrej babis s a...,czech billionaire businessman andrej babis ano...
20968,"The Syrian Democratic Forces (SDF), an allianc...",usbacked forces syria' raqqa say take old city,2017-09-01,1,the syrian democratic forces sdf an alliance o...,syrian democratic forces sdf alliance kurdish...
23137,His accusation that President Obama wiretapped...,trump makes absolute fool april fools day ame...,2017-04-01,0,his accusation that president obama wiretapped...,accusation president obama wiretapped thoroug...


In [43]:
print(test.shape)
test.head()

(8921, 6)


Unnamed: 0,text,clean_title,date,label,clean_text,hard_clean_text
22562,Trump is not the only U.S. leader in Europe ri...,politico threw major shade trump merkels meet...,2017-05-25,0,trump is not the only us leader in europe righ...,trump us leader europe right former president ...
18237,The United States will soon decide whether to ...,us decide soon future taliban office qatar,2017-10-03,1,the united states will soon decide whether to ...,united states soon decide whether keep open t...
443,"U.S. Senator Tom Cotton, a hawkish Iraq war ve...",factbox five facts tom cotton trump' likely pi...,2017-11-30,1,us senator tom cotton a hawkish iraq war veter...,us senator tom cotton hawkish iraq war veteran...
18167,British Prime Minister Theresa May said on Wed...,uk pm may must fight political mainstream,2017-10-04,1,british prime minister theresa may said on wed...,british prime minister theresa may said wednes...
2588,The Kremlin said on Monday it was worried that...,kremlin frets us sanctions may hurt european p...,2017-07-24,1,the kremlin said on monday it was worried that...,kremlin said monday worried proposed new us s...


In [99]:
def train_pipeline(model, vectorizer, data, labels, metric):
    all_train_preds = []
    all_test_preds = []
    all_train_targets = []
    all_test_targets = []
    folder = StratifiedKFold(shuffle=False)
    folder = folder.split(data, labels)
    for fold_id, (train_ids, test_ids) in enumerate(folder):
        pipeline = Pipeline(
            [
                ("vectorizer", vectorizer[0](*vectorizer[1], **vectorizer[2])),
                ("lof_reg", model[0](*model[1], **model[2]))
            ]
        )
        # with open('test.txt', 'a') as f:
        #     f.write(f'Iteration {fold_id}:\nTrain: {train_ids}\n\nTest: {test_ids}\n\n')
        pipeline.fit(data.iloc[train_ids], labels.iloc[train_ids])
        # First channel refers to Negative class and second channel refers to Postive class
        fold_train_preds = pipeline.predict(data.iloc[train_ids])
        fold_test_preds = pipeline.predict(data.iloc[test_ids])
        
        fold_train_targets = labels.iloc[train_ids].values
        fold_test_targets = labels.iloc[test_ids].values
        print(
            f"Fold {fold_id}\n"
            f"Train {metric.__name__} score = {metric(fold_train_targets, fold_train_preds)}\n"
            f"Test {metric.__name__} score = {metric(fold_test_targets, fold_test_preds)}"
        )
        all_train_preds.append(fold_train_preds)
        all_test_preds.append(fold_test_preds)
        all_train_targets.append(fold_train_targets)
        all_test_targets.append(fold_test_targets)
    
    all_train_preds = np.concatenate(all_train_preds)
    all_test_preds = np.concatenate(all_test_preds)
    all_train_targets = np.concatenate(all_train_targets)
    all_test_targets = np.concatenate(all_test_targets)
    
    print(f"\n\nOOF Train Score: {metric(all_train_targets, all_train_preds)}")
    print(f"OOF Test Score: {metric(all_test_targets, all_test_preds)}")

**Clean bag of words**

In [46]:
train_pipeline(
    (LogisticRegression, (), {}),
    (CountVectorizer, (), {'stop_words': []}),
    df['clean_text'],
    df['label'],
    accuracy_score
)

Fold 0
Train accuracy_score score = 0.9998598772524732
Test accuracy_score score = 0.9827373612823674
Fold 1
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9859881179239995
Fold 2
Train accuracy_score score = 0.9999719754504947
Test accuracy_score score = 0.9789261293576953
Fold 3
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9822889810559354
Fold 4
Train accuracy_score score = 0.999915928707544
Test accuracy_score score = 0.9781390134529148

OOF Test Score: 0.9816159985651511
OOF Train Score: 0.9999495560936239


**Hard clean bag of words**

In [47]:
train_pipeline(
    (LogisticRegression, (), {}),
    (CountVectorizer, (), {'stop_words': []}),
    df['clean_text'],
    df['label'],
    accuracy_score
)

Fold 0
Train accuracy_score score = 0.9998598772524732
Test accuracy_score score = 0.9827373612823674
Fold 1
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9859881179239995
Fold 2
Train accuracy_score score = 0.9999719754504947
Test accuracy_score score = 0.9789261293576953
Fold 3
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9822889810559354
Fold 4
Train accuracy_score score = 0.999915928707544
Test accuracy_score score = 0.9781390134529148

OOF Test Score: 0.9816159985651511
OOF Train Score: 0.9999495560936239


**Clean tf-idf**

In [48]:
train_pipeline(
    (LogisticRegression, (), {}),
    (TfidfVectorizer, (), {'stop_words': []}),
    df['clean_text'],
    df['label'],
    accuracy_score
)

Fold 0
Train accuracy_score score = 0.9896589412325197
Test accuracy_score score = 0.9785898441878713
Fold 1
Train accuracy_score score = 0.9887341310988426
Test accuracy_score score = 0.9795986996973434
Fold 2
Train accuracy_score score = 0.9903315304206485
Test accuracy_score score = 0.9698464297724471
Fold 3
Train accuracy_score score = 0.9894907939354874
Test accuracy_score score = 0.970182714942271
Fold 4
Train accuracy_score score = 0.9883140903486156
Test accuracy_score score = 0.9720852017937219

OOF Test Score: 0.9740606223657071
OOF Train Score: 0.9893058918482648


**Hard clean tf-idf**

In [49]:
train_pipeline(
    (LogisticRegression, (), {}),
    (TfidfVectorizer, (), {'stop_words': []}),
    df['hard_clean_text'],
    df['label'],
    accuracy_score
)

Fold 0
Train accuracy_score score = 0.9882857383067567
Test accuracy_score score = 0.9757874677726712
Fold 1
Train accuracy_score score = 0.9876411736681333
Test accuracy_score score = 0.9754511826028472
Fold 2
Train accuracy_score score = 0.9894627693859821
Test accuracy_score score = 0.9615513955834548
Fold 3
Train accuracy_score score = 0.9888182047473587
Test accuracy_score score = 0.9655868176213429
Fold 4
Train accuracy_score score = 0.9875854724806636
Test accuracy_score score = 0.9704035874439462

OOF Test Score: 0.969756075688279
OOF Train Score: 0.9883586673840911


  - Bag of words (clean text): 0.9823
  - Bag of words (hard clean text): 0.9823
  - TF-IDF Vectorizer (clean text): 0.9745
  - TF-IDF Vectorizer (hard clean text): 0.96952

In [None]:
train_pipeline(
    (LogisticRegression, (), {'penalty': None}),
    (CountVectorizer, (), {'stop_words': None}),
    df['clean_text'],
    df['label'],
    accuracy_score
)

In [100]:
train_pipeline(
    (LogisticRegression, (), {'penalty': 'l2'}),
    (CountVectorizer, (), {'stop_words': None}),
    df['clean_title'] + " " + df['clean_text'],
    df['label'],
    accuracy_score
)

Fold 0
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9820647909427195
Fold 1
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9837462167918395
Fold 2
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9793745095841273
Fold 3
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9836341217352315
Fold 4
Train accuracy_score score = 1.0
Test accuracy_score score = 0.9752242152466367


OOF Train Score: 1.0
OOF Test Score: 0.9808088960631334


In [64]:
vectorizer1 = CountVectorizer()
vectorizer2 = CountVectorizer()
vectorizer1.fit(train['clean_text'])
vectorizer2.fit(train['clean_title'])

In [73]:
vectorizer1.transform(train['clean_text'])

<35683x199211 sparse matrix of type '<class 'numpy.int64'>'
	with 7439648 stored elements in Compressed Sparse Row format>

In [74]:
text = vectorizer1.transform(train['clean_text'])
title = vectorizer2.transform(train['clean_title'])
X = scipy.sparse.hstack([text, title])
y = train['label']

In [96]:
model = LogisticRegression()
model.fit(X, y)

In [97]:
text = vectorizer1.transform(test['clean_text'])
title = vectorizer2.transform(test['clean_title'])
X = scipy.sparse.hstack([text, title])
y = test['label']
model.score(X, y)

0.9919291559242237

In [111]:
former = SentenceTransformer('all-MiniLM-L12-v2', device="cuda")

Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [112]:
text = former.encode(df['clean_text'].tolist(), batch_size=256)
text

array([[ 0.03123388,  0.01750667, -0.0214767 , ..., -0.03805789,
         0.09431024,  0.03504995],
       [-0.05542544, -0.05065586, -0.02255202, ..., -0.02224375,
         0.10659005,  0.01045702],
       [-0.03618498, -0.01664655, -0.0502505 , ...,  0.00197449,
         0.0562633 , -0.03363869],
       ...,
       [ 0.03675877, -0.03895299, -0.07961769, ...,  0.04909951,
        -0.00211462, -0.06433941],
       [-0.00145515, -0.07306355, -0.07079151, ..., -0.03843977,
        -0.03805653, -0.05835051],
       [-0.01367379,  0.00102733, -0.07909339, ...,  0.01608275,
         0.01559997, -0.03030417]], dtype=float32)

In [113]:
title = former.encode(df['clean_title'].tolist(), batch_size=256)
title

array([[-8.5231252e-02,  9.5730238e-03, -6.3785076e-02, ...,
         1.7937198e-02,  7.7072091e-02, -8.3483867e-02],
       [-2.8596468e-02, -4.4652488e-02, -5.3099152e-02, ...,
        -1.0892407e-02,  5.9771994e-03, -7.7327132e-02],
       [ 1.9150501e-05,  8.9656683e-03, -4.5402136e-02, ...,
         8.7244935e-02,  5.8001060e-02, -7.5248621e-02],
       ...,
       [ 4.3863486e-02, -1.6502917e-02, -1.0250830e-01, ...,
        -5.6485575e-02, -5.3468879e-02, -1.0894861e-01],
       [ 3.3624991e-04,  4.6417769e-03, -4.0675271e-02, ...,
        -2.2161683e-02, -7.7746332e-02, -2.4693478e-02],
       [ 4.0371191e-02,  1.9496085e-02, -8.2294419e-02, ...,
         5.5840101e-02,  2.3495596e-02, -1.4319169e-02]], dtype=float32)

In [114]:
X = np.concatenate([text, title], axis=1)

In [115]:
y = df['label'].to_numpy()

In [116]:
X.shape, y.shape

((44604, 768), (44604,))

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = LogisticRegression(penalty=None)
model.fit(X_train, y_train)

In [118]:
model.score(X_train, y_train)

0.9802708277284549

In [119]:
model.score(X_test, y_test)

0.967984934086629

In [None]:
del former