In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [4]:
data = pd.read_csv('../content/news.csv')

In [5]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
data['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [7]:
data.count()

Unnamed: 0    6335
title         6335
text          6335
label         6335
dtype: int64

In [8]:
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

In [9]:
train_df, test_df = train_test_split(data, test_size=2000)

In [11]:
snowball = SnowballStemmer(language="english")
russian_stop_words = stopwords.words("english")

def tokenize_sentence(sentence: str, remove_stop_words: bool = True):
    tokens = word_tokenize(sentence, language="english")
    tokens = [i for i in tokens if i not in string.punctuation]
    if remove_stop_words:
        tokens = [i for i in tokens if i not in russian_stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [None]:
tokenize_sentence(data.iloc[1]['text'])

['googl',
 'pinterest',
 'digg',
 'linkedin',
 'reddit',
 'stumbleupon',
 'print',
 'delici',
 'pocket',
 'tumblr',
 'there',
 'two',
 'fundament',
 'truth',
 'world',
 'paul',
 'ryan',
 'desper',
 'want',
 'presid',
 'and',
 'paul',
 'ryan',
 'never',
 'presid',
 'today',
 'prove',
 'in',
 'particular',
 'stagger',
 'exampl',
 'polit',
 'cowardic',
 'paul',
 'ryan',
 're-re-re-revers',
 'cours',
 'announc',
 'back',
 'trump',
 'train',
 'this',
 'aboutfac',
 'week',
 'ago',
 'he',
 'previous',
 'declar',
 'would',
 'support',
 'defend',
 'trump',
 'tape',
 'made',
 'public',
 'trump',
 'brag',
 'assault',
 'women',
 'sudden',
 'ryan',
 'appear',
 'pro-trump',
 'ralli',
 'bold',
 'declar',
 'alreadi',
 'sent',
 'vote',
 'make',
 'presid',
 'unit',
 'state',
 'it',
 'surreal',
 'moment',
 'the',
 'figurehead',
 'republican',
 'parti',
 'dose',
 'gasolin',
 'got',
 'stage',
 'chilli',
 'afternoon',
 'wisconsin',
 'lit',
 'match',
 'speakerryan',
 'say',
 'vote',
 'realdonaldtrump',
 '“',

In [12]:
tfidf = TfidfVectorizer(tokenizer =lambda x: tokenize_sentence(x, remove_stop_words=True))

In [13]:
features = tfidf.fit_transform(train_df['text'])

In [18]:
svc_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", SVC(random_state=241))
]
)
rf_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", RandomForestClassifier(n_estimators=500,max_depth=10,random_state=241))
]
)
gb_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", GradientBoostingClassifier(n_estimators=500,learning_rate=0.01,random_state=241))
]
)


In [22]:
i = 1
for model in [svc_pipeline,rf_pipeline,gb_pipeline]:
  model.fit(train_df['text'],train_df['label'])
  y_pred = model.predict(test_df['text'])
  print('Model #',i,f1_score(y_true=test_df['label'],y_pred=y_pred))
  i +=1

Model #  1 0.9356608478802991
Model #  2 0.8554973821989529
Model #  3 0.8865461847389557


In [23]:
svc_pipeline.fit(train_df['text'],train_df['label'])

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function <lambda> at 0x7fcad4730ef0>,
                                 use_idf=True, vocabulary=None)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0

In [25]:
y_pred = svc_pipeline.predict(test_df['text'])

In [27]:
precision_score(y_true=test_df['label'],y_pred=y_pred)

0.9600818833162743

In [28]:
recall_score(y_true=test_df['label'],y_pred=y_pred)

0.9124513618677043

In [29]:
f1_score(y_true=test_df['label'],y_pred=y_pred)

0.9356608478802991