In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
from pandas.core.series import Series

import string

import nltk
from nltk.corpus import stopwords

from numpy import ndarray

import requests as rq

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

In [2]:
nltk.download('stopwords', quiet=True)

True

In [3]:
# Number of entries per dataframe
NUMBER_ENTRIES_PER_DF = 100

# API Key for NEWs API
API_KEY = "c1e1bd32547f418190f6bd1a5fa4748d"

# API URI
URI = "https://newsapi.org/v2/everything?apiKey={apiKey}&q=politics&sortBy=popularity".format(apiKey=API_KEY)

In [4]:
def get_subsets(df:DataFrame, subset:str)->type(list):
    return df.drop_duplicates(subset=[subset])[subset].to_list()

def get_accuracy(s:Series, arr:ndarray)->type(int):
    return round(accuracy_score(s, arr) * 100, 2)

def get_random_df(df:DataFrame)->type(DataFrame):
    n_df = shuffle(df.reset_index(drop=True))\
        .head(NUMBER_ENTRIES_PER_DF)\
        .reset_index(drop=True)
    n_df.info()
    return n_df

def get_remote_news()->type(dict):
    response = rq.get(URI)
    return response.json()

def get_dataframe_from_api_response(data:str, cols_ref:list)\
->type(DataFrame):
    news = []
    for article in data['articles']:
        news.append([
            article['title'], # title
            article['description'], # text
            'politicsNews', # subject
            article['publishedAt'], # date
            'News' # label
        ])
    return DataFrame(news, columns=cols_ref)

def categorize_news(accuracy:int)->type(str):
    return "likely " + ("true" if accuracy >= 50\
        else "false")

In [5]:
t_df = pd.read_csv('../Datasets/True.csv')

In [6]:
t_df['label'] = 'Real News'

In [7]:
print("Real News - DF info:")
t_df = get_random_df(t_df)

Real News - DF info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    100 non-null    object
 1   text     100 non-null    object
 2   subject  100 non-null    object
 3   date     100 non-null    object
 4   label    100 non-null    object
dtypes: object(5)
memory usage: 4.0+ KB


In [8]:
print(t_df['date'].min())
print(t_df['date'].max())

April 11, 2017 
September 9, 2017 


In [9]:
n_data = get_remote_news()

In [10]:
print("News - DF info:")
n_df = get_dataframe_from_api_response(n_data, t_df.columns.to_list())
n_df = get_random_df(n_df)

News - DF info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    20 non-null     object
 1   text     20 non-null     object
 2   subject  20 non-null     object
 3   date     20 non-null     object
 4   label    20 non-null     object
dtypes: object(5)
memory usage: 928.0+ bytes


In [11]:
df = shuffle(pd\
     .concat([t_df, n_df])\
     .reset_index(drop=True))
df = df.reset_index(drop=True)
df.drop(['date'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    120 non-null    object
 1   text     120 non-null    object
 2   subject  120 non-null    object
 3   label    120 non-null    object
dtypes: object(4)
memory usage: 3.9+ KB


In [12]:
df['text'] = df['text']\
    .map(lambda x : x.lower()\
         .translate(str\
                .maketrans('', '', string.punctuation))
         .join([word for word in x.split() if word not in stopwords.words('english')]))

In [13]:
print("Subjects: {}".format(get_subsets(df, 'subject')))
df.groupby(['subject'])['label'].count()

Subjects: ['worldnews', 'politicsNews']


subject
politicsNews    67
worldnews       53
Name: label, dtype: int64

In [14]:
X_training, X_testing, y_training, y_testing = train_test_split(
    df['text'],
    df['label']
)

In [15]:
ml_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression())
])

In [16]:
ml_model = ml_pipeline.fit(X_training, y_training)

In [17]:
ml_preds = ml_model.predict(X_testing)

In [18]:
acc = get_accuracy(y_testing, ml_preds)
print("Prediction accuracy: {}% (truthfulness rating: {})".format(acc, categorize_news(acc)))

Prediction accuracy: 76.67% (truthfulness rating: likely true)
