# General Info

# Imports

In [3]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# EDA

In [4]:
# retrieve the data
path = r'.\data\\'

train_data = pd.read_csv(f'{path}train.csv', delimiter=",")
test_data = pd.read_csv(f'{path}test.csv', delimiter=",")

# get shapes of datasets
print(train_data.shape)
print(test_data.shape)

(20800, 5)
(5200, 4)


In [5]:
# get general overview of training data
train_data

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [10]:
# show data types and check nulls
train_data.info(verbose=True, show_counts=True)

for feature in train_data:
    null_count = train_data[feature].isna().sum()
    if null_count > 0:
        print(f'Column "{feature}" has {null_count} null values.')
if train_data.isna().sum().any() == False:
    print('No feature has any null values.')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB
Column "title" has 558 null values.
Column "author" has 1957 null values.
Column "text" has 39 null values.


# Data Preprocessing

From domain knowledge about news industry, we know that reliable articles always have transparent author(s) and are structured so they have both title (headline) and text (content). I decided to inspect labels distribution to see if missing values carry important information value.

In [11]:
# filter null rows
null_title = train_data[train_data['title'].isnull()]
null_author = train_data[train_data['author'].isnull()]
null_text = train_data[train_data['text'].isnull()]

# inspect distribution of labels in among null rows
print("Distribution of labels where title is null:")
print(null_title['label'].value_counts(normalize=True))
print("Distribution of labels where text is null:")
print(null_text['label'].value_counts(normalize=True))
print("Distribution of labels where author is null:")
print(null_author['label'].value_counts(normalize=True))


Distribution of labels where title is null:
1    1.0
Name: label, dtype: float64
Distribution of labels where text is null:
1    1.0
Name: label, dtype: float64
Distribution of labels where author is null:
1    0.986714
0    0.013286
Name: label, dtype: float64


We can see that absence of these key data indicates unreliable news. To keep this information I will impute those missing values with respective indicator.

In [12]:
# impute missing values with respective indicator
train_data['title'].fillna('missing title', inplace=True)
train_data['text'].fillna('missing text', inplace=True)
train_data['author'].fillna('missing author', inplace=True)

In [13]:
def preprocess_column(text):
    # remove punctuation and special characters
    text = re.sub(r'\W', ' ', text)
    
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # replacing multi spaces with single ones
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # converting to lowercase
    text = text.lower()
    
    # tokenization
    tokens = nltk.word_tokenize(text)
    
    return tokens

# apply preprocessing function to all string based columns
train_data[['title', 'author', 'text']] = train_data[['title', 'author', 'text']].applymap(preprocess_column)

train_data.head()

Unnamed: 0,id,title,author,text,label
0,0,"[house, dem, aide, we, didn, even, see, comey,...","[darrell, lucus]","[house, dem, aide, we, didn, even, see, comey,...",1
1,1,"[flynn, hillary, clinton, big, woman, on, camp...","[daniel, flynn]","[ever, get, the, feeling, your, life, circles,...",0
2,2,"[why, the, truth, might, get, you, fired]","[consortiumnews, com]","[why, the, truth, might, get, you, fired, octo...",1
3,3,"[15, civilians, killed, in, single, us, airstr...","[jessica, purkiss]","[videos, 15, civilians, killed, in, single, us...",1
4,4,"[iranian, woman, jailed, for, fictional, unpub...","[howard, portnoy]","[print, an, iranian, woman, has, been, sentenc...",1


In [14]:
# downloads before first run
# nltk.download('stopwords')
# nltk.download('wordnet')

# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(tokens):
    # remove stop words
    #tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # lemmatize the words
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# apply the function to 'text' column
train_data['text'] = train_data['text'].apply(remove_stopwords_and_lemmatize)

print(train_data.head())

   id                                              title  \
0   0  [house, dem, aide, we, didn, even, see, comey,...   
1   1  [flynn, hillary, clinton, big, woman, on, camp...   
2   2          [why, the, truth, might, get, you, fired]   
3   3  [15, civilians, killed, in, single, us, airstr...   
4   4  [iranian, woman, jailed, for, fictional, unpub...   

                  author                                               text  \
0       [darrell, lucus]  [house, dem, aide, we, didn, even, see, comey,...   
1        [daniel, flynn]  [ever, get, the, feeling, your, life, circle, ...   
2  [consortiumnews, com]  [why, the, truth, might, get, you, fired, octo...   
3     [jessica, purkiss]  [video, 15, civilian, killed, in, single, u, a...   
4      [howard, portnoy]  [print, an, iranian, woman, ha, been, sentence...   

   label  
0      1  
1      0  
2      1  
3      1  
4      1  


## Bag of Words vectorization

In [15]:
# initialize the CountVectorizer
vectorizer_bow = CountVectorizer()

# join list of words back to strings for the the BoW vectorizer
train_data['text_bow'] = train_data['text'].apply(' '.join)

# fit and transform the 'text' column
X_bow = vectorizer_bow.fit_transform(train_data['text_bow'])


## TF-IDF vectorization

In [16]:
# initialize the TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer()

# We'll use the same 'text_bow' column
X_tfidf = vectorizer_tfidf.fit_transform(train_data['text_bow'])

# Classification with ML models

In [17]:
# split into training and validation sets using TF-IDF vectorized data
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, train_data['label'], test_size=0.3, random_state=42)

## Naive Bayes

In [18]:
# trying different alpha values
alphas = np.arange(0, 1, 0.1)

# function for training with different alphas
def train_and_predict(alpha):

    # initialize Multinomial Naive Bayes classifier
    nb_classifier = MultinomialNB(alpha=alpha)

    # training
    nb_classifier.fit(X_train, y_train)

   # validation
    y_pred_nb = nb_classifier.predict(X_val)

    # compute classification report
    clf_report = classification_report(y_val, y_pred_nb)

    return clf_report

# iterate over the alphas and print the corresponding report
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Classification report: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Classification report:                precision    recall  f1-score   support

           0       0.84      0.95      0.89      3148
           1       0.94      0.82      0.88      3092

    accuracy                           0.89      6240
   macro avg       0.89      0.89      0.89      6240
weighted avg       0.89      0.89      0.89      6240


Alpha:  0.1




Classification report:                precision    recall  f1-score   support

           0       0.85      0.97      0.91      3148
           1       0.97      0.83      0.89      3092

    accuracy                           0.90      6240
   macro avg       0.91      0.90      0.90      6240
weighted avg       0.91      0.90      0.90      6240


Alpha:  0.2
Classification report:                precision    recall  f1-score   support

           0       0.84      0.98      0.90      3148
           1       0.98      0.80      0.88      3092

    accuracy                           0.89      6240
   macro avg       0.91      0.89      0.89      6240
weighted avg       0.90      0.89      0.89      6240


Alpha:  0.30000000000000004
Classification report:                precision    recall  f1-score   support

           0       0.82      0.98      0.89      3148
           1       0.98      0.78      0.87      3092

    accuracy                           0.88      6240
   macro avg  

## Logistic Regression

In [19]:
# initialize Logistic Regression classifier
clf_lr = LogisticRegression()

# training
clf_lr.fit(X_train, y_train)

# validation
y_pred_lr = clf_lr.predict(X_val)

# classification report
print(classification_report(y_val, y_pred_lr))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      3148
           1       0.94      0.94      0.94      3092

    accuracy                           0.94      6240
   macro avg       0.94      0.94      0.94      6240
weighted avg       0.94      0.94      0.94      6240



It's clear that the Logistic Regression model is performing better than the Naive Bayes regarding the general accuracy: It correctly identified both reliable and unreliable news articles better. Moreover, it's not just making safe predictions (which would give high precision but low recall), but it also does very well at identifying most of the actual instances of each class (high recall).

However, the results of Logistic Regression model looks suspicious and should have been inspected.