This notebook roughly follows [Detecting opinion spams and fake news ... (2017)](https://www.researchgate.net/publication/322128415_Detecting_opinion_spams_and_fake_news_using_text_classification) by copy-pasting some code from CodeBasic's [videos](https://www.youtube.com/playlist?list=PLeo1K3hjS3uuvuAXhYjV2lMEShq2UYSwX) and [notebooks](https://github.com/codebasics/nlp-tutorials/blob/main/11_bag_of_n_grams/10_bag_of_n_grams.ipynb).

<div style="color:red;">Please download the Kaggle dataset and LIAR dataset first.</div>

# Read in datasets

In [3]:
import pandas as pd

# Read in LIAR dataset columns 1 and 2 only:
liar_df = pd.read_csv('./Datasets/liar_dataset/train.tsv', sep='\t',header=None, usecols = [1,2], names=["Label", "Statement"])
liar_df.head()
liar_df['Truth'] = liar_df.Label.map({
    'pants-fire': 0,
    'false': 0,
    'mostly-false': 0,
    'half-true': 0,
    'barely-true': 1,
    'mostly-true': 1,
    'true': 1
})

# Read in Kaggle titles only. The dataframe df_kaggle has two columns: article titles, and (true/false).
kaggle_df_true = pd.read_csv('./kaggle_dataset/True.csv', usecols = [0])
kaggle_df_fake = pd.read_csv('./kaggle_dataset/Fake.csv', usecols = [0])
kaggle_df_true['Truth'] = 1
kaggle_df_fake['Truth'] = 0
kaggle_df = pd.concat([kaggle_df_true, kaggle_df_fake], ignore_index=True, sort=False)

FileNotFoundError: [Errno 2] No such file or directory: './kaggle_dataset/True.csv'

# Preprocessing

<div style="color:red;">You may need to install</div> [`spacy`](https://spacy.io/usage/). For instance, in `conda powershell`, type the following two commands:
```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [55]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens)

In [56]:
# preprocess kaggle_corpus
kaggle_corpus = kaggle_df.loc[:,'title'].tolist()
kaggle_corpus_processed = [preprocess(text) for text in kaggle_corpus]

In [64]:
# preprocess liar_corpus
liar_corpus = liar_df.loc[:,'Statement'].tolist()
liar_corpus_processed = [preprocess(text) for text in liar_corpus]

# Train-test split

In [65]:
from sklearn.model_selection import train_test_split

kaggle_X_train, kaggle_X_test, kaggle_y_train, kaggle_y_test = train_test_split(
    kaggle_corpus_processed, 
    kaggle_df.Truth, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=kaggle_df.Truth
)

liar_X_train, liar_X_test, liar_y_train, liar_y_test = train_test_split(
    liar_corpus_processed, 
    liar_df.Truth, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=None
)

# Classification training and report

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## Kaggle dataset, $n$-gram ($n=1$), Naive Bayes (NB) classifier

In [67]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),  #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(kaggle_X_train, kaggle_y_train)

#3. get the predictions for X_test and store it in y_pred
kaggle_y_pred = clf.predict(kaggle_X_test)

#4. print the classfication report
print(classification_report(kaggle_y_test, kaggle_y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      4696
           1       0.95      0.93      0.94      4284

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



## LIAR dataset, $n$-gram ($n=1$), Naive Bayes (NB) classifier

In [72]:
#2. fit with X_train and y_train
clf.fit(liar_X_train, liar_y_train)

#3. get the predictions for X_test and store it in y_pred
liar_y_pred = clf.predict(liar_X_test)

#4. print the classfication report
print(classification_report(liar_y_test, liar_y_pred))

              precision    recall  f1-score   support

           0       0.55      0.47      0.51       990
           1       0.57      0.64      0.60      1058

    accuracy                           0.56      2048
   macro avg       0.56      0.56      0.56      2048
weighted avg       0.56      0.56      0.56      2048



## Kaggle dataset, $n$-gram ($n=3$), Naive Bayes (NB) classifier

In [70]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),  #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(kaggle_X_train, kaggle_y_train)

#3. get the predictions for X_test and store it in y_pred
kaggle_y_pred = clf.predict(kaggle_X_test)

#4. print the classfication report
print(classification_report(kaggle_y_test, kaggle_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      4696
           1       0.97      0.92      0.95      4284

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980



## Kaggle dataset, TD-IDF, $k$-NN classifier

In [69]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])

#2. fit with X_train and y_train
clf.fit(kaggle_X_train, kaggle_y_train)

#3. get the predictions for X_test and store it in y_pred
kaggle_y_pred = clf.predict(kaggle_X_test)

#4. print the classfication report
print(classification_report(kaggle_y_test, kaggle_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.86      0.89      4696
           1       0.86      0.93      0.89      4284

    accuracy                           0.89      8980
   macro avg       0.89      0.89      0.89      8980
weighted avg       0.90      0.89      0.89      8980



## LIAR dataset, TD-IDF, $k$-NN classifier

In [71]:
#2. fit with X_train and y_train
clf.fit(liar_X_train, liar_y_train)


#3. get the predictions for X_test and store it in y_pred
liar_y_pred = clf.predict(liar_X_test)


#4. print the classfication report
print(classification_report(liar_y_test, liar_y_pred))

              precision    recall  f1-score   support

           0       0.55      0.47      0.51       990
           1       0.57      0.64      0.60      1058

    accuracy                           0.56      2048
   macro avg       0.56      0.56      0.56      2048
weighted avg       0.56      0.56      0.56      2048



## Kaggle dataset, TD-IDF, Random forest

In [76]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(kaggle_X_train, kaggle_y_train)


#3. get the predictions for X_test and store it in y_pred
kaggle_y_pred = clf.predict(kaggle_X_test)


#4. print the classfication report
print(classification_report(kaggle_y_test, kaggle_y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      4696
           1       0.93      0.95      0.94      4284

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



## LIAR dataset, TD-IDF, Random forest

In [75]:
#2. fit with X_train and y_train
clf.fit(liar_X_train, liar_y_train)


#3. get the predictions for X_test and store it in y_pred
liar_y_pred = clf.predict(liar_X_test)


#4. print the classfication report
print(classification_report(liar_y_test, liar_y_pred))

              precision    recall  f1-score   support

           0       0.55      0.51      0.53       990
           1       0.57      0.61      0.59      1058

    accuracy                           0.56      2048
   macro avg       0.56      0.56      0.56      2048
weighted avg       0.56      0.56      0.56      2048



You can also experiment with what happens if we run this without preprocessing the text, or if we preprocess but do not remove stop words.