In [None]:
!pip install kaggle pandas
!pip install opendatasets
!pip install wget
!pip install kaggle

In [None]:
import numpy as np
from numpy.typing import NDArray
import pandas as pd
from IPython.display import display
import spacy
from scipy.sparse import issparse
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from typing import Text,List,Iterator,Iterable,Union, Tuple
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
import string
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('punkt_tab')
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from itertools import count
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
#from gensim import corpora
from sklearn.metrics import classification_report
import random
import opendatasets as od
import kagglehub
from kagglehub import KaggleDatasetAdapter

Reading data and checking the data balance and NULL values

In [82]:
# Reading data
# This is a publicly available fake and real news dataset on Kaggle.
# The dataset is downloaded with opendatasets (od); loading it through kagglehub
# is a second option that might also work if anyone prefers it.
# Disclaimer: I hold no explicit license over this data; it is used here for
# educational purposes only, not for any distribution.
DATASET_URL = "https://www.kaggle.com/datasets/nopdev/real-and-fake-news-dataset"
file_path = "/content/real-and-fake-news-dataset/news.csv"
#df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS,"nopdev/real-and-fake-news-dataset",path=file_path)

try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    # Fresh runtime: the CSV has not been downloaded yet. od.download places the
    # dataset in <data_dir>/real-and-fake-news-dataset/ (it asks for Kaggle
    # credentials), which is where file_path points.
    od.download(DATASET_URL, data_dir="/content")
    data = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the loaded DataFrame
data.head()


In [None]:
# Summarise the columns: dtypes and non-null counts (used here to spot NULL values)
data.info()

In [None]:
# Bar chart of how many rows carry each value of the "label" column,
# used to check the balance between real and fake news
data["label"].value_counts().plot.bar(
    figsize=(5, 8),
    color=["blue", "black"],
    title="posts related to fake news",
)

POS tag and NER tag processing and visualizations

In [None]:
"""Run the code sequentially to avoid unnecessary errors as the previous assists the next flow.However,fake_spacydocs and
real_spacydocs takes a while (probably more than 5 minutes), so avoid re-running this section of the code unnecessarily."""

In [69]:
# Initialize the spaCy model: load the "en_core_web_sm" pipeline once and reuse it below
nlp = spacy.load("en_core_web_sm")

In [70]:
# Partition the rows by their "label" value: FAKE articles first, REAL articles second
Fake_news, Real_news = (data[data["label"] == tag] for tag in ("FAKE", "REAL"))

In [71]:
# Create spaCy documents for each group of news.
# nlp.pipe processes the "text" column of each group; list() materialises the
# results so they can be iterated more than once. This is the slow step of the
# notebook, so avoid re-running this cell unnecessarily.
fake_spacydocs = list(nlp.pipe(Fake_news["text"]))
real_spacydocs = list(nlp.pipe(Real_news["text"]))

Extracting fake news and real news POS tags and NER tags

In [72]:
# Extract the per-token tags of one spaCy document
def extract_token_tags(doc: "spacy.tokens.Doc") -> List[Tuple[Text, Text, Text]]:
    """Return one (text, fine-grained tag, coarse POS tag) triple per token.

    Args:
        doc: A spaCy document (any iterable of tokens exposing ``text``,
            ``tag_`` and ``pos_`` works).

    Returns:
        A list of ``(token.text, token.tag_, token.pos_)`` tuples in document
        order. Note that ``tag_`` is spaCy's fine-grained part-of-speech tag,
        not a named-entity label.
    """
    return [(token.text, token.tag_, token.pos_) for token in doc]

Extracting Fake news POStags and NER tags

In [None]:
def fakenews_tags_process(fake_spacydocs):
    """Show the most frequent tokens (by POS tag) and named entities in the fake news.

    Displays the head of the 20 most frequent (token, POS tag) pairs and of the
    20 most frequent (entity text, entity label) pairs, then draws a bar chart
    of those entities.

    Args:
        fake_spacydocs: Iterable of spaCy documents built from the fake news texts.

    Returns:
        None. The results are displayed and plotted.
    """
    token_rows = []
    entity_rows = []
    for doc in fake_spacydocs:
        # extract_token_tags yields (text, tag_, pos_); keep the text and the coarse POS tag.
        token_rows.extend((row[0], row[-1]) for row in extract_token_tags(doc))
        # Named entities come from doc.ents; token.tag_ is a POS tag, not an entity label.
        entity_rows.extend((ent.text, ent.label_) for ent in doc.ents)

    # Build each frame once from flat rows instead of concatenating one frame per document.
    fake_tokens_df = pd.DataFrame(token_rows, columns=["Token", "POS_tag"])
    fake_entities_df = pd.DataFrame(entity_rows, columns=["Token", "NER_tag"])

    # Frequency of fake news POS tags
    Fake_postag_counts = (
        fake_tokens_df.groupby(["Token", "POS_tag"])
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)[:20]
    )
    display(Fake_postag_counts.head())

    # Frequency of fake news entities
    Fake_entities = (
        fake_entities_df.groupby(["Token", "NER_tag"])
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)[:20]
    )
    display(Fake_entities.head())

    # Graphical visualization of the fake news named entities
    brown_palette = ['#7c3f00', '#633200', '#562b00']
    sns.barplot(
        x="count",
        y="Token",
        hue="NER_tag",
        data=Fake_entities,
        palette=brown_palette,
        dodge=False,
    ).set(title="Most common Named Entities in Fake News data")
    plt.show()


# Calling the function for visualization
fakenews_tags_process(fake_spacydocs)

Extracting real news POS tags and NER tags

In [None]:
def realnews_tags_process(real_spacydocs):
    """Show the most frequent tokens (by POS tag) and named entities in the real news.

    Displays the head of the 20 most frequent (token, POS tag) pairs and of the
    20 most frequent (entity text, entity label) pairs, then draws a bar chart
    of those entities.

    Args:
        real_spacydocs: Iterable of spaCy documents built from the real news texts.

    Returns:
        None. The results are displayed and plotted.
    """
    token_rows = []
    entity_rows = []
    for doc in real_spacydocs:
        # extract_token_tags yields (text, tag_, pos_); keep the text and the coarse POS tag.
        token_rows.extend((row[0], row[-1]) for row in extract_token_tags(doc))
        # Named entities come from doc.ents; token.tag_ is a POS tag, not an entity label.
        entity_rows.extend((ent.text, ent.label_) for ent in doc.ents)

    # Build each frame once from flat rows instead of concatenating one frame per document.
    real_tokens_df = pd.DataFrame(token_rows, columns=["Token", "POS_tag"])
    real_entities_df = pd.DataFrame(entity_rows, columns=["Token", "NER_tag"])

    # Frequency of real news POS tags
    Real_postag_counts = (
        real_tokens_df.groupby(["Token", "POS_tag"])
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)[:20]
    )
    display(Real_postag_counts.head())

    # Frequency of real news entities
    Real_entities = (
        real_entities_df.groupby(["Token", "NER_tag"])
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)[:20]
    )
    display(Real_entities.head())

    # Graphical visualization of the real news named entities
    purple_palette = ['#9F2B68', '#BF40BF', '#CF9FFF']
    sns.barplot(
        x="count",
        y="Token",
        hue="NER_tag",
        data=Real_entities,
        palette=purple_palette,
        dodge=False,
    ).set(title="Most common Named Entities in Real News data")
    plt.show()


# Calling the function for visualization
realnews_tags_process(real_spacydocs)

Text preprocessing

In [None]:
"""This preprocess_data function plays major role in preprocessing the data extracted. It includes a helper text_clean function
 that defines how each row in the text column of the data is cleaned.
 This helper uses regular expressions, the WordNet lemmatizer and English stop words to clean the text. The cleaning function is then applied to the data."""

In [75]:
def preprocess_data(data: pd.DataFrame, text_column: str) -> pd.Series:
    """Clean, stop-word-filter and lemmatize a text column.

    Each entry of ``data[text_column]`` is reduced to lower-case ASCII letters,
    tokenized with NLTK, stripped of English stop words and lemmatized with the
    WordNet lemmatizer. The result is stored in ``data["cleaned_text"]`` (the
    input frame is modified) and that column is returned.

    Args:
        data: Frame holding the raw text.
        text_column: Name of the column in ``data`` to clean.

    Returns:
        The new ``cleaned_text`` column: one space-joined string of lemmas per row.
    """
    # A set gives constant-time membership checks in the per-token filter.
    en_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def text_clean(text) -> str:
        """Clean a single document and return its lemmas joined by spaces."""
        # Missing values become empty text; other non-strings are stringified.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        # Keep only ASCII letters and whitespace. This already removes hyphens,
        # punctuation and digits, so no further character filters are needed.
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
        # Collapse whitespace runs and trim both ends
        text = re.sub(r"\s+", " ", text).strip()
        # Tokenizing the cleaned text
        tokens = word_tokenize(text)
        # Removing stop words; the filtered list is what gets lemmatized
        kept_tokens = [token for token in tokens if token not in en_stopwords]
        # Applying lemmatization on the remaining tokens
        return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

    data["cleaned_text"] = data[text_column].apply(text_clean)
    return data["cleaned_text"]

Sentiment analysis using the VADER sentiment analyzer, which is mostly applied to sentiment analysis of social media text

In [None]:
"""This sentiment analysis class was required mainly to separate the data input of this sentiment anlayzer from the classifier.
The sentiment analyzer accepts uncleaned data. Because this class exists mainly for isolation,
the bins and labels are set at initialization; they cannot be passed in while running, so to use
different bins the user must set other bin values or edit the class code."""

In [76]:
class SentimentAnalyzer:
    """Score the raw ``text`` column of a DataFrame with VADER and plot the labels.

    The analyzer works on the frame it is given: ``analyze_sentiment`` adds the
    ``sentiment_score`` and ``vader_sentiment_label`` columns to it.
    """

    def __init__(self, data: pd.DataFrame):
        """Initializes the SentimentAnalyzer with a DataFrame that has a ``text`` column."""
        self.data = data
        self.sentiment_results = None
        self.custom_plot_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
        # Compound-score bin edges and the label assigned to each bin
        self.bins = [-1, -0.1, 0.1, 1]
        self.labels = ["Negative", "Neutral", "Positive"]
        self.vader_sentiment = SentimentIntensityAnalyzer()    # Instantiate VADER

    def analyze_sentiment(self):
        """Score every text and bucket the scores into sentiment labels.

        Returns:
            A DataFrame with the ``text``, ``sentiment_score`` and
            ``vader_sentiment_label`` columns (also kept in ``self.sentiment_results``).
        """
        scores = self.data["text"].map(
            lambda text: self.vader_sentiment.polarity_scores(text)["compound"]
        )
        self.data["sentiment_score"] = scores
        # include_lowest=True keeps a score of exactly -1 in the first bin;
        # without it the left edge is open and such rows would get a NaN label.
        self.data["vader_sentiment_label"] = pd.cut(
            scores, bins=self.bins, labels=self.labels, include_lowest=True
        )
        self.sentiment_results = self.data[['text', 'sentiment_score', 'vader_sentiment_label']]
        return self.sentiment_results

    def sentiment_distribution_visualizer(self):
        """Draw a count plot of the sentiment labels (run ``analyze_sentiment`` first)."""
        # Re-read the colour cycle so the plot follows the current matplotlib settings
        self.custom_plot_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
        sns.countplot(
            x="vader_sentiment_label", data=self.data, palette=self.custom_plot_colors
        ).set(title="Sentiment Distribution")
        plt.show()

    def sentiment_valuecount_visualizer(self):
        """Draw a bar chart of the label value counts (run ``analyze_sentiment`` first)."""
        self.custom_plot_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
        self.data['vader_sentiment_label'].value_counts().plot(kind='bar', color=self.custom_plot_colors)
        plt.show()


Text classifier using TFIDF vectorizer and Logistic regression

In [None]:
"""This text classifier takes cleaned text from the preprocess_data function cleaned using regular expression and lemmatized using lematizer.
Hence, before using this class, make sure to run the preprocess_data function above."""

In [77]:
class TextClassifier:
    """TF-IDF + logistic-regression classifier for cleaned news text."""

    def __init__(self):
        """Creates the label encoder, the TF-IDF vectorizer and the model."""
        self.le = LabelEncoder()
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.9, min_df=5,
                                          stop_words=None,
                                          sublinear_tf=True)
        self.model = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
        self.y = []  # encoded labels, set by train_classifier
        self.X = []  # cleaned text, set by train_classifier

    def train_classifier(self, data: pd.Series, label: pd.Series):
        """Trains the classifier on cleaned text and prints its evaluation.

        The data is split 80/20 with a fixed random state; the vectorizer and
        the model are fitted on the training part and evaluated on the test
        part. The accuracy and the classification report are printed.

        Args:
            data: Cleaned documents, one string per item.
            label: Class label of each document, aligned with ``data``.
        """
        self.X = data
        self.y = self.le.fit_transform(label)
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )
        # Each item is already one cleaned string, so it is vectorized as is.
        X_tvec = self.vectorizer.fit_transform(X_train)
        X_ttest = self.vectorizer.transform(X_test)
        self.model.fit(X_tvec, y_train)
        # predict based on the segregated test data
        y_pred = self.model.predict(X_ttest)
        assert y_test.shape == y_pred.shape
        # accuracy score from the model
        accuracy = self.model.score(X_ttest, y_test)
        print(f'Model accuracy: {accuracy:.2f}')
        # The report includes precision, recall and F1 for every class
        print(classification_report(y_test, y_pred))


In [None]:
"""Incase if you experience any variations in classification results it might be helpful to apply what is commented here.
However, for this dataset it is unlikely for that to happen."""
#random.seed(0)
#np.random.seed(0)

Applying the sentiment analyzer and Text classifier on the fake news data

In [None]:
if __name__ == "__main__":
    # End-to-end run: load the data, analyze sentiment on the raw text,
    # then clean the text and train the classifier.

    # Reading data
    # This is a publicly available fake and real news dataset on Kaggle
    file_path = "/content/real-and-fake-news-dataset/news.csv"
    data = pd.read_csv(file_path)

    # The class labels live in the "label" column of the data file.
    label_column = data["label"]
    # Sentiment analysis works on the raw (uncleaned) text
    sentiment_analyzer = SentimentAnalyzer(data)

    # Analyze the entire DataFrame
    sentiment_results = sentiment_analyzer.analyze_sentiment()
    print("Sentiment Analysis Results")
    print(sentiment_results)

    # Plot sentiment value counts and distribution
    sentiment_analyzer.sentiment_valuecount_visualizer()
    sentiment_analyzer.sentiment_distribution_visualizer()
    # Preprocessing the text from the data
    cleaned_text_col = preprocess_data(data,"text")

    # Classifier: train and evaluate on the cleaned text
    fakenews_classifier = TextClassifier()
    fakenews_classifier.train_classifier(cleaned_text_col,label_column)