**Note:** I tried several times to run `Logistic Regression` with PySpark, but most of the time it got stuck during processing. I therefore did some research and learned how to do it with scikit-learn instead. Fortunately, I got better results with scikit-learn than with the PySpark framework for this purpose.

In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
from pandas.core.series import Series

import string

import nltk
from nltk.corpus import stopwords

from numpy import ndarray

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

In [2]:
# Fetch the NLTK 'stopwords' corpus; its English list is used further down when
# the article text is cleaned. The call's return value is the cell output.
nltk.download('stopwords', quiet=True)

True

In [3]:
# Number of rows that get_random_df keeps from each source dataframe.
# It is applied to the real-news and fake-news frames separately.
NUMBER_ENTRIES_PER_DF = 100

In [4]:
def get_subsets(df: DataFrame, subset: str) -> list:
    """Return the distinct values of one column, in order of first appearance.

    Args:
        df: Dataframe to inspect; it is not modified.
        subset: Name of the column whose unique values are wanted.

    Returns:
        A plain Python list containing each distinct value of ``df[subset]``
        exactly once.

    Raises:
        KeyError: If ``subset`` is not a column of ``df``.
    """
    # Deduplicate just the requested column rather than copying every column
    # of the frame only to throw the others away.
    return df[subset].drop_duplicates().to_list()

def get_accuracy(s: Series, arr: ndarray) -> float:
    """Return the classification accuracy as a percentage.

    Args:
        s: True labels.
        arr: Predicted labels, compared against ``s`` by ``accuracy_score``.

    Returns:
        The ``accuracy_score`` result multiplied by 100 and rounded to two
        decimal places.
    """
    return round(accuracy_score(s, arr) * 100, 2)

def get_random_df(df: DataFrame, n: "int | None" = None,
                  random_state: "int | None" = None) -> DataFrame:
    """Draw a random sample of rows from ``df`` and print its ``info()`` summary.

    Args:
        df: Source dataframe; it is not modified.
        n: Maximum number of rows to keep. When omitted, the module-level
            ``NUMBER_ENTRIES_PER_DF`` is used.
        random_state: Optional seed. Pass an integer to obtain the same sample
            on every run; ``None`` keeps the sample random.

    Returns:
        A new dataframe holding at most ``n`` randomly chosen rows of ``df``
        in random order, re-indexed from 0.
    """
    if n is None:
        # Looked up at call time so editing the constant takes effect
        # without re-defining this function.
        n = NUMBER_ENTRIES_PER_DF
    # Cap at len(df): inputs smaller than ``n`` return every row instead of
    # making ``sample`` raise.
    n_df = df.sample(n=min(n, len(df)), random_state=random_state)\
        .reset_index(drop=True)
    n_df.info()
    return n_df

In [5]:
# Load the two source CSV files into separate dataframes. The paths are
# relative, so they resolve against the notebook's working directory.
t_df = pd.read_csv('../Datasets/True.csv')
f_df = pd.read_csv('../Datasets/Fake.csv')

In [6]:
# Tag every row with its class; this column is later used as the
# classification target.
t_df = t_df.assign(label='Real News')
f_df = f_df.assign(label='Fake News')

In [7]:
# Keep only a random subset of the real-news rows. get_random_df prints the
# .info() summary that appears under the heading.
# NOTE: t_df is overwritten here, so the CSV must be reloaded to resample
# from the full dataset.
print("Real News - DF info:")
t_df = get_random_df(t_df)

Real News - DF info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    100 non-null    object
 1   text     100 non-null    object
 2   subject  100 non-null    object
 3   date     100 non-null    object
 4   label    100 non-null    object
dtypes: object(5)
memory usage: 4.0+ KB


In [8]:
# Keep only a random subset of the fake-news rows. get_random_df prints the
# .info() summary that appears under the heading.
# NOTE: f_df is overwritten here, so the CSV must be reloaded to resample
# from the full dataset.
print("Fake News - DF info:")
f_df = get_random_df(f_df)

Fake News - DF info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    100 non-null    object
 1   text     100 non-null    object
 2   subject  100 non-null    object
 3   date     100 non-null    object
 4   label    100 non-null    object
dtypes: object(5)
memory usage: 4.0+ KB


In [9]:
# Stack the real and fake samples, shuffle the rows so the classes are
# interleaved, renumber the index and discard the unused 'date' column.
df = (
    shuffle(pd.concat([t_df, f_df], ignore_index=True))
    .reset_index(drop=True)
    .drop(columns=['date'])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    200 non-null    object
 1   text     200 non-null    object
 2   subject  200 non-null    object
 3   label    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [10]:
# Build the lookup structures once instead of once per word.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Stop words are stripped of punctuation as well ("don't" -> "dont") so they
# still match tokens after punctuation has been removed from the text.
_STOP_WORDS = frozenset(
    word.translate(_PUNCT_TABLE) for word in stopwords.words('english')
)


def clean_text(text: str) -> str:
    """Lower-case ``text`` and remove punctuation and English stop words.

    Args:
        text: Raw article text.

    Returns:
        The remaining words, in their original order, joined by single spaces.
    """
    tokens = text.lower().translate(_PUNCT_TABLE).split()
    # The tokens are the content and the space is the separator; calling
    # ``.join`` on the cleaned text itself would repeat the whole article
    # between every pair of words.
    return ' '.join(token for token in tokens if token not in _STOP_WORDS)


df['text'] = df['text'].map(clean_text)

In [11]:
# Print the distinct subjects, then display how many rows fall under each
# one (the groupby result is the cell's output).
print("Subjects: {}".format(get_subsets(df, 'subject')))
df.groupby(['subject'])['label'].count()

Subjects: ['politics', 'News', 'politicsNews', 'worldnews', 'left-news', 'US_News', 'Government News', 'Middle-east']


subject
Government News     5
Middle-east         2
News               39
US_News             2
left-news          16
politics           36
politicsNews       46
worldnews          54
Name: label, dtype: int64

In [12]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split repeatable for a given df, and stratifying on the label keeps the
# real/fake ratio identical in the training and test parts.
X_training, X_testing, y_training, y_testing = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label']
)

In [13]:
# Text-classification pipeline: the steps run in the order listed, each one
# feeding the next, all with default settings.
ml_pipeline = Pipeline([
    ('vect', CountVectorizer()),      # raw text -> token counts
    ('tfidf', TfidfTransformer()),    # token counts -> TF-IDF weights
    ('model', LogisticRegression())   # classifier trained on the weights
])

In [14]:
# Fit every pipeline step on the training split only.
# NOTE(review): Pipeline.fit is documented to return the pipeline itself, so
# ml_model should be the same object as ml_pipeline - confirm if that matters.
ml_model = ml_pipeline.fit(X_training, y_training)

In [15]:
# Predict a label for each held-out text.
ml_preds = ml_model.predict(X_testing)

In [16]:
# Compare the predictions with the held-out labels and report the accuracy
# as a percentage rounded to two decimals (see get_accuracy).
print("Prediction accuracy: {}%".format(get_accuracy(y_testing, ml_preds)))

Prediction accuracy: 91.67%
