In [108]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

## Dataset

There are two datasets: one containing fake news articles (`Fake.csv`) and one containing genuine news articles (`True.csv`).

In [109]:
# Use forward slashes in the paths: a backslash form such as 'Dataset\Fake.csv'
# contains invalid escape sequences ('\F', '\T') and only resolves on Windows,
# whereas forward slashes work on Windows, Linux and macOS alike.
data_fake = pd.read_csv('Dataset/Fake.csv')
data_true = pd.read_csv('Dataset/True.csv')

In [110]:
# Preview the first two rows of the fake-news frame.
data_fake.head(2)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


In [111]:
# Preview the first two rows of the true-news frame.
data_true.head(2)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


## Data preprocessing

Insert new column 'class' as the target feature

In [112]:
# Target labels: 0 marks rows from the fake-news file, 1 marks rows from the true-news file.
data_fake['class'], data_true['class'] = 0, 1

In [113]:
# Confirm the new 'class' column is present on the fake-news frame.
data_fake.head(1)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0


In [114]:
# Confirm the new 'class' column is present on the true-news frame.
data_true.head(1)

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1


In [115]:
# (rows, columns) of each frame, shown side by side to compare class sizes.
data_fake.shape, data_true.shape

((23481, 5), (21417, 5))

In [116]:
# Column dtypes and non-null counts for the fake-news frame.
data_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
 4   class    23481 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 917.4+ KB


In [117]:
# Column dtypes and non-null counts for the true-news frame.
data_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
 4   class    21417 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 836.7+ KB


## Merge dataset

In [118]:
# Stack the two frames row-wise.
#   axis=0 / 'index'   -> vertically (rows appended)
#   axis=1 / 'columns' -> horizontally (columns appended)
# The original row labels of each frame are kept, so index values repeat.
news_df = pd.concat([data_fake, data_true], axis='index')
news_df.head(2)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0


In [119]:
# Last two rows of the merged frame (these come from the true-news frame).
news_df.tail(2)

Unnamed: 0,title,text,subject,date,class
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [120]:
# Column dtypes and non-null counts for the merged frame.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   class    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


## Remove irrelevant columns

In [121]:
# Keep only the article body and the label; title, subject and date are not used.
news_df = news_df.drop(columns=['title', 'subject', 'date'])

In [122]:
# Count missing values per column; all zeros means nothing needs imputing or dropping.
news_df.isnull().sum()

text     0
class    0
dtype: int64

In [123]:
# Preview the frame after dropping the unused columns.
news_df.head()

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [124]:
# Shuffle the rows so the two classes are interleaved.
#   frac         - fraction of rows to return; 1.0 returns every row, reordered
#   random_state - fixed seed, for reproducibility
news_df = news_df.sample(frac=1.0, random_state=42)

In [125]:
# After shuffling, the row labels are no longer in order.
news_df.head(3)

Unnamed: 0,text,class
22216,"21st Century Wire says Ben Stein, reputable pr...",0
4436,WASHINGTON (Reuters) - U.S. President Donald T...,1
1526,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1


In [126]:
# Column names before the index is reset.
news_df.columns

Index(['text', 'class'], dtype='object')

In [127]:
# Replace the shuffled row labels with a fresh 0..n-1 index; the old labels
# are moved into a new 'index' column.
news_df = news_df.reset_index()

In [128]:
# The old row labels now appear as an 'index' column.
news_df.head(3)

Unnamed: 0,index,text,class
0,22216,"21st Century Wire says Ben Stein, reputable pr...",0
1,4436,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,1526,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1


In [129]:
# Discard the 'index' column holding the pre-shuffle row labels.
news_df = news_df.drop(columns='index')

In [130]:
# Final shape of the frame: text and class only, with a clean 0..n-1 index.
news_df.head(3)

Unnamed: 0,text,class
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1


## Text preprocessing

text = text.lower(): This line converts all text to lowercase, which can be beneficial for standardizing the text data.

text = re.sub(r'\[.*?\]', '', text): This line uses a regular expression (re.sub) to remove text within square brackets ([...]) and the brackets themselves. The .*? pattern matches any characters (except newline) inside the square brackets in a non-greedy way.

text = re.sub(r'https?://\S+|www\.\S+', '', text): This line uses a regular expression to remove URLs starting with http:// or https:// and URLs starting with www.. \S+ matches one or more non-whitespace characters.

text = re.sub(r'<.*?>+', '', text): This line uses a regular expression to remove HTML tags. <.*?>+ matches any HTML tags (including their attributes) and removes them.

text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text): This line uses a regular expression to remove punctuation marks. string.punctuation contains a string of all punctuation characters. re.escape is used to escape these characters in the regular expression pattern.

text = re.sub(r'\n', '', text): This line removes newline characters (\n) from the text.

text = re.sub(r'\w*\d\w*', '', text): This line uses a regular expression to remove words containing digits. \w*\d\w* matches words that contain at least one digit.

In [131]:
def clean_text(text):
    """Normalise a raw article string before vectorisation.

    Steps, applied in order:
      1. lowercase the text;
      2. drop square-bracketed spans (brackets included);
      3. drop URLs starting with http://, https:// or www.;
      4. drop HTML-style tags;
      5. drop ASCII punctuation (``string.punctuation``);
      6. replace each newline with a single space;
      7. drop any word that contains a digit.

    Parameters
    ----------
    text : str
        Raw text of one article.

    Returns
    -------
    str
        The cleaned text. Removed spans are not collapsed, so runs of
        consecutive spaces may remain.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets and the brackets themselves
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs starting with http/https or www
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation marks
    # Swap newlines for a space rather than deleting them: deleting would fuse
    # the last word of one line with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing digits
    return text

In [132]:
# Run the cleaner over every article body, overwriting the 'text' column.
news_df['text'] = news_df['text'].map(clean_text)

## Vectorize text data 

Vectorizing text data is the process of converting text into numerical vectors that machine learning algorithms can understand and process.

- `X` – input data (the features, or independent variables)
- `y` – output data (the target, response, or dependent variable)

In [133]:
# Features are the cleaned article text; the target is the 0/1 class label.
X, y = news_df['text'], news_df['class']

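Perform TF-IDF (term frequency–inverse document frequency) vectorization, which weights each word by how often it appears in a document and how rare it is across all documents.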

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned text and encode every
# document as a TF-IDF row.
# NOTE(review): the vectorizer is fitted on all of X, before the train/test
# split, so held-out documents influence the vocabulary and IDF weights.
# Consider fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

## Model building

Split dataset into Train and Test

In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 25% of the articles for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

# X_train / X_test still hold raw text, which estimators cannot consume directly.
# Build the numeric features here with a vectorizer fitted on the training
# portion only, so the test articles do not leak into the vocabulary or the
# IDF weights; the test set is merely transformed with what was learned.
train_tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = train_tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = train_tfidf_vectorizer.transform(X_test)

Training data shape: (33673,) (33673,)
Testing data shape: (11225,) (11225,)


### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression