1. Libraries

In [1]:
#Data Understanding and Preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the two CSV files into separate frames (same order as before: fake, then true).
fake, true = (pd.read_csv(csv_name) for csv_name in ('Fake.csv', 'True.csv'))



In [3]:
# Number of rows read from 'Fake.csv'.
fake.shape[0]

23481

In [4]:
# Number of rows read from 'True.csv'.
true.shape[0]

21417

In [5]:
# Tag each source with its class label: 1 for rows from `fake`, 0 for rows from `true`.
fake['label'], true['label'] = 1, 0

In [6]:
# Stack the two labelled frames. The per-file row labels are kept, so index
# values repeat across the fake and true halves.
df_concated = pd.concat([fake, true])

# Shuffle the rows with a fixed seed so that the row order, and every output
# derived from it, is identical on each Restart & Run All.
permutation = np.random.default_rng(42).permutation(len(df_concated))
df = df_concated.iloc[permutation]

# Displays the unshuffled concatenation; the shuffled frame is `df`.
df_concated


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [7]:
# Preview the frame without the 'date' column; the result is not assigned,
# so `df` itself is left unchanged.
df.drop('date', axis='columns')

Unnamed: 0,title,text,subject,label
12727,CATHOLIC BISHOP OUTRAGED Over Hillary’s ANTI-C...,Hillary s been using churches across America a...,politics,1
2768,House panel to interview former Trump adviser,"WASHINGTON (Reuters) - Michael Caputo, who was...",politicsNews,0
11498,Uzbek leader says he will curb power of state ...,TASHKENT (Reuters) - Uzbek president Shavkat M...,worldnews,0
9220,Three teenagers arrested over assaults during ...,(Reuters) - Three teenagers were arrested on W...,politicsNews,0
5049,WATCH Trump Fans B*tch And Moan About How Har...,Life is hard. We get it. Every one of us has h...,News,1
...,...,...,...,...
14061,HELPING HILLARY: What The Virginia Governor Ju...,Felons get to vote all of a sudden in Virginia...,politics,1
5929,Meg Whitman Compares Trump To Two Of The Worl...,"The GOP implosion continues, and this time Meg...",News,1
19925,Britain raises security threat level to critic...,LONDON (Reuters) - Britain lifted its national...,worldnews,0
291,Factbox: Corporate alternative minimum tax thr...,(Reuters) - The tax overhaul legislation passe...,politicsNews,0


In [8]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

209

In [9]:
# Keep only the first occurrence of every fully identical row.
df = df[~df.duplicated()]

# (rows, columns) after de-duplication.
df.shape

(44689, 5)

In [10]:
# Tally the missing values in each column.
nan_count_per_column = df.isna().sum()

print(nan_count_per_column)

title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [11]:
# Build a new frame that omits the 'date' column.
df_no_date = df.drop('date', axis='columns')

# Persist the date-free frame as a new CSV file, leaving out the row index.
df_no_date.to_csv('NoDateDataSet.csv', index=False)

In [12]:
# Inspect the frame after 'date' was removed (title, text, subject, label remain).
df_no_date

Unnamed: 0,title,text,subject,label
12727,CATHOLIC BISHOP OUTRAGED Over Hillary’s ANTI-C...,Hillary s been using churches across America a...,politics,1
2768,House panel to interview former Trump adviser,"WASHINGTON (Reuters) - Michael Caputo, who was...",politicsNews,0
11498,Uzbek leader says he will curb power of state ...,TASHKENT (Reuters) - Uzbek president Shavkat M...,worldnews,0
9220,Three teenagers arrested over assaults during ...,(Reuters) - Three teenagers were arrested on W...,politicsNews,0
5049,WATCH Trump Fans B*tch And Moan About How Har...,Life is hard. We get it. Every one of us has h...,News,1
...,...,...,...,...
14061,HELPING HILLARY: What The Virginia Governor Ju...,Felons get to vote all of a sudden in Virginia...,politics,1
5929,Meg Whitman Compares Trump To Two Of The Worl...,"The GOP implosion continues, and this time Meg...",News,1
19925,Britain raises security threat level to critic...,LONDON (Reuters) - Britain lifted its national...,worldnews,0
291,Factbox: Corporate alternative minimum tax thr...,(Reuters) - The tax overhaul legislation passe...,politicsNews,0


Let's preprocess the text by going through these steps:
* Lowercasing
* Removing Special Characters and Punctuation
* Tokenization
* Removing Stopwords
* Lemmatization (We'll prefer this over stemming as it's generally more effective for understanding the context of words).

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages needed by the preprocessing function below.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')  # tokenizer data for word_tokenize
nltk.download('stopwords')  # stopword lists (the English list is used below)
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikkelrolf/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikkelrolf/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mikkelrolf/nltk_data...


True

In [18]:
# Built once when this cell runs instead of on every call: reloading the
# stopword list and constructing a lemmatizer per row is pure overhead when the
# function is applied to a whole column.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Turn one raw text into a cleaned, space-separated string of lemmas.

    Steps, in order: lowercase, drop every character that is neither
    alphanumeric nor whitespace, tokenize, remove English stopwords, lemmatize.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example NaN or None from
        a missing cell) is treated as empty.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; '' when the
        input is not a string or nothing survives the filtering.
    """
    # Missing cells reach this function as float NaN / None, which have no
    # .lower(); return an empty document rather than raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    lowered = text.lower()

    # Remove special characters and punctuation. Characters are deleted, not
    # replaced by a space, so e.g. "u.s." collapses to "us".
    stripped = ''.join(ch for ch in lowered if ch.isalnum() or ch.isspace())

    # Tokenization
    tokens = word_tokenize(stripped)

    # Remove stopwords, then lemmatize what is left
    lemmas = [_LEMMATIZER.lemmatize(tok) for tok in tokens if tok not in _STOP_WORDS]

    # Join words back to form the cleaned text
    return ' '.join(lemmas)


# Clean every entry of the 'text' column and store the result in a new
# 'cleaned_text' column of df_no_date.
df_no_date['cleaned_text'] = df_no_date['text'].map(preprocess_text)

# Write the enriched frame to disk without the row index.
df_no_date.to_csv('Preprocessed.csv', index=False)

In [19]:
# Inspect the frame with the new 'cleaned_text' column next to the raw 'text'.
df_no_date

Unnamed: 0,title,text,subject,label,cleaned_text
12727,CATHOLIC BISHOP OUTRAGED Over Hillary’s ANTI-C...,Hillary s been using churches across America a...,politics,1,hillary using church across america backdrop p...
2768,House panel to interview former Trump adviser,"WASHINGTON (Reuters) - Michael Caputo, who was...",politicsNews,0,washington reuters michael caputo adviser pres...
11498,Uzbek leader says he will curb power of state ...,TASHKENT (Reuters) - Uzbek president Shavkat M...,worldnews,0,tashkent reuters uzbek president shavkat mirzi...
9220,Three teenagers arrested over assaults during ...,(Reuters) - Three teenagers were arrested on W...,politicsNews,0,reuters three teenager arrested wednesday accu...
5049,WATCH Trump Fans B*tch And Moan About How Har...,Life is hard. We get it. Every one of us has h...,News,1,life hard get every one u tough time one reaso...
...,...,...,...,...,...
14061,HELPING HILLARY: What The Virginia Governor Ju...,Felons get to vote all of a sudden in Virginia...,politics,1,felon get vote sudden virginia governor buddy ...
5929,Meg Whitman Compares Trump To Two Of The Worl...,"The GOP implosion continues, and this time Meg...",News,1,gop implosion continues time meg whitman faile...
19925,Britain raises security threat level to critic...,LONDON (Reuters) - Britain lifted its national...,worldnews,0,london reuters britain lifted national securit...
291,Factbox: Corporate alternative minimum tax thr...,(Reuters) - The tax overhaul legislation passe...,politicsNews,0,reuters tax overhaul legislation passed u sena...


In [20]:
# Write df_no_date to a second CSV (row index omitted); this is the file the
# vectorisation cell reads back.
# NOTE(review): despite the "NoText" file name, the raw 'text' column is still
# part of df_no_date and is written here — confirm whether it should be dropped.
df_no_date.to_csv('Preprocessed_NoText.csv', index=False)

In [21]:
# An error indicated that the dataset contained NaN values, so count the
# missing entries in 'cleaned_text' to locate them.

nan_count = df_no_date['cleaned_text'].isnull().sum()
print(f"Number of NaN values in 'cleaned_text': {nan_count}")

Number of NaN values in 'cleaned_text': 0


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the preprocessed articles written by the earlier save cell
news_df = pd.read_csv('Preprocessed_NoText.csv')

# Replace missing values BEFORE casting to str. An empty cleaned text is read
# back from CSV as NaN; casting first would turn it into the literal string
# 'nan', which fillna('') cannot catch and which the vectorizers would then
# count as a real token.
news_df['cleaned_text'] = news_df['cleaned_text'].fillna('').astype(str)

# Initialize vectorizers
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Apply Bag of Words model (returns a sparse document-term matrix)
bow_features = bow_vectorizer.fit_transform(news_df['cleaned_text'])

# Apply TF-IDF model (returns a sparse document-term matrix)
tfidf_features = tfidf_vectorizer.fit_transform(news_df['cleaned_text'])

# Wrap the BoW features in a sparse-backed DataFrame. Calling .toarray() would
# allocate a dense documents x vocabulary array, which does not fit in memory
# for a corpus of this size.
bow_df = pd.DataFrame.sparse.from_spmatrix(
    bow_features, columns=bow_vectorizer.get_feature_names_out()
)
news_df = pd.concat([news_df, bow_df], axis=1)

# Same for the TF-IDF features.
# NOTE(review): both feature frames use the vocabulary terms as column names,
# so after this concat news_df holds every term twice, and a term such as
# 'title' or 'label' would collide with a metadata column — consider prefixing
# the feature columns or keeping the feature frames separate.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_features, columns=tfidf_vectorizer.get_feature_names_out()
)
news_df = pd.concat([news_df, tfidf_df], axis=1)



# Preview the first rows of each feature frame
print(bow_df.head())
print(tfidf_df.head())


: 

In [3]:
# Inspect news_df as it currently stands in the kernel.
news_df


Unnamed: 0,title,text,subject,label,cleaned_text
0,CATHOLIC BISHOP OUTRAGED Over Hillary’s ANTI-C...,Hillary s been using churches across America a...,politics,1,hillary using church across america backdrop p...
1,House panel to interview former Trump adviser,"WASHINGTON (Reuters) - Michael Caputo, who was...",politicsNews,0,washington reuters michael caputo adviser pres...
2,Uzbek leader says he will curb power of state ...,TASHKENT (Reuters) - Uzbek president Shavkat M...,worldnews,0,tashkent reuters uzbek president shavkat mirzi...
3,Three teenagers arrested over assaults during ...,(Reuters) - Three teenagers were arrested on W...,politicsNews,0,reuters three teenager arrested wednesday accu...
4,WATCH Trump Fans B*tch And Moan About How Har...,Life is hard. We get it. Every one of us has h...,News,1,life hard get every one u tough time one reaso...
...,...,...,...,...,...
44684,HELPING HILLARY: What The Virginia Governor Ju...,Felons get to vote all of a sudden in Virginia...,politics,1,felon get vote sudden virginia governor buddy ...
44685,Meg Whitman Compares Trump To Two Of The Worl...,"The GOP implosion continues, and this time Meg...",News,1,gop implosion continues time meg whitman faile...
44686,Britain raises security threat level to critic...,LONDON (Reuters) - Britain lifted its national...,worldnews,0,london reuters britain lifted national securit...
44687,Factbox: Corporate alternative minimum tax thr...,(Reuters) - The tax overhaul legislation passe...,politicsNews,0,reuters tax overhaul legislation passed u sena...
