1. Libraries

In [1]:
#Data Understanding and Preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')



In [3]:
len(fake)
len(true)

21417

In [4]:
fake['label'] = 1
true['label']=0

df_concated = pd.concat([fake, true])
permutation = np.random.permutation(len(df_concated))
df = df_concated.iloc[permutation]
df_concated


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [5]:
df.duplicated().sum()

209

In [6]:
df = df.drop_duplicates()

df.shape

(44689, 5)

In [7]:
nan_count_per_column = df.isnull().sum()

print(nan_count_per_column)

title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [8]:
df_no_date = df.drop(columns=['date'])

df_no_date

Unnamed: 0,title,text,subject,label
17370,HOW A SINGLE FEDERAL BUREAUCRAT OPENED THE DOO...,Meet Alice Miller who single-handedly scr*wed ...,Government News,1
18309,'It's time to talk': EU again urges dialogue i...,STRASBOURG (Reuters) - The EU executive called...,worldnews,0
6005,Trump weighs next executive orders on national...,ABOARD AIR FORCE ONE (Reuters) - U.S. Presiden...,politicsNews,0
10532,U.S. senators urge Obama to push for female U....,UNITED NATIONS/WASHINGTON (Reuters) - Seven U....,politicsNews,0
6329,Senate intelligence panel to probe Russia hacking,WASHINGTON (Reuters) - The leaders of the Sena...,politicsNews,0
...,...,...,...,...
1659,Factbox: Trump on Twitter (Sept 20) - Graham-C...,The following statements were posted to the ve...,politicsNews,0
8977,Celebrity Gun Nut Threatens To Shoot His Girl...,A man known primarily for posting videos of hi...,News,1
485,Trump Tried To Distance Himself From His ‘Sen...,"Remember the name Felix Sater, because it s go...",News,1
11056,BRAVO! TED CRUZ To Introduce Bill To Help Trum...,Now one unlikely Senator is about to put forth...,politics,1


Lets try to make text Preprocessing, where are going through these steps: 
* Lowercasing
* Removing Special Characters and Punctuation
* Tokenization
* Removing Stopwords
* Lemmatization (We'll prefer this over stemming as it's generally more effective for understanding the context of words).

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')  # for stopwords
nltk.download('wordnet')  # for lemmatization



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikkelrolf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikkelrolf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mikkelrolf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and punctuation
    text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

def preprocess_subject(subject):
    # Convert text to lowercase
    subject = subject.lower()

    # Remove special characters and punctuation
    subject = ''.join(char for char in subject if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(subject)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

def preprocess_title(title):
    # Convert text to lowercase
    title = title.lower()

    # Remove special characters and punctuation
    title = ''.join(char for char in title if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(title)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

# Assuming 'df' is your DataFrame and it contains a column named 'text'
df_no_date['cleaned_text'] = df_no_date['text'].apply(preprocess_text)
df_no_date['cleaned_subject'] = df_no_date['subject'].apply(preprocess_subject)
df_no_date['cleaned_title'] = df_no_date['title'].apply(preprocess_title)


In [11]:
df_no_date.drop(columns=['title', 'subject', 'text'], inplace=True)


In [12]:
df_no_date

Unnamed: 0,label,cleaned_text,cleaned_subject,cleaned_title
17370,1,meet alice miller singlehandedly scrwed legal ...,government news,single federal bureaucrat opened door let fore...
18309,0,strasbourg reuters eu executive called wednesd...,worldnews,time talk eu urge dialogue spain
6005,0,aboard air force one reuters u president donal...,politicsnews,trump weighs next executive order national sec...
10532,0,united nationswashington reuters seven u woman...,politicsnews,u senator urge obama push female un secretaryg...
6329,0,washington reuters leader senate intelligence ...,politicsnews,senate intelligence panel probe russia hacking
...,...,...,...,...
1659,0,following statement posted verified twitter ac...,politicsnews,factbox trump twitter sept 20 grahamcassidy bi...
8977,1,man known primarily posting video making sound...,news,celebrity gun nut threatens shoot girlfriend g...
485,1,remember name felix sater going come lot rober...,news,trump tried distance senior advisor promised r...
11056,1,one unlikely senator put forth bill paving way...,politics,bravo ted cruz introduce bill help trump keep ...


In [13]:
df_no_date.to_csv('Preprocessed.csv', index=False)

In [23]:
#i got an error that indicated that i had NaN in my dataset, thats why i have the code below to identify NaN

nan_count = df_no_date['cleaned_text'].isna().sum()
print(f"Number of NaN values in 'cleaned_text': {nan_count}")

Number of NaN values in 'cleaned_text': 0


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load your DataFrame
news_df = pd.read_csv('Preprocessed_NoText.csv')

# Ensure all text data are strings and handle any NaN values
news_df['cleaned_text'] = news_df['cleaned_text'].astype(str).fillna('')

# Initialize vectorizers
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Apply Bag of Words model
bow_features = bow_vectorizer.fit_transform(news_df['cleaned_text'])

# Apply TF-IDF model
tfidf_features = tfidf_vectorizer.fit_transform(news_df['cleaned_text'])

# Convert BoW features into a DataFrame
bow_df = pd.DataFrame(bow_features.toarray(), columns=bow_vectorizer.get_feature_names_out())

# Convert TF-IDF features into a DataFrame
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:

news_df = pd.concat([news_df, bow_df], axis=1)

# Save or print your DataFrames
news_df.to_csv('news.csv', index=False) 

In [6]:

news_df = pd.concat([news_df, tfidf_df, bow_df], axis=1)

# Save or print your DataFrames
news_df.to_csv('news.csv', index=False) 

: 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load your DataFrame
ngram_df = pd.read_csv('Preprocessed_NoText.csv')

# Initialize a CountVectorizer for generating bigrams
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))  # Set ngram_range to generate bigrams

# Apply bigrams model
bigram_features = bigram_vectorizer.fit_transform(news_df['cleaned_text'])

# Convert bigram features into a DataFrame
bigram_df = pd.DataFrame(bigram_features.toarray(), columns=bigram_vectorizer.get_feature_names_out())

# Concatenate the bigram DataFrame with the original news_df DataFrame
news_df = pd.concat([news_df, bigram_df], axis=1)

# Display the updated DataFrame with bigram features
print(news_df.head())