In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
# Load the fake/real news CSV into a DataFrame
df = pd.read_csv(
    '/content/fake_real.csv',
    sep=',',
    encoding='utf-8',
    quotechar='"',
)


In [16]:
# Preview the top five rows of the loaded DataFrame
df.head(n=5)

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [17]:
# Drop all columns except 'text' and 'type'.
# The explicit .copy() makes the result an independent DataFrame, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
df = df[['text', 'type']].copy()

# Verify the resulting DataFrame
df.head()

Unnamed: 0,text,type
0,WASHINGTON (Reuters) - The head of a conservat...,True
1,WASHINGTON (Reuters) - Transgender people will...,True
2,WASHINGTON (Reuters) - The special counsel inv...,True
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True


Gensim and spaCy are primarily NLP libraries that focus on natural language processing tasks such as text preprocessing, feature extraction, and linguistic analysis.

1. **In general**, **spaCy** may have slightly longer processing times than **NLTK** due to its more comprehensive functionality and language-model loading.
2. If we prioritize speed, efficiency, and accuracy in text processing, spaCy may be the better choice.
3. However, if we require more flexibility, customization, and a wide range of NLP tools, **NLTK** might be a better fit.

**Parallelization**

In [None]:
import spacy
import re
import string

# Load spaCy model.
# Only tokenization, lemmatization and lexical flags are used in this notebook,
# so the dependency parser and the named-entity recognizer are disabled to
# avoid running them on every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(text):
    """Clean, lemmatize and stop-word-filter a single document.

    Steps, in order:
      1. lowercase and expand ``n't`` contractions,
      2. strip punctuation, special characters and digits,
      3. tokenize and lemmatize with the module-level spaCy pipeline ``nlp``,
      4. drop spaCy's default English stop words,
      5. join the surviving lemmas with single spaces.

    Args:
        text: The raw document. Non-string values (for example the NaN that
            pandas produces for an empty CSV field) are treated as an empty
            document.

    Returns:
        str: The processed text; an empty string when nothing survives.
    """
    # Function to normalize text (handle contractions, convert to lowercase)
    def normalize_text(text):
        # Convert text to lowercase
        text = text.lower()
        # Map the typographic apostrophe to a plain one so the patterns match
        text = text.replace("\u2019", "'")
        # Irregular contractions first: the generic rule below would turn
        # "can't" into "ca not" and "won't" into "wo not"
        text = re.sub(r"\bcan't", "can not", text)
        text = re.sub(r"\bwon't", "will not", text)
        text = re.sub(r"\bshan't", "shall not", text)
        # Generic contraction handling (e.g., "don't" -> "do not")
        text = re.sub(r"n't", " not", text)
        return text

    # Function to remove noise from text
    def remove_noise(text):
        # Remove punctuation marks, special characters, and digits in one pass
        cleaned_text = re.sub(r'[^\w\s]|[\d]', '', text)
        return cleaned_text

    # Function to tokenize the text using spaCy
    def tokenize_text(text):
        doc = nlp(text)
        tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_space and not token.is_digit]
        return tokens

    # Function to remove stopwords from tokens
    def remove_stopwords(tokens):
        # Define stop words
        stop_words = set(nlp.Defaults.stop_words)
        return [token for token in tokens if token not in stop_words]

    # Function to join tokens back into a single string
    def join_tokens(tokens):
        return ' '.join(tokens)

    # Missing values cannot be passed to the regex helpers; treat them as empty
    if not isinstance(text, str):
        return ''

    # Normalize first: contraction handling needs the apostrophes that the
    # noise-removal step deletes
    text = normalize_text(text)
    # Remove noise from the text
    text = remove_noise(text)
    # Tokenize the text using spaCy and remove punctuation marks, spaces, and digits
    tokens = tokenize_text(text)
    # Remove stopwords from tokens
    tokens = remove_stopwords(tokens)
    # Join tokens back into a single string
    processed_text = join_tokens(tokens)

    return processed_text

# Run every document in the 'text' column through preprocess_text
processed_column = df['text'].apply(preprocess_text)
df['processed_Text'] = processed_column

# Show the first rows, now including the processed text
df.head()

**Batch Processing**

In [18]:
import spacy
import re
import pandas as pd

# Load spaCy model.
# Only tokenization, lemmatization and lexical flags are used in this notebook,
# so the dependency parser and the named-entity recognizer are disabled to
# avoid running them on every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Strip punctuation, special characters and digits from a string
def remove_noise(text):
    """Return ``text`` with every non-word, non-whitespace character and every digit removed.

    Letters, underscores and whitespace are kept; removed characters are
    deleted outright (not replaced by a space).
    """
    noise_pattern = r'[^\w\s]|[\d]'
    return re.sub(noise_pattern, '', text)

# Function to normalize text (handle contractions, convert to lowercase)
def normalize_text(text):
    """Lowercase ``text`` and expand ``n't`` contractions.

    Irregular forms are handled explicitly ("can't" -> "can not",
    "won't" -> "will not", "shan't" -> "shall not"); every other ``n't``
    becomes " not" (e.g. "don't" -> "do not"). The typographic apostrophe
    (U+2019) is treated like a plain one.

    Args:
        text: The string to normalize.

    Returns:
        str: The lowercased text with contractions expanded.
    """
    # Convert text to lowercase
    text = text.lower()
    # Map the typographic apostrophe to a plain one so the patterns match
    text = text.replace("\u2019", "'")
    # Irregular contractions first: the generic rule below would turn
    # "can't" into "ca not" and "won't" into "wo not"
    text = re.sub(r"\bcan't", "can not", text)
    text = re.sub(r"\bwon't", "will not", text)
    text = re.sub(r"\bshan't", "shall not", text)
    # Generic contraction handling (e.g., "don't" -> "do not")
    text = re.sub(r"n't", " not", text)
    return text

# Tokenize and lemmatize a string with the module-level spaCy pipeline
def tokenize_text(text):
    """Return the lemmas of ``text``, skipping punctuation, whitespace and digit tokens."""
    lemmas = []
    for token in nlp(text):
        # Skip tokens that carry no lexical content
        if token.is_punct or token.is_space or token.is_digit:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Filter spaCy's default stop words out of a token list
def remove_stopwords(tokens):
    """Return the tokens that are not in ``nlp.Defaults.stop_words``, preserving order."""
    stop_words = set(nlp.Defaults.stop_words)
    kept_tokens = []
    for token in tokens:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Rebuild a single string from a sequence of tokens
def join_tokens(tokens):
    """Concatenate ``tokens`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

# Function for batch processing
def preprocess_batch(texts):
    """Preprocess a list of documents and return one processed string per input.

    Each document is lowercased and has its ``n't`` contractions expanded,
    then punctuation, special characters and digits are stripped. The cleaned
    documents are streamed through the spaCy pipeline with ``nlp.pipe``,
    lemmatized, filtered for stop words and re-joined with single spaces.

    Args:
        texts: Iterable of raw documents. Non-string items (for example the
            NaN that pandas produces for an empty CSV field) are treated as
            empty documents.

    Returns:
        list[str]: Processed texts, in the same order as ``texts``.
    """
    # Normalize BEFORE removing noise: contraction handling needs the
    # apostrophes that remove_noise deletes.
    cleaned_texts = [
        remove_noise(normalize_text(text)) if isinstance(text, str) else ''
        for text in texts
    ]

    processed_texts = []
    # nlp.pipe processes the documents as a stream instead of one nlp() call
    # per text, which is what makes this an actual batch operation.
    for doc in nlp.pipe(cleaned_texts):
        # Same token filter as tokenize_text: keep lemmas of real word tokens
        tokens = [
            token.lemma_
            for token in doc
            if not (token.is_punct or token.is_space or token.is_digit)
        ]
        # Remove stopwords from tokens
        tokens = remove_stopwords(tokens)
        # Join tokens back into a single string
        processed_texts.append(join_tokens(tokens))
    return processed_texts

# Read the dataset from the specified path.
# NOTE: this reloads the full CSV, so `df` contains every original column
# again (any earlier column selection on `df` is discarded).
df = pd.read_csv('/content/fake_real.csv', sep=',', encoding='utf-8', quotechar='"')

# Split the texts into batches
batch_size = 100  # Adjust as needed
# Ceiling division: exactly enough batches to cover every row, with no
# trailing empty batch when len(df) is a multiple of batch_size
num_batches = (len(df) + batch_size - 1) // batch_size
processed_texts = []
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(df))
    # .iloc slices by position regardless of the index labels
    batch_texts = df['text'].iloc[start_idx:end_idx].tolist()
    batch_processed_texts = preprocess_batch(batch_texts)
    processed_texts.extend(batch_processed_texts)

# Add the processed texts to the DataFrame
df['processed_Text'] = processed_texts

# Display the DataFrame after text processing
df.head()

Unnamed: 0,title,text,subject,date,type,processed_Text
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True,washington reuter head conservative republican...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True,washington reuters transgender people allow ti...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True,washington reuter special counsel investigatio...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True,washington reuters trump campaign adviser geor...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True,seattlewashington reuters president donald tru...


In this pipeline:

1. We lowercase the text, expand `n't` contractions, and strip punctuation, special characters, and digits.
2. We use spaCy for tokenization and lemmatization to take advantage of its efficiency and accuracy.
3. We remove stop words using spaCy's built-in English stop-word list (NLTK is not used in this notebook).
4. Finally, we join the processed tokens back into a single string.

In [19]:
# Write the processed DataFrame to disk as CSV, without the index column
df.to_csv(
    'preprocess_text_Spacy.csv',
    index=False,
)

In [20]:
# Render a clickable link to the saved CSV in the notebook output
from IPython.display import FileLink
FileLink(path='preprocess_text_Spacy.csv')