In [10]:
#==========================
 #import libraries
#==========================
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [11]:
# Fetch the NLTK resources used later in this notebook: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'wordnet' corpus. Packages that are
# already installed are reported as up-to-date instead of being fetched again.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Maheen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maheen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Maheen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Stop-word set and lemmatizer shared by every preprocess_text call. The dict
# is filled lazily on first use so the NLTK corpora are loaded only once
# instead of once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Function to preprocess text
def preprocess_text(text):
    """Normalise one piece of raw text into a space-separated string of lemmas.

    Steps, in order: strip every character that is not an ASCII letter or
    whitespace, lowercase, tokenize with ``word_tokenize``, drop English stop
    words, lemmatize each remaining token, and join the tokens with single
    spaces.

    Parameters
    ----------
    text : str or scalar
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) yield an
        empty string. Other non-string scalars (for example a numeric cell in
        a mixed-dtype column) are converted with ``str()`` first rather than
        raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The preprocessed text; ``''`` when the input is missing or nothing
        survives the filtering.
    """
    #===================================
    # Check if text is not null or NaN
    #===================================
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        # Non-string scalars would make re.sub raise; treat them as text.
        text = str(text)

    #======================================================
    # Remove special characters, punctuation, and numbers
    #======================================================
    # Characters are deleted (not replaced by a space), so "well-known"
    # becomes "wellknown".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    #===============================
    # Convert text to lowercase
    #===============================
    text = text.lower()

    #===============================
    # Tokenize the text into words
    #===============================
    words = word_tokenize(text)

    #=========================================
    # Remove stop words, then lemmatize
    #=========================================
    stop_words, lemmatizer = _get_preprocess_resources()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    #==================================
    # Join the words back into a string
    #===================================
    return ' '.join(words)

In [13]:

# Rows handed to preprocessing per read_csv chunk.
chunksize = 100_000
# Input dump to read and output file for the cleaned text.
dataset_path = 'C:\\Users\\Maheen\\Downloads\\enwiki-20170820.csv'
output_csv_path = 'data.csv'

In [14]:
# Accumulator for the preprocessed chunks; starts out empty.
processed_chunks = list()

In [15]:
#==========================
# Preprocess each chunk
#==========================
# Start from an empty list so that re-running this cell does not append the
# same chunks a second time.
processed_chunks = []
for chunk in pd.read_csv(dataset_path, chunksize=chunksize, usecols=['ARTICLE_ID', 'SECTION_TEXT']):
    # Preprocess SECTION_TEXT column
    chunk['SECTION_TEXT'] = chunk['SECTION_TEXT'].apply(preprocess_text)
    # Collect the chunk inside the loop: appending after the loop has finished
    # would keep only the final chunk and silently drop all earlier rows.
    processed_chunks.append(chunk)

In [17]:
#======================================================
# Stack all preprocessed chunks row-wise into one DataFrame
#======================================================
df_processed = pd.concat(objs=processed_chunks, axis=0)


In [18]:
#=============================================
# Write the preprocessed frame to CSV (no index column)
#=============================================
df_processed.to_csv(path_or_buf=output_csv_path, index=False)
print("Preprocessing complete. Preprocessed data saved to:", output_csv_path)

Preprocessing complete. Preprocessed data saved to: data.csv
