In [1]:
import pandas as pd

In [2]:
# Check if there are available GPU devices and configure TensorFlow to use
# the first one.
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable on-demand memory allocation on every detected GPU before
        # restricting TensorFlow to the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.set_visible_devices(gpus[0], 'GPU')
        print(f"Using GPU: {gpus[0]}")
    except RuntimeError as e:
        # Per the TensorFlow GPU guide, device settings cannot be changed once
        # the devices have been initialized; report instead of aborting.
        print(e)
else:
    # Make the fallback explicit so a missing GPU does not go unnoticed.
    print("No GPU found; TensorFlow will run on the CPU.")

Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
# Location of the source CSV (machine-specific; adjust when running elsewhere).
DATA_PATH = r"D:\projects\Abstactive Text Summarization\archive\data.csv"

# The full file is massive, so only the first N_ROWS rows are read.
# Set N_ROWS = None to load the whole file.
N_ROWS = 10000

data = pd.read_csv(DATA_PATH, nrows=N_ROWS)

In [4]:
# Work on a copy so the DataFrame held in `data` is left unmodified
df = data.copy()

In [5]:
# Print the number of missing (null/NaN) values in each column
print(df.isnull().sum())

Unnamed: 0      0
ID            643
Content         0
Summary         0
Dataset         0
dtype: int64


In [6]:
# Drop rows whose Content is missing (NaN). Only Content is checked, so rows
# with a missing ID are kept.
df = df.dropna(subset=['Content'])

In [7]:
# Renumber the rows 0..n-1 after the deletion, discarding the old index
df = df.reset_index(drop=True)

In [8]:
# Re-count missing values per column after dropping rows with missing Content
print(df.isnull().sum())

Unnamed: 0      0
ID            643
Content         0
Summary         0
Dataset         0
dtype: int64


In [9]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10000 non-null  int64 
 1   ID          9357 non-null   object
 2   Content     10000 non-null  object
 3   Summary     10000 non-null  object
 4   Dataset     10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [10]:
# Remove duplicated articles to clean the data.
# Compare only the text columns: 'Unnamed: 0' appears to be a per-row counter
# (0, 1, 2, ... in the preview), so a whole-row comparison would treat every
# row as unique and never drop anything.
# The index is rebuilt afterwards so it stays contiguous.
df = df.drop_duplicates(subset=['Content', 'Summary']).reset_index(drop=True)

In [11]:
# List the column names currently present in the frame
df.columns

Index(['Unnamed: 0', 'ID', 'Content', 'Summary', 'Dataset'], dtype='object')

In [12]:
# Preview the first rows of the frame
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Content,Summary,Dataset
0,0,f49ee725a0360aa6881ed1f7999cc531885dd06a,New York police are concerned drones could bec...,Police have investigated criminals who have ri...,CNN/Daily Mail
1,1,808fe317a53fbd3130c9b7563341a7eea6d15e94,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...,CNN/Daily Mail
2,2,98fd67bd343e58bc4e275bbb5a4ea454ec827c0d,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...,CNN/Daily Mail
3,3,e12b5bd7056287049d9ec98e41dbb287bd19a981,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...,CNN/Daily Mail
4,4,b83e8bcfcd51419849160e789b6658b21a9aedcd,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...,CNN/Daily Mail


In [13]:
# Discard the columns that are not needed, leaving the remaining columns as they are
df = df.drop(columns=["Unnamed: 0", "ID", "Dataset"])

In [14]:
# Preview the first rows again to confirm which columns remain
df.head()

Unnamed: 0,Content,Summary
0,New York police are concerned drones could bec...,Police have investigated criminals who have ri...
1,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...
2,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...
3,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...
4,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...


In [15]:
import re
import unicodedata
import contractions
import logging
from nltk.corpus import stopwords

# Initialize logging to capture any warnings
logging.basicConfig(level=logging.WARNING)

# Load English stopwords from NLTK library.
# On a fresh environment the stopwords corpus is not installed and
# stopwords.words() raises LookupError; fetch it once and retry so the
# notebook survives "Restart & Run All" on a new machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def _non_ascii_replacement(match):
    """Return the ASCII stand-in for one non-ASCII character left after NFKD.

    Punctuation, symbols and separators become a space so the words on either
    side stay separate; everything else (combining accents, format characters,
    letters with no ASCII decomposition) is deleted.
    """
    return ' ' if unicodedata.category(match.group())[0] in 'PSZ' else ''


def text_cleaner(text, num):
    """Normalize raw text into a space-separated string of lowercase words.

    Steps, in order: lowercase; drop parenthesised spans and double quotes;
    expand contractions; strip possessive 's; fold to ASCII; remove URLs;
    replace every non-letter with a space; squeeze runs of three or more
    identical characters down to two; collapse whitespace; optionally remove
    stopwords; drop one-letter tokens other than "a" and "i".

    Args:
        text: The string to clean. ``None`` and float NaN are treated as
            empty text instead of raising on ``.lower()``.
        num: Mode flag. ``0`` removes the words found in the module-level
            ``stop_words`` set; any other value keeps them.

    Returns:
        The cleaned text as a single string ('' when nothing survives).
    """
    # Missing values (None / NaN) have no text to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    newString = text.lower()

    # Remove any content within parentheses
    newString = re.sub(r'\([^)]*\)', '', newString)

    # Remove double quotes from the text
    newString = newString.replace('"', '')

    # Expand contractions
    newString = contractions.fix(newString)

    # Remove possessive 's. The typographic apostrophe (U+2019) is matched as
    # well; otherwise only the apostrophe would be removed later and the "s"
    # would stay glued to the word ("police’s" -> "polices").
    newString = re.sub(r"['\u2019]s\b", "", newString)

    # Fold to ASCII: decompose accented letters, then map each remaining
    # non-ASCII character individually. Deleting them all (as a plain
    # encode('ascii', 'ignore') does) merges words joined by an em dash or
    # other typographic punctuation.
    decomposed = unicodedata.normalize('NFKD', newString)
    newString = re.sub(r'[^\x00-\x7f]', _non_ascii_replacement, decomposed)

    # Remove URLs
    newString = re.sub(r'https?://\S+|www\.\S+', '', newString)

    # Remove non-alphabetical characters
    newString = re.sub("[^a-zA-Z]", " ", newString)

    # Normalize repeated characters (3+ of the same character -> 2)
    newString = re.sub(r'(.)\1{2,}', r'\1\1', newString)

    # Remove newlines and excessive spaces
    newString = re.sub(r'\s+', ' ', newString).strip()

    # Split the text into tokens (words)
    tokens = newString.split()

    # Remove stopwords from the tokens list
    if num == 0:
        tokens = [w for w in tokens if w not in stop_words]

    # Keep words with more than one character, but keep "a" and "i"
    long_words = [w for w in tokens if len(w) > 1 or w in ('a', 'i')]

    # Return the cleaned text as a space-separated string
    return ' '.join(long_words)
