In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
# Ignore warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the previously split features/labels; every file stores its values in a
# single column named 'Data', which is selected as a Series.
# Forward slashes are used in the relative paths because they resolve on Windows
# as well as Linux/macOS, whereas a backslash is an ordinary file-name character
# on POSIX and the files would not be found there.
X_train = pd.read_csv('../Spliting Data/X_train.csv')['Data']
y_train = pd.read_csv('../Spliting Data/y_train.csv')['Data']

X_test = pd.read_csv('../Spliting Data/X_test.csv')['Data']
y_test = pd.read_csv('../Spliting Data/y_test.csv')['Data']

X_val = pd.read_csv('../Spliting Data/X_val.csv')['Data']
y_val = pd.read_csv('../Spliting Data/y_val.csv')['Data']

In [3]:
# Show the feature and label shapes of each split (train, test, val),
# with a dashed separator line between consecutive splits.
split_pairs = [(X_train, y_train), (X_test, y_test), (X_val, y_val)]
for position, (features, labels) in enumerate(split_pairs):
    if position > 0:
        print('-----------------')
    print(features.shape)
    print(labels.shape)

(269153,)
(269153,)
-----------------
(41502,)
(41502,)
-----------------
(41502,)
(41502,)


In [4]:
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise one headline into a space-separated string of lemmas.

    The text is lowercased and tokenized with NLTK; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) and English stopwords are
    discarded, and every remaining token is lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw headline. Any non-string value (e.g. the NaN that pandas uses for
        an empty CSV cell) is treated as a blank headline.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or '' when nothing
        survives or the input is not a string.
    """
    if not isinstance(text, str):
        # .lower() would raise AttributeError on NaN/None; report it as blank instead.
        return ''

    tokens = word_tokenize(text.lower())

    # Single pass: keep alphabetic, non-stopword tokens and lemmatize them.
    # The stopword test is applied before lemmatization.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return ' '.join(lemmas)

In [5]:
# Clean every training headline; the result is a plain Python list of strings.
X_train = list(map(preprocess, X_train))

In [6]:
# Clean every test headline; the result is a plain Python list of strings.
X_test = list(map(preprocess, X_test))

In [7]:
# Clean every validation headline; the result is a plain Python list of strings.
X_val = list(map(preprocess, X_val))

In [8]:
# Pair each training headline with its category in one two-column frame.
train_columns = dict(headline=X_train, category=y_train)
train_dataframe = pd.DataFrame(train_columns)

In [9]:
# Pair each test headline with its category in one two-column frame.
test_columns = dict(headline=X_test, category=y_test)
test_dataframe = pd.DataFrame(test_columns)

In [10]:
# Pair each validation headline with its category in one two-column frame.
val_columns = dict(headline=X_val, category=y_val)
val_dataframe = pd.DataFrame(val_columns)

In [11]:
# Count, per split, the rows whose headline occurs again later in the same frame
# (keep='last' leaves the final occurrence unmarked).
# The second line reports the *test* split; it was previously mislabelled "val".
print("Number of duplicate headlines in train = ", train_dataframe.duplicated(subset=['headline'], keep='last').sum())
print("Number of duplicate headlines in test  = ", test_dataframe.duplicated(subset=['headline'], keep='last').sum())
print("Number of duplicate headlines in val   = ", val_dataframe.duplicated(subset=['headline'], keep='last').sum())

Number of duplicate headlines in train =  5181
Number of duplicate headlines in val   =  183
Number of duplicate headlines in val   =  149


In [12]:
# Remove duplicate headlines within each split, keeping the last occurrence.
# The de-duplicated frame is reassigned to the same name instead of using
# inplace=True, so each statement reads as "new frame derived from old frame".
# Row labels are left untouched (the index is not reset).

train_dataframe = train_dataframe.drop_duplicates(subset=['headline'], keep='last')
test_dataframe = test_dataframe.drop_duplicates(subset=['headline'], keep='last')
val_dataframe = val_dataframe.drop_duplicates(subset=['headline'], keep='last')

In [13]:
# Print the current number of rows in each split next to its label.
labelled_frames = [
    ("Total number of rows after removing duplicates in train = ", train_dataframe),
    ("Total number of rows after removing duplicates in test  = ", test_dataframe),
    ("Total number of rows after removing duplicates in val   = ", val_dataframe),
]
for label, frame in labelled_frames:
    print(label, len(frame.index))

Total number of rows after removing duplicates in train =  263972
Total number of rows after removing duplicates in test  =  41319
Total number of rows after removing duplicates in val   =  41353


In [14]:
# Per-column count of missing values, printed for train, test and val in turn.
for frame in (train_dataframe, test_dataframe, val_dataframe):
    missing_per_column = frame.isna().sum()
    print(missing_per_column)

headline    0
category    0
dtype: int64
headline    0
category    0
dtype: int64
headline    0
category    0
dtype: int64


In [15]:
# Training rows whose headline is the empty string.
train_dataframe.loc[train_dataframe['headline'].eq('')]

Unnamed: 0,headline,category
267558,,OTHER6


In [16]:
# Test rows whose headline is the empty string.
test_dataframe.loc[test_dataframe['headline'].eq('')]

Unnamed: 0,headline,category
40259,,WELLNESS


In [17]:
# Validation rows whose headline is the empty string.
val_dataframe.loc[val_dataframe['headline'].eq('')]

Unnamed: 0,headline,category
37914,,WELLNESS


In [18]:
# Keep only the rows whose headline is not the empty string.
train_dataframe = train_dataframe.loc[train_dataframe['headline'].ne('')]
test_dataframe = test_dataframe.loc[test_dataframe['headline'].ne('')]
val_dataframe = val_dataframe.loc[val_dataframe['headline'].ne('')]

In [19]:
# Split each frame back into its feature column ('headline') and label column ('category').
X_train, y_train = train_dataframe['headline'], train_dataframe['category']

X_test, y_test = test_dataframe['headline'], test_dataframe['category']

X_val, y_val = val_dataframe['headline'], val_dataframe['category']

In [20]:
# Put the training headlines into a one-column ('Data') DataFrame.
# list() materialises the values first, so this works for a list or a Series.
df = pd.DataFrame(list(X_train), columns=['Data'])

# Output location, relative to the working directory. A forward slash is used
# because it is valid on Windows and POSIX alike; with a backslash, POSIX systems
# would treat the whole string as one file name instead of folder + file.
csv_file_path = "Tokens Spliting Data/X_train.csv"

# index=False writes only the 'Data' column, without the row labels.
df.to_csv(csv_file_path, index=False)

In [21]:
# Put the training categories into a one-column ('Data') DataFrame.
# list() materialises the values first, so this works for a list or a Series.
df = pd.DataFrame(list(y_train), columns=['Data'])

# Output location, relative to the working directory. A forward slash is used
# because it is valid on Windows and POSIX alike; with a backslash, POSIX systems
# would treat the whole string as one file name instead of folder + file.
csv_file_path = "Tokens Spliting Data/y_train.csv"

# index=False writes only the 'Data' column, without the row labels.
df.to_csv(csv_file_path, index=False)

In [22]:
# Put the test headlines into a one-column ('Data') DataFrame.
# list() materialises the values first, so this works for a list or a Series.
df = pd.DataFrame(list(X_test), columns=['Data'])

# Output location, relative to the working directory. A forward slash is used
# because it is valid on Windows and POSIX alike; with a backslash, POSIX systems
# would treat the whole string as one file name instead of folder + file.
csv_file_path = "Tokens Spliting Data/X_test.csv"

# index=False writes only the 'Data' column, without the row labels.
df.to_csv(csv_file_path, index=False)

In [23]:
# Put the test categories into a one-column ('Data') DataFrame.
# list() materialises the values first, so this works for a list or a Series.
df = pd.DataFrame(list(y_test), columns=['Data'])

# Output location, relative to the working directory. A forward slash is used
# because it is valid on Windows and POSIX alike; with a backslash, POSIX systems
# would treat the whole string as one file name instead of folder + file.
csv_file_path = "Tokens Spliting Data/y_test.csv"

# index=False writes only the 'Data' column, without the row labels.
df.to_csv(csv_file_path, index=False)

In [24]:
# Put the validation headlines into a one-column ('Data') DataFrame.
# list() materialises the values first, so this works for a list or a Series.
df = pd.DataFrame(list(X_val), columns=['Data'])

# Output location, relative to the working directory. A forward slash is used
# because it is valid on Windows and POSIX alike; with a backslash, POSIX systems
# would treat the whole string as one file name instead of folder + file.
csv_file_path = "Tokens Spliting Data/X_val.csv"

# index=False writes only the 'Data' column, without the row labels.
df.to_csv(csv_file_path, index=False)

In [25]:
# Put the validation categories into a one-column ('Data') DataFrame.
# list() materialises the values first, so this works for a list or a Series.
df = pd.DataFrame(list(y_val), columns=['Data'])

# Output location, relative to the working directory. A forward slash is used
# because it is valid on Windows and POSIX alike; with a backslash, POSIX systems
# would treat the whole string as one file name instead of folder + file.
csv_file_path = "Tokens Spliting Data/y_val.csv"

# index=False writes only the 'Data' column, without the row labels.
df.to_csv(csv_file_path, index=False)