---
<h1>Data Pre-Processing</h1>

---

Importing necessary libraries

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

Loading Dataset

In [None]:
# Fetch the NLTK resources this notebook relies on
# (stop-word list, WordNet, and the Punkt tokenizer models).
for nltk_package in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_package)

# Read the dataset from the CSV file in the working directory
data = pd.read_csv(r'data.csv')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Preview the first five rows of the raw dataset (head() defaults to n=5)
data.head()

Unnamed: 0,links,content
0,https://doj.gov.in/,\n\n\n\n\n\n\n\n\n\nDEPARTMENT OF JUSTICE\nLat...
1,https://doj.gov.in/#,\n\n\n\n\n\n\n\n\n\nDEPARTMENT OF JUSTICE\nLat...
2,https://doj.gov.in/history/,Last updated: 13-01-2023\nAs per the Allocatio...
3,https://doj.gov.in/about-department/,Last updated: 19-04-2024\nAs per the Allocatio...
4,https://doj.gov.in/about-department/vision-and...,Facilitating administration of Justice that en...


Data Cleaning

In [None]:
# Build 'Extracted Text' from the raw 'content' column:
#   1. replace Windows line breaks with a space,
#   2. collapse every run of whitespace into a single space,
#   3. trim leading/trailing whitespace.
data['Extracted Text'] = (
    data['content']
    .str.replace(r'\r\n', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Preview the first five rows, now including the 'Extracted Text' column
data.head(5)

Unnamed: 0,links,content,Extracted Text
0,https://doj.gov.in/,\n\n\n\n\n\n\n\n\n\nDEPARTMENT OF JUSTICE\nLat...,DEPARTMENT OF JUSTICE Latest News
1,https://doj.gov.in/#,\n\n\n\n\n\n\n\n\n\nDEPARTMENT OF JUSTICE\nLat...,DEPARTMENT OF JUSTICE Latest News
2,https://doj.gov.in/history/,Last updated: 13-01-2023\nAs per the Allocatio...,Last updated: 13-01-2023 As per the Allocation...
3,https://doj.gov.in/about-department/,Last updated: 19-04-2024\nAs per the Allocatio...,Last updated: 19-04-2024 As per the Allocation...
4,https://doj.gov.in/about-department/vision-and...,Facilitating administration of Justice that en...,Facilitating administration of Justice that en...


In [None]:
# Number of rows, for comparison with the unique-link count in the next cell.
# (DataFrame.size would return rows x columns, i.e. the total number of cells,
# which is not comparable to a count of links.)
len(data)

459

In [None]:
# Count distinct values in 'links'; dropna=False keeps a missing link counted
# as its own value, exactly as unique() does.
data['links'].nunique(dropna=False)

118

In [None]:
# Keep the first row for each link and only the two columns used from here on.
# The explicit .copy() makes `data` an independent frame rather than a slice of
# the original, so the column assignments in the following cells do not raise
# SettingWithCopyWarning.
data = (
    data
    .drop_duplicates(subset='links')
    .loc[:, ['links', 'Extracted Text']]
    .copy()
)
data.head(5)

Unnamed: 0,links,Extracted Text
0,https://doj.gov.in/,DEPARTMENT OF JUSTICE Latest News
1,https://doj.gov.in/#,DEPARTMENT OF JUSTICE Latest News
2,https://doj.gov.in/history/,Last updated: 13-01-2023 As per the Allocation...
3,https://doj.gov.in/about-department/,Last updated: 19-04-2024 As per the Allocation...
4,https://doj.gov.in/about-department/vision-and...,Facilitating administration of Justice that en...


Tokenization

In [None]:
# Make sure every entry is a string before tokenizing. If a page has missing
# content (NaN), astype(str) alone would turn it into the literal text "nan",
# which would then be tokenized as a real word; fill missing values with an
# empty string first so such rows tokenize to an empty list instead.
data['Extracted Text'] = data['Extracted Text'].fillna('').astype(str)

In [None]:
# Split each cleaned text into word-level tokens with NLTK's word_tokenize
data['Tokens'] = data['Extracted Text'].map(word_tokenize)
data.head()

Unnamed: 0,links,Extracted Text,Tokens
0,https://doj.gov.in/,DEPARTMENT OF JUSTICE Latest News,"[DEPARTMENT, OF, JUSTICE, Latest, News]"
1,https://doj.gov.in/#,DEPARTMENT OF JUSTICE Latest News,"[DEPARTMENT, OF, JUSTICE, Latest, News]"
2,https://doj.gov.in/history/,Last updated: 13-01-2023 As per the Allocation...,"[Last, updated, :, 13-01-2023, As, per, the, A..."
3,https://doj.gov.in/about-department/,Last updated: 19-04-2024 As per the Allocation...,"[Last, updated, :, 19-04-2024, As, per, the, A..."
4,https://doj.gov.in/about-department/vision-and...,Facilitating administration of Justice that en...,"[Facilitating, administration, of, Justice, th..."


Stopword Removal and Lemmatization

In [None]:
# English stop words from NLTK, held in a set for fast membership checks
stop_words = {word for word in stopwords.words('english')}

def clean_tokens(tokens, stopword_set=None):
    """Strip non-alphanumeric characters from tokens and drop stop words.

    Each token has every character outside ``a-z``, ``A-Z`` and ``0-9``
    removed (so ``"13-01-2023"`` becomes ``"13012023"``). Tokens that end up
    empty are discarded, as are tokens whose lower-cased form is a stop word.
    The original casing of the surviving tokens is preserved.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to clean.
    stopword_set : collection of str, optional
        Lower-case stop words to filter out. When omitted, the notebook-level
        ``stop_words`` set is used, which keeps existing one-argument calls
        (e.g. ``Series.apply(clean_tokens)``) working unchanged.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    if stopword_set is None:
        # Resolved at call time so the function follows the current global set.
        stopword_set = stop_words

    cleaned = []
    for token in tokens:
        # Remove punctuation and special characters
        word = re.sub(r'[^a-zA-Z0-9]', '', token)
        # Skip tokens that became empty, and stop words (case-insensitive)
        if word and word.lower() not in stopword_set:
            cleaned.append(word)
    return cleaned

# Run clean_tokens over every row's token list, replacing the 'Tokens' column
data['Tokens'] = data['Tokens'].map(clean_tokens)
data.head()

Unnamed: 0,links,Extracted Text,Tokens
0,https://doj.gov.in/,DEPARTMENT OF JUSTICE Latest News,"[DEPARTMENT, JUSTICE, Latest, News]"
1,https://doj.gov.in/#,DEPARTMENT OF JUSTICE Latest News,"[DEPARTMENT, JUSTICE, Latest, News]"
2,https://doj.gov.in/history/,Last updated: 13-01-2023 As per the Allocation...,"[Last, updated, 13012023, per, Allocation, Bus..."
3,https://doj.gov.in/about-department/,Last updated: 19-04-2024 As per the Allocation...,"[Last, updated, 19042024, per, Allocation, Bus..."
4,https://doj.gov.in/about-department/vision-and...,Facilitating administration of Justice that en...,"[Facilitating, administration, Justice, ensure..."


In [None]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize every token and store the result in the 'content' column.
# WordNet lemma lookups are case-sensitive (its entries are lower-case), so a
# capitalised token such as "Departments" would be returned unchanged; tokens
# are therefore lower-cased before being lemmatized.
data['content'] = data['Tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(word.lower()) for word in tokens]
)

# Show the first rows after lemmatization
print("\nData After Lemmatization:")
data.head(5)



Data After Lemmatization:


Unnamed: 0,links,Extracted Text,Tokens,Lemmatized Tokens,content
0,https://doj.gov.in/,DEPARTMENT OF JUSTICE Latest News,"[DEPARTMENT, JUSTICE, Latest, News]","[DEPARTMENT, JUSTICE, Latest, News]","[DEPARTMENT, JUSTICE, Latest, News]"
1,https://doj.gov.in/#,DEPARTMENT OF JUSTICE Latest News,"[DEPARTMENT, JUSTICE, Latest, News]","[DEPARTMENT, JUSTICE, Latest, News]","[DEPARTMENT, JUSTICE, Latest, News]"
2,https://doj.gov.in/history/,Last updated: 13-01-2023 As per the Allocation...,"[Last, updated, 13012023, per, Allocation, Bus...","[Last, updated, 13012023, per, Allocation, Bus...","[Last, updated, 13012023, per, Allocation, Bus..."
3,https://doj.gov.in/about-department/,Last updated: 19-04-2024 As per the Allocation...,"[Last, updated, 19042024, per, Allocation, Bus...","[Last, updated, 19042024, per, Allocation, Bus...","[Last, updated, 19042024, per, Allocation, Bus..."
4,https://doj.gov.in/about-department/vision-and...,Facilitating administration of Justice that en...,"[Facilitating, administration, Justice, ensure...","[Facilitating, administration, Justice, ensure...","[Facilitating, administration, Justice, ensure..."


In [None]:
# Final result: one row per link with its processed token list
data_cleaned = data[["links", "content"]]
data_cleaned

Unnamed: 0,links,content
0,https://doj.gov.in/,"[DEPARTMENT, JUSTICE, Latest, News]"
1,https://doj.gov.in/#,"[DEPARTMENT, JUSTICE, Latest, News]"
2,https://doj.gov.in/history/,"[Last, updated, 13012023, per, Allocation, Bus..."
3,https://doj.gov.in/about-department/,"[Last, updated, 19042024, per, Allocation, Bus..."
4,https://doj.gov.in/about-department/vision-and...,"[Facilitating, administration, Justice, ensure..."
...,...,...
146,https://doj.gov.in/videos-2/,"[Content, Owned, DEPARTMENT, JUSTICE, Ministry..."
148,https://doj.gov.in/category/press-release/,"[Download, PDF, 80KB, Download, PDF, 76KB, Con..."
149,https://doj.gov.in/photo-gallery/,"[Content, Owned, DEPARTMENT, JUSTICE, Ministry..."
150,https://doj.gov.in/video-gallery/,"[Content, Owned, DEPARTMENT, JUSTICE, Ministry..."
