In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Fetch the NLTK resources the later cells rely on: the stop-word corpus,
# WordNet, and the omw-1.4 package.
nltk.download("stopwords")
nltk.download("wordnet")
# Kept as the last expression so its return value is shown as the cell output.
nltk.download("omw-1.4")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Small in-memory example corpus: four sentences, each paired with a label.
data = dict(
    text=[
        "I love Natural Language Processing!",
        "NLP is used in chatbots and search engines.",
        "Text cleaning is an important step in NLP.",
        "Machine learning models need clean data.",
    ],
    label=["positive", "positive", "neutral", "neutral"],
)

# One row per sentence; the columns follow the dict's key order.
df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,I love Natural Language Processing!,positive
1,NLP is used in chatbots and search engines.,positive
2,Text cleaning is an important step in NLP.,neutral
3,Machine learning models need clean data.,neutral


In [4]:
def clean_text(text):
    """Normalize raw text to lowercase ``a-z`` words separated by single spaces.

    The input is lowercased, every character that is not ``a-z`` or
    whitespace (digits, punctuation, accented letters, ...) is deleted,
    runs of whitespace are collapsed to one space, and both ends are trimmed.

    Parameters
    ----------
    text : str or missing value
        Raw text. Any value that is not a ``str`` (for example ``None`` or
        ``float('nan')`` coming from a missing DataFrame cell) is treated as
        containing no text.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    # Missing cells reach Series.apply as None/NaN, which have no .lower();
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Unwanted characters are deleted rather than replaced by a space, so
    # "don't" becomes "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse any whitespace run (spaces, tabs, newlines) to a single space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run clean_text over every raw sentence and keep the result in a new column.
df["clean_text"] = df["text"].map(clean_text)
df


Unnamed: 0,text,label,clean_text
0,I love Natural Language Processing!,positive,i love natural language processing
1,NLP is used in chatbots and search engines.,positive,nlp is used in chatbots and search engines
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp
3,Machine learning models need clean data.,neutral,machine learning models need clean data


In [5]:
# Module-level helpers read by preprocess_text: a lemmatizer instance and the
# English stop-word list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

def preprocess_text(text):
    """Remove stop words from ``text`` and lemmatize the remaining tokens.

    ``text`` is split on whitespace. Tokens found in the module-level
    ``stop_words`` collection are dropped; every other token is passed to
    the module-level ``lemmatizer`` (called with its default arguments).

    Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in text.split():
        if token in stop_words:
            # Stop words are discarded before lemmatization.
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

# Stop-word removal and lemmatization run on the already-cleaned text column.
df["processed_text"] = df["clean_text"].map(preprocess_text)
df


Unnamed: 0,text,label,clean_text,processed_text
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing
1,NLP is used in chatbots and search engines.,positive,nlp is used in chatbots and search engines,nlp used chatbots search engine
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp
3,Machine learning models need clean data.,neutral,machine learning models need clean data,machine learning model need clean data


In [6]:
# Fit the encoder on the label column, then store the resulting integer codes
# alongside the original string labels.
label_encoder = LabelEncoder()
label_encoder.fit(df["label"])
df["label_encoded"] = label_encoder.transform(df["label"])
df


Unnamed: 0,text,label,clean_text,processed_text,label_encoded
0,I love Natural Language Processing!,positive,i love natural language processing,love natural language processing,1
1,NLP is used in chatbots and search engines.,positive,nlp is used in chatbots and search engines,nlp used chatbots search engine,1
2,Text cleaning is an important step in NLP.,neutral,text cleaning is an important step in nlp,text cleaning important step nlp,0
3,Machine learning models need clean data.,neutral,machine learning models need clean data,machine learning model need clean data,0


In [7]:
# Fit the vectorizer on the processed text and build the document-term matrix
# of TF-IDF weights (one row per document).
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df["processed_text"])

# Convert the matrix to a dense array and label each column with the
# vocabulary term it represents.
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

tfidf_df


Unnamed: 0,chatbots,clean,cleaning,data,engine,important,language,learning,love,machine,model,natural,need,nlp,processing,search,step,text,used
0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0
1,0.465162,0.0,0.0,0.0,0.465162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366739,0.0,0.465162,0.0,0.0,0.465162
2,0.0,0.0,0.465162,0.0,0.0,0.465162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366739,0.0,0.0,0.465162,0.465162,0.0
3,0.0,0.408248,0.0,0.408248,0.0,0.0,0.0,0.408248,0.0,0.408248,0.408248,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Write both tables as CSV files, using paths relative to the current working
# directory; index=False leaves the row index out of the files.
df.to_csv(path_or_buf="cleaned_text_data.csv", index=False)
tfidf_df.to_csv(path_or_buf="tfidf_features.csv", index=False)

print("Files saved successfully!")

Files saved successfully!
