In [10]:
!pip install nltk scikit-learn pandas




In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [12]:
import re
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [13]:
data = {
    "text": [
        "Natural Language Processing is very powerful!",
        "Machine learning is used in NLP.",
        "Text data requires proper cleaning and preprocessing."
    ],
    "label": ["Tech", "Tech", "Data"]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,Natural Language Processing is very powerful!,Tech
1,Machine learning is used in NLP.,Tech
2,Text data requires proper cleaning and preproc...,Data


In [14]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df["clean_text"] = df["text"].apply(clean_text)
df


Unnamed: 0,text,label,clean_text
0,Natural Language Processing is very powerful!,Tech,natural language processing is very powerful
1,Machine learning is used in NLP.,Tech,machine learning is used in nlp
2,Text data requires proper cleaning and preproc...,Data,text data requires proper cleaning and preproc...


In [15]:
import nltk
nltk.download('punkt_tab')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

df["lemmatized_text"] = df["clean_text"].apply(lemmatize_text)
df

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,text,label,clean_text,lemmatized_text
0,Natural Language Processing is very powerful!,Tech,natural language processing is very powerful,natural language processing is very powerful
1,Machine learning is used in NLP.,Tech,machine learning is used in nlp,machine learning is used in nlp
2,Text data requires proper cleaning and preproc...,Data,text data requires proper cleaning and preproc...,text data requires proper cleaning and preproc...


In [16]:
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_words)

df["final_text"] = df["lemmatized_text"].apply(remove_stopwords)
df


Unnamed: 0,text,label,clean_text,lemmatized_text,final_text
0,Natural Language Processing is very powerful!,Tech,natural language processing is very powerful,natural language processing is very powerful,natural language processing powerful
1,Machine learning is used in NLP.,Tech,machine learning is used in nlp,machine learning is used in nlp,machine learning used nlp
2,Text data requires proper cleaning and preproc...,Data,text data requires proper cleaning and preproc...,text data requires proper cleaning and preproc...,text data requires proper cleaning preprocessing


In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Re-defining df to ensure it's available in this execution context
data = {
    "text": [
        "Natural Language Processing is very powerful!",
        "Machine learning is used in NLP.",
        "Text data requires proper cleaning and preprocessing."
    ],
    "label": ["Tech", "Tech", "Data"]
}
df = pd.DataFrame(data)

label_encoder = LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["label"])
df

Unnamed: 0,text,label,encoded_label
0,Natural Language Processing is very powerful!,Tech,1
1,Machine learning is used in NLP.,Tech,1
2,Text data requires proper cleaning and preproc...,Data,0


In [18]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Ensure NLTK resources are downloaded if they were not in the current session
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Re-apply text cleaning functions to ensure 'final_text' exists on the current df
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = word_tokenize(text)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

stop_words = set(stopwords.words("english"))
def remove_stopwords(text):
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_words)

df["clean_text"] = df["text"].apply(clean_text)
df["lemmatized_text"] = df["clean_text"].apply(lemmatize_text)
df["final_text"] = df["lemmatized_text"].apply(remove_stopwords)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["final_text"])

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

tfidf_df

Unnamed: 0,cleaning,data,language,learning,machine,natural,nlp,powerful,preprocessing,processing,proper,requires,text,used
0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5
2,0.408248,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.408248,0.408248,0.408248,0.0


In [19]:
df.to_csv("processed_text_data.csv", index=False)
tfidf_df.to_csv("tfidf_features.csv", index=False)
