**Perform text cleaning, lemmatization (any method), stop-word removal (any method), and label encoding. Create representations using TF-IDF. Save the outputs.**

In [None]:
pip install nltk pandas scikit-learn




In [None]:
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Download required NLTK data.
# 'punkt_tab' is included here so that every resource the notebook needs is
# fetched in this single setup cell, before any tokenization happens.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Toy corpus: each record pairs one raw sentence with its sentiment label.
labelled_sentences = [
    ("Natural Language Processing is 12 amazing!", "Positive"),
    ("Machine learning and deep 43 learning are subfields of AI.", "Neutral"),
    ("Natural Language Processing (NLP) is 98 part of AI and Machine Learning.", "Neutral"),
    ("Deep @ learning improves NLP tasks.", "Positive"),
    ("AI is revolutionizing the world!", "Positive"),
]

# Column-oriented view of the records (same key order: "Text", then "Label").
data = {
    "Text": [sentence for sentence, _ in labelled_sentences],
    "Label": [label for _, label in labelled_sentences],
}

In [None]:
# Build the working DataFrame from the column dict, naming the two columns
# explicitly in the same order as the dict's keys.
df = pd.DataFrame(data, columns=["Text", "Label"])


In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Text,Label
0,Natural Language Processing is 12 amazing!,Positive
1,Machine learning and deep 43 learning are subf...,Neutral
2,Natural Language Processing (NLP) is 98 part o...,Neutral
3,Deep @ learning improves NLP tasks.,Positive
4,AI is revolutionizing the world!,Positive


**Text Cleaning**

Converts text to lowercase.
Removes numbers, punctuation, and extra spaces.

In [None]:
# 1. Text Cleaning Function
def clean_text(text):
    """Normalise a raw sentence into lowercase words separated by single spaces.

    Steps, in order: lowercase, remove digit runs, remove punctuation
    (including underscores), collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : str
        Raw input text. Any non-string value (for example ``None`` or the NaN
        of a missing DataFrame cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` if nothing is left or the input was not a
        string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); map them to empty text instead of crashing.
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # `\w` matches "_", so `[^\w\s]` alone would leave underscores in the text.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [None]:
# Run every raw sentence through clean_text and store the results as a new column.
df["Cleaned_Text"] = [clean_text(raw_sentence) for raw_sentence in df["Text"]]

In [None]:
# Preview the first five rows, now including the Cleaned_Text column.
df.head(n=5)

Unnamed: 0,Text,Label,Cleaned_Text
0,Natural Language Processing is 12 amazing!,Positive,natural language processing is amazing
1,Machine learning and deep 43 learning are subf...,Neutral,machine learning and deep learning are subfiel...
2,Natural Language Processing (NLP) is 98 part o...,Neutral,natural language processing nlp is part of ai ...
3,Deep @ learning improves NLP tasks.,Positive,deep learning improves nlp tasks
4,AI is revolutionizing the world!,Positive,ai is revolutionizing the world


**Stopword Removal**

Removes common words like "is", "and", "the", etc.


In [None]:
nltk.download('punkt_tab')
# 2. Remove Stopwords
stop_words = set(stopwords.words("english"))


def _drop_stopwords(text):
    """Tokenize `text` and re-join, with single spaces, the tokens not in `stop_words`."""
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


df["No_Stopwords"] = df["Cleaned_Text"].apply(_drop_stopwords)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**Lemmatization**

Reduces each word to its lemma (dictionary base form), e.g. "tasks" → "task".

In [None]:
# 3. Lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_sentence(text):
    """Tokenize `text`, lemmatize each token, and re-join them with single spaces."""
    # NOTE(review): lemmatize() is called without a POS argument. In the
    # recorded TF-IDF output "tasks" became "task" while "improves" is
    # unchanged; consider POS-aware lemmatization if verbs should be reduced.
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)


df["Lemmatized_Text"] = df["No_Stopwords"].apply(_lemmatize_sentence)


**Label Encoding**

Converts categorical labels (Positive, Neutral) into numerical values.

In [None]:
# 4. Label Encoding
# Learn the class -> integer mapping from the Label column, then apply that
# mapping to the same column to produce the encoded values.
label_encoder = LabelEncoder()
label_encoder.fit(df["Label"])
df["Encoded_Label"] = label_encoder.transform(df["Label"])

**TF-IDF Representation**

Converts text into a vectorized format for NLP tasks.

In [None]:
# 5. TF-IDF Representation
# Fit the vocabulary and IDF weights on the lemmatized sentences, then encode
# those same sentences as a document-term matrix.
lemmatized_corpus = df["Lemmatized_Text"]
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(lemmatized_corpus)
X_tfidf = tfidf_vectorizer.transform(lemmatized_corpus)

In [None]:
# Convert TF-IDF matrix to DataFrame: one row per document, one column per
# vocabulary term reported by the fitted vectorizer.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = X_tfidf.toarray()
tfidf_df = pd.DataFrame(tfidf_dense, columns=tfidf_terms)


**Saving Outputs**

In [None]:
# 6. Save Outputs
# Write each result frame to its own CSV file, without the index column.
for frame, csv_name in (
    (df, "text_preprocessing_results.csv"),
    (tfidf_df, "tfidf_representation.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Print final outputs
print("\n--- Cleaned & Processed Data ---")
# End the cell on the bare frame so Jupyter renders it as a table; print()
# on a DataFrame emits a plain-text dump whose wide columns wrap.
df



--- Cleaned & Processed Data ---
                                                Text     Label  \
0         Natural Language Processing is 12 amazing!  Positive   
1  Machine learning and deep 43 learning are subf...   Neutral   
2  Natural Language Processing (NLP) is 98 part o...   Neutral   
3                Deep @ learning improves NLP tasks.  Positive   
4                   AI is revolutionizing the world!  Positive   

                                        Cleaned_Text  \
0             natural language processing is amazing   
1  machine learning and deep learning are subfiel...   
2  natural language processing nlp is part of ai ...   
3                   deep learning improves nlp tasks   
4                    ai is revolutionizing the world   

                                        No_Stopwords  \
0                natural language processing amazing   
1        machine learning deep learning subfields ai   
2  natural language processing nlp part ai machin...   
3       

In [None]:

print("\n--- TF-IDF Representation ---")
# End the cell on the bare frame so Jupyter renders it as a table; print()
# on a DataFrame emits a plain-text dump whose wide columns wrap.
tfidf_df.head()



--- TF-IDF Representation ---
         ai   amazing      deep  improves  language  learning   machine  \
0  0.000000  0.581951  0.000000  0.000000  0.469515  0.000000  0.000000   
1  0.314159  0.000000  0.378464  0.000000  0.000000  0.628318  0.378464   
2  0.295064  0.000000  0.000000  0.000000  0.355460  0.295064  0.355460   
3  0.000000  0.000000  0.416607  0.516374  0.000000  0.345822  0.000000   
4  0.427993  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

    natural       nlp      part  processing  revolutionizing  subfields  \
0  0.469515  0.000000  0.000000    0.469515          0.00000   0.000000   
1  0.000000  0.000000  0.000000    0.000000          0.00000   0.469096   
2  0.355460  0.355460  0.440584    0.355460          0.00000   0.000000   
3  0.000000  0.416607  0.000000    0.000000          0.00000   0.000000   
4  0.000000  0.000000  0.000000    0.000000          0.63907   0.000000   

       task    world  
0  0.000000  0.00000  
1  0.000000  0.00000 