In [None]:
!pip install nltk scikit-learn pandas



In [None]:
import re
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK data packages used later in the notebook: the stop-word
# lists and the WordNet data behind the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Tiny hand-written sentiment corpus: three sentences, each paired
# positionally with its label.
data = dict(
    text=[
        "I love Natural Language Processing!",
        "This course is very useful and interesting.",
        "I dislike boring lectures.",
    ],
    label=["positive", "positive", "negative"],
)

# One row per sentence; columns follow the dict's key order.
df = pd.DataFrame(data=data)
df

Unnamed: 0,text,label
0,I love Natural Language Processing!,positive
1,This course is very useful and interesting.,positive
2,I dislike boring lectures.,negative


In [None]:
def clean_text(text):
    """Normalize raw text to lowercase letters separated by single spaces.

    After lowercasing, every character that is not ``a``-``z`` or whitespace
    is replaced by a space (not deleted), so words joined only by punctuation
    or digits stay separate tokens: "well-known" becomes "well known" rather
    than "wellknown".

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with runs of whitespace collapsed to one space
        and no leading or trailing whitespace.
    """
    text = text.lower()
    # Substitute a space instead of '' so punctuation/digits between two
    # words still act as a token boundary.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse the whitespace runs, including those created just above.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Run the normalizer over every raw sentence and keep the result next to the
# original text for comparison.
df["clean_text"] = df["text"].map(clean_text)
df

Unnamed: 0,text,label,clean_text
0,I love Natural Language Processing!,positive,i love natural language processing
1,This course is very useful and interesting.,positive,this course is very useful and interesting
2,I dislike boring lectures.,negative,i dislike boring lectures


In [None]:
# Single shared lemmatizer instance, reused for every call below.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are passed to the lemmatizer one at a time with no part-of-speech
    argument, so its default is used. In the sample output this turns
    "lectures" into "lecture" while leaving "processing" unchanged.

    Args:
        text: A string of space-separated tokens.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

In [None]:
# Lemmatize the cleaned sentences into their own column so each preprocessing
# stage stays visible side by side.
df["lemmatized_text"] = df["clean_text"].map(lemmatize_text)
df

Unnamed: 0,text,label,clean_text,lemmatized_text
0,I love Natural Language Processing!,positive,i love natural language processing,i love natural language processing
1,This course is very useful and interesting.,positive,this course is very useful and interesting,this course is very useful and interesting
2,I dislike boring lectures.,negative,i dislike boring lectures,i dislike boring lecture


In [None]:
# English stop words held in a set for constant-time membership checks.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stop_words``.

    The comparison is an exact, case-sensitive match on whitespace-separated
    tokens; this presumably expects already-lowercased input, as produced by
    the earlier cleaning step.

    Args:
        text: A string of space-separated tokens.

    Returns:
        The remaining tokens, in their original order, joined with single
        spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Drop stop words from the cleaned text *before* lemmatizing. Filtering the
# already-lemmatized text lets stop words slip through: the lemmatizer strips
# the trailing "s" from words such as "was" and "has" ("wa", "ha"), and those
# mangled forms no longer match the stop-word list.
df["final_text"] = (
    df["clean_text"]
    .apply(remove_stopwords)
    .apply(lemmatize_text)
)
df

Unnamed: 0,text,label,clean_text,lemmatized_text,final_text
0,I love Natural Language Processing!,positive,i love natural language processing,i love natural language processing,love natural language processing
1,This course is very useful and interesting.,positive,this course is very useful and interesting,this course is very useful and interesting,course useful interesting
2,I dislike boring lectures.,negative,i dislike boring lectures,i dislike boring lecture,dislike boring lecture


In [None]:
# Map the string labels to integer codes. The fitted encoder is kept so the
# codes can be translated back to label names later.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df["label"])
df["label_encoded"] = encoded_labels
df

Unnamed: 0,text,label,clean_text,lemmatized_text,final_text,label_encoded
0,I love Natural Language Processing!,positive,i love natural language processing,i love natural language processing,love natural language processing,1
1,This course is very useful and interesting.,positive,this course is very useful and interesting,this course is very useful and interesting,course useful interesting,1
2,I dislike boring lectures.,negative,i dislike boring lectures,i dislike boring lecture,dislike boring lecture,0


In [None]:
# Learn the vocabulary and IDF weights from the fully preprocessed sentences
# and vectorize them in the same pass (default vectorizer settings).
tfidf = TfidfVectorizer()
corpus = df["final_text"]
tfidf_matrix = tfidf.fit_transform(corpus)

In [None]:
# Convert the TF-IDF matrix to a dense array and label each column with the
# vocabulary term it scores, giving one row per document.
dense_scores = tfidf_matrix.toarray()
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

tfidf_df

Unnamed: 0,boring,course,dislike,interesting,language,lecture,love,natural,processing,useful
0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.5,0.0
1,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.57735
2,0.57735,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0


In [None]:
# Persist both results to the working directory; index=False keeps the
# positional row index out of the files.
for frame, csv_name in (
    (df, "processed_text_data.csv"),
    (tfidf_df, "tfidf_vectors.csv"),
):
    frame.to_csv(csv_name, index=False)