# Part 1: Text Cleaning or Preprocessing Steps

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")

# ---------------------------------------------------
# Original Text
# ---------------------------------------------------
text = "Hey!!! I can't believe this... Visit https://abc.com NOW ðŸ˜²ðŸ˜²!!! It's sooo cooool!!! Loving"
print("1) Original:", text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


1) Original: Hey!!! I can't believe this... Visit https://abc.com NOW ðŸ˜²ðŸ˜²!!! It's sooo cooool!!!


In [2]:
# ---------------------------------------------------
# 1. Lowercasing
# ---------------------------------------------------
text = text.lower()
print("2) Lowercased:", text)

2) Lowercased: hey!!! i can't believe this... visit https://abc.com now ðŸ˜²ðŸ˜²!!! it's sooo cooool!!!


In [3]:
# ---------------------------------------------------
# 2. Remove Noise (punctuation, numbers, special chars)
# ---------------------------------------------------
text = re.sub(r"[^a-zA-Z\s]", " ", text)
text = re.sub(r"\s+", " ", text).strip()
print("3) Noise Removed:", text)

3) Noise Removed: hey i can t believe this visit https abc com now it s sooo cooool


In [4]:
# ---------------------------------------------------
# 3. Tokenization
# ---------------------------------------------------
tokens = text.split()
print("4) Tokenized:", tokens)

4) Tokenized: ['hey', 'i', 'can', 't', 'believe', 'this', 'visit', 'https', 'abc', 'com', 'now', 'it', 's', 'sooo', 'cooool']


In [5]:
# ---------------------------------------------------
# 4. Stopword Removal
# ---------------------------------------------------
stop = set(stopwords.words("english"))
tokens = [w for w in tokens if w not in stop]
print("5) Stopwords Removed:", tokens)


5) Stopwords Removed: ['hey', 'believe', 'visit', 'https', 'abc', 'com', 'sooo', 'cooool']


In [8]:
# ---------------------------------------------------
# 5. Stemming / Lemmatization
# ---------------------------------------------------
lemmatizer = WordNetLemmatizer()


lemmatized = [lemmatizer.lemmatize(w) for w in tokens]

print("6b) Lemmatized:", lemmatized)

6b) Lemmatized: ['hey', 'believe', 'visit', 'http', 'abc', 'com', 'sooo', 'cooool']


In [9]:
final_text = " ".join(lemmatized)
print("10) Final Cleaned Text:", final_text)

10) Final Cleaned Text: hey believe visit http abc com sooo cooool


# Part 2: Feature Extraction

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# --------------------------------------
# 3 Sample Sentences
# --------------------------------------
texts = [
    "Natural Language Processing is amazing!",
    "I love working with language models.",
    "Processing text data requires cleaning and vectorization."
]

# --------------------------------------
# Basic Cleaning (lowercase + remove special chars)
# --------------------------------------
cleaned_texts = []
for t in texts:
    t = t.lower()
    t = re.sub(r"[^a-zA-Z\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    cleaned_texts.append(t)

print("Cleaned Texts:", cleaned_texts)

# --------------------------------------
# TF-IDF Vectorization
# --------------------------------------
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(cleaned_texts)

# --------------------------------------
# Show Vocabulary + TF-IDF matrix
# --------------------------------------
print("\nVocabulary:", tfidf.get_feature_names_out())
print("\nTF-IDF Matrix:\n", tfidf_matrix.toarray())


Cleaned Texts: ['natural language processing is amazing', 'i love working with language models', 'processing text data requires cleaning and vectorization']

Vocabulary: ['amazing' 'and' 'cleaning' 'data' 'is' 'language' 'love' 'models'
 'natural' 'processing' 'requires' 'text' 'vectorization' 'with' 'working']

TF-IDF Matrix:
 [[0.49047908 0.         0.         0.         0.49047908 0.37302199
  0.         0.         0.49047908 0.37302199 0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.35543247
  0.46735098 0.46735098 0.         0.         0.         0.
  0.         0.46735098 0.46735098]
 [0.         0.38988801 0.38988801 0.38988801 0.         0.
  0.         0.         0.         0.29651988 0.38988801 0.38988801
  0.38988801 0.         0.        ]]


# Part 3: Sentiment Analysis Project

In [11]:
# =========================================================
# 1. Imports
# =========================================================
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# =========================================================
# 2. Create 20-Sentence Dataset (Sentiment Labeled)
# =========================================================
data = {
    "text": [
        "I really loved this product, it works perfectly!",
        "This is the best service I have used.",
        "Amazing experience, highly recommended.",
        "I am very happy with the results.",
        "The quality is outstanding and I feel great.",
        "This made my day, absolutely fantastic.",
        "Superb performance, I would buy it again.",
        "Everything was perfect and smooth.",
        "I enjoyed using this tool a lot.",
        "This app is wonderful and very helpful.",

        "I hate this product, complete waste of money.",
        "Terrible experience, I will never return.",
        "The quality is bad and I am disappointed.",
        "Worst service ever, not recommended!",
        "I am unhappy and frustrated with this.",
        "This app crashes a lot, very annoying.",
        "I regret buying this item.",
        "Extremely slow and unresponsive.",
        "Not worth it, total disappointment.",
        "It caused many issues and wasted my time."
    ],
    "label": [
        1,1,1,1,1,1,1,1,1,1,
        0,0,0,0,0,0,0,0,0,0
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,"I really loved this product, it works perfectly!",1
1,This is the best service I have used.,1
2,"Amazing experience, highly recommended.",1
3,I am very happy with the results.,1
4,The quality is outstanding and I feel great.,1
5,"This made my day, absolutely fantastic.",1
6,"Superb performance, I would buy it again.",1
7,Everything was perfect and smooth.,1
8,I enjoyed using this tool a lot.,1
9,This app is wonderful and very helpful.,1


In [13]:
# =========================================================
# 3. Basic Text Cleaning Function
# =========================================================
def clean_text(t):
    t = t.lower()
    t = re.sub(r"[^a-zA-Z\s]", " ", t)   # remove special chars
    t = re.sub(r"\s+", " ", t).strip()   # remove extra spaces
    return t

df["clean_text"] = df["text"].apply(clean_text)

df

Unnamed: 0,text,label,clean_text
0,"I really loved this product, it works perfectly!",1,i really loved this product it works perfectly
1,This is the best service I have used.,1,this is the best service i have used
2,"Amazing experience, highly recommended.",1,amazing experience highly recommended
3,I am very happy with the results.,1,i am very happy with the results
4,The quality is outstanding and I feel great.,1,the quality is outstanding and i feel great
5,"This made my day, absolutely fantastic.",1,this made my day absolutely fantastic
6,"Superb performance, I would buy it again.",1,superb performance i would buy it again
7,Everything was perfect and smooth.,1,everything was perfect and smooth
8,I enjoyed using this tool a lot.,1,i enjoyed using this tool a lot
9,This app is wonderful and very helpful.,1,this app is wonderful and very helpful


In [14]:
# =========================================================
# 4. Train-Test Split
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42
)


In [15]:
# =========================================================
# 5. TF-IDF Vectorization
# =========================================================
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [16]:
# =========================================================
# 6. Train Logistic Regression Classifier
# =========================================================
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# =========================================================
# 7. Evaluation
# =========================================================
y_pred = model.predict(X_test_tfidf)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.50      0.50      0.50         2

    accuracy                           0.50         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.50      0.50      0.50         4

