In [1]:
!pip install nltk scikit-learn pandas



In [2]:
#imports

import nltk
import re
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages used by this notebook. Each call logs its
# progress, and the return value of the final call is what the cell displays.
nltk.download('punkt')      # tokenizer models; note the visible cells split text with str.split()
nltk.download('stopwords')  # corpus read through stopwords.words('english')
nltk.download('wordnet')    # WordNet data backing WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual Wordnet; presumably needed alongside wordnet - confirm for the installed NLTK version

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
# sample dataset

# Each record pairs one review with its sentiment label so the two stay aligned.
labelled_reviews = [
    ("The battery life is amazing and the setup was very easy!", "positive"),
    ("I am disappointed with the connectivity, it keeps dropping.", "negative"),
    ("The device works as expected but the price is a bit high.", "neutral"),
    ("Worst purchase ever, the app crashes every time I open it.", "negative"),
    ("Excellent build quality and very fast response times.", "positive"),
    ("It's an okay product, nothing special but does the job.", "neutral"),
]

# Column-oriented view of the records: one list of texts, one list of labels.
data = {
    "text": [review for review, _ in labelled_reviews],
    "label": [sentiment for _, sentiment in labelled_reviews],
}

# Two-column frame ("text", "label") that the following cells extend.
df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,The battery life is amazing and the setup was ...,positive
1,"I am disappointed with the connectivity, it ke...",negative
2,The device works as expected but the price is ...,neutral
3,"Worst purchase ever, the app crashes every tim...",negative
4,Excellent build quality and very fast response...,positive
5,"It's an okay product, nothing special but does...",neutral


In [4]:
# Text cleaning function

def clean_text(text):
    """Normalize raw text to lowercase a-z words separated by single spaces.

    Apostrophes are dropped so a contraction stays one token ("It's" -> "its").
    Every other run of characters that is not a-z or whitespace is replaced by
    a space instead of being deleted, so words joined only by punctuation or
    digits ("good/bad", "crashes.Every") are not fused into a single token.
    Whitespace is then collapsed to single spaces and trimmed.

    Args:
        text: The raw string to clean. Non-string values (for example None or
            a NaN from a missing cell) are treated as empty text.

    Returns:
        The cleaned string, or "" when nothing remains.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Straight and curly apostrophes are removed outright to keep contractions whole.
    text = re.sub(r"['\u2019]", '', text)
    # Anything else that is not a letter or whitespace becomes a word boundary.
    text = re.sub(r'[^a-z\s]+', ' ', text)
    # split()/join collapses the padding introduced above and trims the ends.
    return " ".join(text.split())

# Run the cleaner over every raw review and store the result beside the original text.
df['clean_text'] = df['text'].map(clean_text)
df

Unnamed: 0,text,label,clean_text
0,The battery life is amazing and the setup was ...,positive,the battery life is amazing and the setup was ...
1,"I am disappointed with the connectivity, it ke...",negative,i am disappointed with the connectivity it kee...
2,The device works as expected but the price is ...,neutral,the device works as expected but the price is ...
3,"Worst purchase ever, the app crashes every tim...",negative,worst purchase ever the app crashes every time...
4,Excellent build quality and very fast response...,positive,excellent build quality and very fast response...
5,"It's an okay product, nothing special but does...",neutral,its an okay product nothing special but does t...


In [5]:
# Stop word removal
# English stop words from the NLTK corpus, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the tokens that appear in `stop_words`.

    The text is split on whitespace, tokens that exactly match an entry of the
    module-level `stop_words` set are dropped, and the remaining tokens are
    joined back together with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Filter stop words out of the cleaned text and keep the result in its own column.
df['no_stopwords'] = df['clean_text'].map(remove_stopwords)
df

Unnamed: 0,text,label,clean_text,no_stopwords
0,The battery life is amazing and the setup was ...,positive,the battery life is amazing and the setup was ...,battery life amazing setup easy
1,"I am disappointed with the connectivity, it ke...",negative,i am disappointed with the connectivity it kee...,disappointed connectivity keeps dropping
2,The device works as expected but the price is ...,neutral,the device works as expected but the price is ...,device works expected price bit high
3,"Worst purchase ever, the app crashes every tim...",negative,worst purchase ever the app crashes every time...,worst purchase ever app crashes every time open
4,Excellent build quality and very fast response...,positive,excellent build quality and very fast response...,excellent build quality fast response times
5,"It's an okay product, nothing special but does...",neutral,its an okay product nothing special but does t...,okay product nothing special job


In [6]:
# Lemmatization
# Single shared lemmatizer instance reused for every row.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with spaces.

    `lemmatize` is called with the word only, so the lemmatizer's default part
    of speech applies (noun, per NLTK's documented default). In the displayed
    output "keeps" becomes "keep" while "dropping" and "expected" are unchanged.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

# Lemmatize the stop-word-free text; this column is the input to TF-IDF below.
df['lemmatized_text'] = df['no_stopwords'].map(lemmatize_text)
df

Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized_text
0,The battery life is amazing and the setup was ...,positive,the battery life is amazing and the setup was ...,battery life amazing setup easy,battery life amazing setup easy
1,"I am disappointed with the connectivity, it ke...",negative,i am disappointed with the connectivity it kee...,disappointed connectivity keeps dropping,disappointed connectivity keep dropping
2,The device works as expected but the price is ...,neutral,the device works as expected but the price is ...,device works expected price bit high,device work expected price bit high
3,"Worst purchase ever, the app crashes every tim...",negative,worst purchase ever the app crashes every time...,worst purchase ever app crashes every time open,worst purchase ever app crash every time open
4,Excellent build quality and very fast response...,positive,excellent build quality and very fast response...,excellent build quality fast response times,excellent build quality fast response time
5,"It's an okay product, nothing special but does...",neutral,its an okay product nothing special but does t...,okay product nothing special job,okay product nothing special job


In [7]:
# Label Encoding
# Map each sentiment string to an integer code. The encoder is kept so the
# codes can be translated back to their labels later.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

# Show the labels beside their codes; in the displayed result
# negative -> 0, neutral -> 1, positive -> 2.
df.loc[:, ['label', 'encoded_label']]

Unnamed: 0,label,encoded_label
0,positive,2
1,negative,0
2,neutral,1
3,negative,0
4,positive,2
5,neutral,1


In [8]:
# TF-IDF Representation
# Learn the vocabulary and IDF weights from the lemmatized text and build the
# document-term matrix: one row per review, one column per term.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['lemmatized_text'])

# Densify for display/export. The frame reuses df's index so every TF-IDF row
# carries the same row label as the review it came from. A column-wise concat
# aligns on the index, so a default 0..n-1 index would misalign rows (and fill
# them with NaN) if df were ever filtered or re-indexed.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out()
)

tfidf_df

Unnamed: 0,amazing,app,battery,bit,build,connectivity,crash,device,disappointed,dropping,easy,ever,every,excellent,expected,fast,high,job,keep,life,nothing,okay,open,price,product,purchase,quality,response,setup,special,time,work,worst
0,0.447214,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0
3,0.0,0.361022,0.0,0.0,0.0,0.0,0.361022,0.0,0.0,0.0,0.0,0.361022,0.361022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.361022,0.0,0.0,0.361022,0.0,0.0,0.0,0.0,0.296043,0.0,0.361022
4,0.0,0.0,0.0,0.0,0.419871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419871,0.0,0.419871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419871,0.419871,0.0,0.0,0.3443,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0


In [9]:
# Combine Final Output
# Place the TF-IDF feature columns to the right of the text/label columns.
# Rows are matched by index label.
# NOTE(review): a TF-IDF term equal to an existing column name (e.g. "text" or
# "label") would produce duplicate column labels - confirm whether a prefix is wanted.
final_df = pd.concat((df, tfidf_df), axis="columns")
final_df


Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized_text,encoded_label,amazing,app,battery,bit,build,connectivity,crash,device,disappointed,dropping,easy,ever,every,excellent,expected,fast,high,job,keep,life,nothing,okay,open,price,product,purchase,quality,response,setup,special,time,work,worst
0,The battery life is amazing and the setup was ...,positive,the battery life is amazing and the setup was ...,battery life amazing setup easy,battery life amazing setup easy,2,0.447214,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
1,"I am disappointed with the connectivity, it ke...",negative,i am disappointed with the connectivity it kee...,disappointed connectivity keeps dropping,disappointed connectivity keep dropping,0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The device works as expected but the price is ...,neutral,the device works as expected but the price is ...,device works expected price bit high,device work expected price bit high,1,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.0
3,"Worst purchase ever, the app crashes every tim...",negative,worst purchase ever the app crashes every time...,worst purchase ever app crashes every time open,worst purchase ever app crash every time open,0,0.0,0.361022,0.0,0.0,0.0,0.0,0.361022,0.0,0.0,0.0,0.0,0.361022,0.361022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.361022,0.0,0.0,0.361022,0.0,0.0,0.0,0.0,0.296043,0.0,0.361022
4,Excellent build quality and very fast response...,positive,excellent build quality and very fast response...,excellent build quality fast response times,excellent build quality fast response time,2,0.0,0.0,0.0,0.0,0.419871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419871,0.0,0.419871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419871,0.419871,0.0,0.0,0.3443,0.0,0.0
5,"It's an okay product, nothing special but does...",neutral,its an okay product nothing special but does t...,okay product nothing special job,okay product nothing special job,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0


In [10]:
# Save Output to File
# Write the combined frame to a CSV in the working directory; the row index is
# left out of the file. The message is printed once the write has returned.
final_df.to_csv(path_or_buf="lab3_output.csv", index=False)
print("File Saved Successfully!")

File Saved Successfully!
