In [2]:
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

In [3]:
# Fetch the NLTK resources used below: tokenizer models, the stop-word list and WordNet.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of 'punkt';
# requesting both keeps word_tokenize working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Abid
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Abid
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Abid
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Build a sample DataFrame from a dictionary

In [4]:

# Three short example sentences used to demonstrate each preprocessing step.
data = dict(
    text=[
        "Natural Language Processing (NLP) is a field of artificial intelligence!",
        "NLP involves the interaction between computers and humans using natural language.",
        "The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language.",
    ],
)

# One row per sentence, in a single 'text' column.
df = pd.DataFrame(data)
print(df)

                                                text
0  Natural Language Processing (NLP) is a field o...
1  NLP involves the interaction between computers...
2  The ultimate goal of NLP is to enable computer...


# Data Preprocessing

### Tokenized

In [5]:
# Split every sentence into a list of tokens with NLTK's word tokenizer.
df['tokenized'] = df['text'].apply(word_tokenize)
print(df.loc[:, ['text', 'tokenized']])

                                                text  \
0  Natural Language Processing (NLP) is a field o...   
1  NLP involves the interaction between computers...   
2  The ultimate goal of NLP is to enable computer...   

                                           tokenized  
0  [Natural, Language, Processing, (, NLP, ), is,...  
1  [NLP, involves, the, interaction, between, com...  
2  [The, ultimate, goal, of, NLP, is, to, enable,...  


### lowercase

In [6]:
# Lower-case every sentence using the vectorized string accessor.
df['lowercase'] = df['text'].str.lower()
print(df.loc[:, ['text', 'lowercase']])

                                                text  \
0  Natural Language Processing (NLP) is a field o...   
1  NLP involves the interaction between computers...   
2  The ultimate goal of NLP is to enable computer...   

                                           lowercase  
0  natural language processing (nlp) is a field o...  
1  nlp involves the interaction between computers...  
2  the ultimate goal of nlp is to enable computer...  


### no punctuation 

In [7]:
# Deletion table for every character in string.punctuation, built once and reused per row.
punctuation_table = str.maketrans('', '', string.punctuation)
df['no_punctuation'] = df['lowercase'].apply(lambda sentence: sentence.translate(punctuation_table))
print(df.loc[:, ['lowercase', 'no_punctuation']])

                                           lowercase  \
0  natural language processing (nlp) is a field o...   
1  nlp involves the interaction between computers...   
2  the ultimate goal of nlp is to enable computer...   

                                      no_punctuation  
0  natural language processing nlp is a field of ...  
1  nlp involves the interaction between computers...  
2  the ultimate goal of nlp is to enable computer...  


### no stopwords

In [8]:
# A set gives constant-time membership checks for the per-token filter below.
stop_words = set(stopwords.words('english'))


def _without_stopwords(sentence):
    """Return `sentence` with every whitespace-separated stop word dropped."""
    kept = filter(lambda token: token not in stop_words, sentence.split())
    return ' '.join(kept)


df['no_stopwords'] = df['no_punctuation'].apply(_without_stopwords)
print(df.loc[:, ['no_punctuation', 'no_stopwords']])

                                      no_punctuation  \
0  natural language processing nlp is a field of ...   
1  nlp involves the interaction between computers...   
2  the ultimate goal of nlp is to enable computer...   

                                        no_stopwords  
0  natural language processing nlp field artifici...  
1  nlp involves interaction computers humans usin...  
2  ultimate goal nlp enable computers understand ...  


### stemmed

In [9]:
# Reduce each remaining word to its Porter stem and re-join with single spaces.
stemmer = PorterStemmer()
df['stemmed'] = df['no_stopwords'].apply(
    lambda sentence: ' '.join(map(stemmer.stem, sentence.split()))
)
print(df.loc[:, ['no_stopwords', 'stemmed']])

                                        no_stopwords  \
0  natural language processing nlp field artifici...   
1  nlp involves interaction computers humans usin...   
2  ultimate goal nlp enable computers understand ...   

                                             stemmed  
0  natur languag process nlp field artifici intellig  
1  nlp involv interact comput human use natur lan...  
2  ultim goal nlp enabl comput understand interpr...  


### lemmatized 

In [10]:
# Lemmatize from 'no_stopwords' (not from 'stemmed'), so this is an alternative to stemming.
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['no_stopwords'].apply(
    lambda sentence: ' '.join(map(lemmatizer.lemmatize, sentence.split()))
)
print(df.loc[:, ['no_stopwords', 'lemmatized']])

                                        no_stopwords  \
0  natural language processing nlp field artifici...   
1  nlp involves interaction computers humans usin...   
2  ultimate goal nlp enable computers understand ...   

                                          lemmatized  
0  natural language processing nlp field artifici...  
1  nlp involves interaction computer human using ...  
2  ultimate goal nlp enable computer understand i...  


### no special characters (currently removes digits only)

In [11]:
# Only runs of digits are stripped at this step; every other character passes through.
digit_run = re.compile(r'\d+')
df['no_special_char'] = df['lemmatized'].apply(lambda sentence: digit_run.sub('', sentence))
print(df.loc[:, ['lemmatized', 'no_special_char']])

                                          lemmatized  \
0  natural language processing nlp field artifici...   
1  nlp involves interaction computer human using ...   
2  ultimate goal nlp enable computer understand i...   

                                     no_special_char  
0  natural language processing nlp field artifici...  
1  nlp involves interaction computer human using ...  
2  ultimate goal nlp enable computer understand i...  


### cleaned text 

In [15]:
# Split on any whitespace and re-join with single spaces to normalize spacing.
df['cleaned_text'] = df['no_special_char'].str.split().str.join(' ')
print(df.loc[:, ['no_special_char', 'cleaned_text']])

                                     no_special_char  \
0  natural language processing nlp field artifici...   
1  nlp involves interaction computer human using ...   
2  ultimate goal nlp enable computer understand i...   

                                        cleaned_text  
0  natural language processing nlp field artifici...  
1  nlp involves interaction computer human using ...  
2  ultimate goal nlp enable computer understand i...  


# Importing Dataset

In [17]:
import pandas as pd
# Relative path, so it resolves against the notebook's current working directory.
file_path = 'ECO.csv'
# NOTE: this rebinds `df`, replacing the small demo DataFrame built in the cells above.
df = pd.read_csv(file_path)

# Last expression of the cell: renders the first five rows as a quick sanity check.
df.head()


Unnamed: 0,Index,Reviews
0,1,Alexa cannot hear after she starts playing
1,2,I purchased this as a birthday gift for my 7 y...
2,3,"/*Here I'm Uploading video, enjoy*/Most idioti..."
3,4,Do not buy this product. When i asked alexa t...
4,5,Its just one if the best deal i ever got on am...


In [18]:
# Prints column dtypes and non-null counts; a non-null count below the row count
# means that column contains missing values.
df.info()

# Last expression of the cell: renders the first five rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4919 entries, 0 to 4918
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Index    4919 non-null   int64 
 1   Reviews  4918 non-null   object
dtypes: int64(1), object(1)
memory usage: 77.0+ KB


Unnamed: 0,Index,Reviews
0,1,Alexa cannot hear after she starts playing
1,2,I purchased this as a birthday gift for my 7 y...
2,3,"/*Here I'm Uploading video, enjoy*/Most idioti..."
3,4,Do not buy this product. When i asked alexa t...
4,5,Its just one if the best deal i ever got on am...


In [20]:
text_column = 'Reviews'

# A missing review is read as NaN, and str(NaN) is the literal text "nan", which
# would then flow through every step as if it were a real word. Map missing
# values to empty text instead; the source column itself is left untouched.
reviews = df[text_column].apply(lambda x: '' if pd.isna(x) else str(x))

# One-off setup, built once up front rather than once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Preprocessing steps
df['tokenized'] = reviews.apply(word_tokenize)
df['lowercase'] = reviews.str.lower()
df['no_punctuation'] = df['lowercase'].apply(lambda x: x.translate(punctuation_table))
df['no_stopwords'] = df['no_punctuation'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
# Stemming and lemmatization are alternatives: both start from 'no_stopwords'.
df['stemmed'] = df['no_stopwords'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['lemmatized'] = df['no_stopwords'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
# Only digit runs are removed at this step; punctuation was already stripped above.
df['no_special_char'] = df['lemmatized'].apply(lambda x: re.sub(r'\d+', '', x))
# Collapse any whitespace runs to single spaces.
df['cleaned_text'] = df['no_special_char'].apply(lambda x: ' '.join(x.split()))

# Display the first few rows of the dataframe with preprocessing applied
df_display = df[[text_column, 'tokenized', 'lowercase', 'no_punctuation', 'no_stopwords', 'stemmed', 'lemmatized', 'cleaned_text']]
df_display.head()

Unnamed: 0,Reviews,tokenized,lowercase,no_punctuation,no_stopwords,stemmed,lemmatized,cleaned_text
0,Alexa cannot hear after she starts playing,"[Alexa, can, not, hear, after, she, starts, pl...",alexa cannot hear after she starts playing,alexa cannot hear after she starts playing,alexa cannot hear starts playing,alexa cannot hear start play,alexa cannot hear start playing,alexa cannot hear start playing
1,I purchased this as a birthday gift for my 7 y...,"[I, purchased, this, as, a, birthday, gift, fo...",i purchased this as a birthday gift for my 7 y...,i purchased this as a birthday gift for my 7 y...,purchased birthday gift 7 years old son since ...,purchas birthday gift 7 year old son sinc dont...,purchased birthday gift 7 year old son since d...,purchased birthday gift year old son since don...
2,"/*Here I'm Uploading video, enjoy*/Most idioti...","[/, *, Here, I, 'm, Uploading, video, ,, enjoy...","/*here i'm uploading video, enjoy*/most idioti...",here im uploading video enjoymost idiotic devi...,im uploading video enjoymost idiotic device ev...,im upload video enjoymost idiot devic everi bo...,im uploading video enjoymost idiotic device ev...,im uploading video enjoymost idiotic device ev...
3,Do not buy this product. When i asked alexa t...,"[Do, not, buy, this, product, ., When, i, aske...",do not buy this product. when i asked alexa t...,do not buy this product when i asked alexa th...,buy product asked alexa kashmir part country s...,buy product ask alexa kashmir part countri sai...,buy product asked alexa kashmir part country s...,buy product asked alexa kashmir part country s...
4,Its just one if the best deal i ever got on am...,"[Its, just, one, if, the, best, deal, i, ever,...",its just one if the best deal i ever got on am...,its just one if the best deal i ever got on am...,one best deal ever got amazon purchased 12 wat...,one best deal ever got amazon purchas 12 watt ...,one best deal ever got amazon purchased 12 wat...,one best deal ever got amazon purchased watt w...
