In [8]:
import pandas as pd
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

nltk.download('words')  # list of common english words
nltk.download('wordnet')  # large thesaurus with word relationships
nltk.download('punkt')  # required for tokenization
nltk.download('stopwords')  # list of words like "the" and "a"

[nltk_data] Downloading package words to
[nltk_data]     /Users/aryamantepal/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aryamantepal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aryamantepal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryamantepal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
feedback_data = pd.read_csv("/Users/aryamantepal/Documents/programs/Breakthrough Tech AI MIT/AI Studio/Customer_Sentiment_Analysis/Datasets/feedbackData.csv", sep="\t")

print(feedback_data.head())

    Timestamp  Rating                                           Feedback  \
0  2024-01-05      10                                          adfjhrfb    
1  2024-04-27       1                                             test12   
2  2024-03-12       5                                    or480r fk qkefb   
3  2024-04-22       6  The service is decent, but it's not without it...   
4  2024-02-14       7  The new features are incredibly useful and eas...   

                               UserId  
0                       Reply@DHL.com  
1                    Reply@Prozac.com  
2  Reply@Lakeport Brewing Company.com  
3                  Reply@Informix.com  
4                     Reply@Gucci.com  


In [4]:
print(feedback_data['Feedback'].dtype)

feedback_data['Feedback'] = feedback_data['Feedback'].astype(str)

print(feedback_data['Feedback'].dtype)


object
object


Checking for Missing Values

In [10]:
nan_count = np.sum(feedback_data.isnull(), axis = 0)
nan_count

Timestamp    0
Rating       0
Feedback     0
UserId       0
dtype: int64

In [5]:
english_words = set(words.words()) # defined set outside of function to save time

def contains_english_word(text):
    text_words = set(text.lower().split())
    return bool(english_words.intersection(text_words))

filtered_data = feedback_data[feedback_data['Feedback'].apply(contains_english_word)]

print(filtered_data.head())

    Timestamp  Rating                                           Feedback  \
3  2024-04-22       6  The service is decent, but it's not without it...   
4  2024-02-14       7  The new features are incredibly useful and eas...   
5  2024-04-03       8  I can't say enough good things about this serv...   
6  2024-04-06       6             Does the job, but nothing outstanding.   
7  2024-02-18       2   The interface is confusing and hard to navigate.   

                   UserId  
3      Reply@Informix.com  
4         Reply@Gucci.com  
5    Reply@Lumencraft.com  
6  Reply@Martin Print.com  
7       Reply@Biotech.com  


In [6]:
## Lemmatizing to get words base form in new column

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = nltk.word_tokenize(text)  # tokenizing
    lemmas = [lemmatizer.lemmatize(token).lower() for token in tokens]
    return " ".join(lemmas)

filtered_data['lem_feedback'] = filtered_data['Feedback'].apply(lemmatize_text)

print(filtered_data.head())

    Timestamp  Rating                                           Feedback  \
3  2024-04-22       6  The service is decent, but it's not without it...   
4  2024-02-14       7  The new features are incredibly useful and eas...   
5  2024-04-03       8  I can't say enough good things about this serv...   
6  2024-04-06       6             Does the job, but nothing outstanding.   
7  2024-02-18       2   The interface is confusing and hard to navigate.   

                   UserId                                       lem_feedback  
3      Reply@Informix.com  the service is decent , but it 's not without ...  
4         Reply@Gucci.com  the new feature are incredibly useful and easy...  
5    Reply@Lumencraft.com  i ca n't say enough good thing about this serv...  
6  Reply@Martin Print.com           does the job , but nothing outstanding .  
7       Reply@Biotech.com  the interface is confusing and hard to navigate .  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['lem_feedback'] = filtered_data['Feedback'].apply(lemmatize_text)


In [30]:
## Removing stopwords to only have core words

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

filtered_data['feedback_prepped'] = filtered_data['lem_feedback'].apply(remove_stopwords)

print(filtered_data.head())



    Timestamp  Rating                                           Feedback  \
3  2024-04-22       6  The service is decent, but it's not without it...   
4  2024-02-14       7  The new features are incredibly useful and eas...   
5  2024-04-03       8  I can't say enough good things about this serv...   
6  2024-04-06       6             Does the job, but nothing outstanding.   
7  2024-02-18       2   The interface is confusing and hard to navigate.   

                   UserId                                       lem_feedback  \
3      Reply@Informix.com  the service is decent , but it 's not without ...   
4         Reply@Gucci.com  the new feature are incredibly useful and easy...   
5    Reply@Lumencraft.com  i ca n't say enough good thing about this serv...   
6  Reply@Martin Print.com           does the job , but nothing outstanding .   
7       Reply@Biotech.com  the interface is confusing and hard to navigate .   

                                    feedback_prepped  
3  serv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['feedback_prepped'] = filtered_data['lem_feedback'].apply(remove_stopwords)


In [33]:
filtered_data.to_csv('/feedback_data_cleaned.csv', index=False)