Understand dataset structure and content

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Reviews.csv')

# Display basic information about the DataFrame (columns, data types, non-null values)
print("DataFrame Info:")
df.info()

# Display the size of the DataFrame (number of rows, number of columns)
print("\nDataFrame Shape:")
print(df.shape)

# Display the first 5 sample records
print("\nFirst 5 Rows of the DataFrame:")
display(df.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB

DataFrame Shape:
(568454, 10)

First 5 Rows of the DataFrame:


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Select columns required for NLP and prediction.


In [3]:
# Select the desired columns: 'Summary', 'Text', and 'Score'
df = df[['Summary', 'Text', 'Score']]

# Display the first 5 rows of the updated DataFrame
print("\nFirst 5 Rows of the DataFrame after column selection:")
display(df.head())

# Display information about the updated DataFrame to confirm column changes
print("\nDataFrame Info after column selection:")
df.info()


First 5 Rows of the DataFrame after column selection:


Unnamed: 0,Summary,Text,Score
0,Good Quality Dog Food,I have bought several of the Vitality canned d...,5
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1
2,"""Delight"" says it all",This is a confection that has been around a fe...,4
3,Cough Medicine,If you are looking for the secret ingredient i...,2
4,Great taffy,Great taffy at a great price. There was a wid...,5



DataFrame Info after column selection:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Summary  568427 non-null  object
 1   Text     568454 non-null  object
 2   Score    568454 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 13.0+ MB


Remove incomplete reviews.

In [4]:
# Remove rows where 'Summary' or 'Text' columns have missing values
df.dropna(subset=['Summary', 'Text'], inplace=True)

# Display information about the DataFrame after dropping nulls to confirm changes
print("\nDataFrame Info after dropping rows with missing Summary or Text:")
df.info()

# Display the new shape of the DataFrame
print("\nDataFrame Shape after dropping rows with missing Summary or Text:")
print(df.shape)


display(df.head())


DataFrame Info after dropping rows with missing Summary or Text:
<class 'pandas.core.frame.DataFrame'>
Index: 568427 entries, 0 to 568453
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Summary  568427 non-null  object
 1   Text     568427 non-null  object
 2   Score    568427 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 17.3+ MB

DataFrame Shape after dropping rows with missing Summary or Text:
(568427, 3)


Unnamed: 0,Summary,Text,Score
0,Good Quality Dog Food,I have bought several of the Vitality canned d...,5
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1
2,"""Delight"" says it all",This is a confection that has been around a fe...,4
3,Cough Medicine,If you are looking for the secret ingredient i...,2
4,Great taffy,Great taffy at a great price. There was a wid...,5


Normalize word formats.

In [5]:
# Convert 'Summary' and 'Text' columns to lowercase
df['Summary'] = df['Summary'].str.lower()
df['Text'] = df['Text'].str.lower()

# Display the first few rows to verify the change
print("\nFirst 5 Rows of the DataFrame after converting text to lowercase:")
display(df.head())


First 5 Rows of the DataFrame after converting text to lowercase:


Unnamed: 0,Summary,Text,Score
0,good quality dog food,i have bought several of the vitality canned d...,5
1,not as advertised,product arrived labeled as jumbo salted peanut...,1
2,"""delight"" says it all",this is a confection that has been around a fe...,4
3,cough medicine,if you are looking for the secret ingredient i...,2
4,great taffy,great taffy at a great price. there was a wid...,5


Removing noisy symbols from text.

In [6]:
import re
import string

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the cleaning function to 'Summary' and 'Text' columns
df['Summary'] = df['Summary'].apply(clean_text)
df['Text'] = df['Text'].apply(clean_text)

# Display the first few rows to verify the changes
print("\nFirst 5 Rows of the DataFrame after text cleaning:")
display(df.head(10))


First 5 Rows of the DataFrame after text cleaning:


Unnamed: 0,Summary,Text,Score
0,good quality dog food,i have bought several of the vitality canned d...,5
1,not as advertised,product arrived labeled as jumbo salted peanut...,1
2,delight says it all,this is a confection that has been around a fe...,4
3,cough medicine,if you are looking for the secret ingredient i...,2
4,great taffy,great taffy at a great price there was a wide ...,5
5,nice taffy,i got a wild hair for taffy and ordered this f...,4
6,great just as good as the expensive brands,this saltwater taffy had great flavors and was...,5
7,wonderful tasty taffy,this taffy is so good it is very soft and chew...,5
8,yay barley,right now im mostly just sprouting this so my ...,5
9,healthy dog food,this is a very healthy dog food good for their...,5


In [7]:
display(df.head(20))

Unnamed: 0,Summary,Text,Score
0,good quality dog food,i have bought several of the vitality canned d...,5
1,not as advertised,product arrived labeled as jumbo salted peanut...,1
2,delight says it all,this is a confection that has been around a fe...,4
3,cough medicine,if you are looking for the secret ingredient i...,2
4,great taffy,great taffy at a great price there was a wide ...,5
5,nice taffy,i got a wild hair for taffy and ordered this f...,4
6,great just as good as the expensive brands,this saltwater taffy had great flavors and was...,5
7,wonderful tasty taffy,this taffy is so good it is very soft and chew...,5
8,yay barley,right now im mostly just sprouting this so my ...,5
9,healthy dog food,this is a very healthy dog food good for their...,5


Removing non-informative words.

In [9]:

!pip install nltk
import nltk
from nltk.corpus import stopwords
# Download the stopwords dataset
try:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    print("NLTK stopwords downloaded successfully!")
except Exception as e:
    print(f"Error downloading NLTK stopwords: {e}")
def remove_stopwords(text):
    if not isinstance(text, str):  # Handle non-string values
        return ''
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])
# Apply the function to 'Summary' and 'Text' columns
df['Summary'] = df['Summary'].apply(remove_stopwords)
df['Text'] = df['Text'].apply(remove_stopwords)
# Display the first few rows to verify the changes
print("\nFirst 5 Rows of the DataFrame after removing stopwords:")
display(df.head())


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------- ----- 1.3/1.5 MB 11.1 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 9.4 MB/s  0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.2


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vedant\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


NLTK stopwords downloaded successfully!

First 5 Rows of the DataFrame after removing stopwords:


Unnamed: 0,Summary,Text,Score
0,good quality dog food,bought several vitality canned dog food produc...,5
1,advertised,product arrived labeled jumbo salted peanutsth...,1
2,delight says,confection around centuries light pillowy citr...,4
3,cough medicine,looking secret ingredient robitussin believe f...,2
4,great taffy,great taffy great price wide assortment yummy ...,5


Reduce words to base form.

In [16]:
# Install spaCy and download the English model
!pip install -U spacy
!python -m spacy download en_core_web_sm




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
     - 0 bytes ? 0:00:00
     - 0 bytes ? 0:00:00



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Wheel 'en-core-web-sm' located at C:\Users\Vedant\AppData\Local\Temp\pip-unpack-n53ds9n7\en_core_web_sm-3.8.0-py3-none-any.whl is invalid.


In [None]:
import spacy
import pandas as pd

# Load the English language model
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # If model is not found, download it
    import subprocess
    subprocess.run(['python', '-m', 'spacy', 'download', 'en_core_web_sm'], check=True)
    nlp = spacy.load('en_core_web_sm')

def lemmatize_with_spacy(text):
    if not isinstance(text, str) or not text.strip():
        return ''
    
    # Process the text with spaCy
    doc = nlp(text.lower())
    
    # Extract lemmas and filter out stopwords and punctuation
    lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    return ' '.join(lemmas)


# Apply to a small sample of your dataframe
df_sample = df.head().copy()
df_sample['Text_Lemmatized'] = df_sample['Text'].astype(str).apply(lemmatize_with_spacy)
display(df_sample[['Text', 'Text_Lemmatized']])

Unnamed: 0,Text,Text_Lemmatized
0,bought several vitality canned dog food produc...,buy vitality can dog food product find good qu...
1,product arrived labeled jumbo salted peanutsth...,product arrive label jumbo salt peanutsthe pea...
2,confection around centuries light pillowy citr...,confection century light pillowy citrus gelati...
3,looking secret ingredient robitussin believe f...,look secret ingredient robitussin believe find...
4,great taffy great price wide assortment yummy ...,great taffy great price wide assortment yummy ...


In [20]:
# First, let's process the data in batches to avoid memory issues
def process_in_batches(df, column, batch_size=1000):
    """
    Process a dataframe column in batches using spaCy's nlp.pipe for better performance
    """
    processed_texts = []
    
    # Process in batches
    for doc in nlp.pipe(df[column].astype(str), batch_size=batch_size, n_process=-1):
        # Get lemmas, remove stopwords and punctuation
        lemmas = [token.lemma_.lower() for token in doc 
                 if not token.is_stop and not token.is_punct and not token.is_space]
        processed_texts.append(' '.join(lemmas))
    
    return processed_texts

# Apply to both 'Summary' and 'Text' columns
print("Processing 'Summary' column...")
df['Summary_Lemmatized'] = process_in_batches(df, 'Summary')

print("Processing 'Text' column...")
df['Text_Lemmatized'] = process_in_batches(df, 'Text')

# Display the results
print("\nProcessing complete! Here are the first 5 rows with lemmatized text:")
display(df[['Summary', 'Summary_Lemmatized', 'Text', 'Text_Lemmatized']].head())

# Optional: Save the processed data to a new CSV file
# df.to_csv('amazon_reviews_lemmatized.csv', index=False) 

Processing 'Summary' column...
Processing 'Text' column...

Processing complete! Here are the first 5 rows with lemmatized text:


Unnamed: 0,Summary,Summary_Lemmatized,Text,Text_Lemmatized
0,good quality dog food,good quality dog food,bought several vitality canned dog food produc...,buy vitality can dog food product find good qu...
1,advertised,advertise,product arrived labeled jumbo salted peanutsth...,product arrive label jumbo salt peanutsthe pea...
2,delight says,delight say,confection around centuries light pillowy citr...,confection century light pillowy citrus gelati...
3,cough medicine,cough medicine,looking secret ingredient robitussin believe f...,look secret ingredient robitussin believe find...
4,great taffy,great taffy,great taffy great price wide assortment yummy ...,great taffy great price wide assortment yummy ...
