<a href="https://colab.research.google.com/github/Cutie-tee/Roboreviews_project/blob/main/reviews_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset consists of 3 files: 1429_1.csv
Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv
Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv




In [5]:
!pip install --upgrade pandas




In [6]:
import pandas as pd

def safe_read_csv(file_path):
    """
    Safely reads a CSV file by handling parsing issues.
    """
    try:
        return pd.read_csv(file_path, low_memory=False, on_bad_lines='skip', quotechar='"', escapechar='\\')
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return None

# Load datasets safely
file1_data = safe_read_csv('1429_1.csv')
file2_data = safe_read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
file3_data = safe_read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

# Check if all files loaded successfully
if file1_data is not None and file2_data is not None and file3_data is not None:
    # Standardizing column names
    file1_data.rename(columns=lambda x: x.strip(), inplace=True)
    file2_data.rename(columns=lambda x: x.strip(), inplace=True)
    file3_data.rename(columns=lambda x: x.strip(), inplace=True)

    # Align datasets to common columns
    common_columns = list(set(file1_data.columns) & set(file2_data.columns) & set(file3_data.columns))

    # Selecting only common columns
    file1_data = file1_data[common_columns]
    file2_data = file2_data[common_columns]
    file3_data = file3_data[common_columns]

    # Concatenate datasets
    combined_data = pd.concat([file1_data, file2_data, file3_data], ignore_index=True)

    # Dropping duplicates
    combined_data.drop_duplicates(inplace=True)

    # Resetting index
    combined_data.reset_index(drop=True, inplace=True)

    # Save cleaned dataset
    combined_data.to_csv('combined_reviews_cleaned.csv', index=False)

    # Display overview
    print("Dataset successfully cleaned and saved.")
    print(combined_data.info())
else:
    print("One or more files could not be loaded.")




Dataset successfully cleaned and saved.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67351 entries, 0 to 67350
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   reviews.title        67332 non-null  object 
 1   reviews.sourceURLs   67351 non-null  object 
 2   asins                67349 non-null  object 
 3   reviews.text         67350 non-null  object 
 4   manufacturer         67351 non-null  object 
 5   reviews.date         67312 non-null  object 
 6   categories           67351 non-null  object 
 7   brand                67351 non-null  object 
 8   id                   67351 non-null  object 
 9   reviews.rating       67318 non-null  float64
 10  keys                 67351 non-null  object 
 11  reviews.id           71 non-null     float64
 12  reviews.dateSeen     67351 non-null  object 
 13  reviews.username     67339 non-null  object 
 14  reviews.doRecommend  54511 non-null  object 
 

In [7]:
import pandas as pd

def debug_csv_in_chunks(file_path, chunk_size=1000):
    """
    Debug a CSV file in chunks to identify problematic rows.
    """
    problematic_chunks = []
    chunk_number = 0

    try:
        for chunk in pd.read_csv(
            file_path, chunksize=chunk_size, low_memory=False, encoding='utf-8', on_bad_lines='skip'
        ):
            chunk_number += 1
            # Check if the chunk is read successfully
            chunk.head()  # Access the chunk to validate
    except Exception as e:
        problematic_chunks.append((chunk_number, str(e)))
        print(f"Error in chunk {chunk_number}: {e}")

    return problematic_chunks

# Path to the problematic file
problematic_file_path = 'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'

# Run the debugging process
problematic_chunks = debug_csv_in_chunks(problematic_file_path)
print("Problematic chunks identified:", problematic_chunks)

def handle_bad_lines(bad_line):
    """
    Custom error handler for bad lines in the CSV.
    Attempts to fix string enclosure issues.
    """
    try:
        # Assume the issue is with unescaped quotes within a string field
        # This is a simple example, you may need more robust logic based on your data
        bad_line = bad_line.replace('"', '\\"')
        return bad_line
    except Exception as e:
        print(f"Error handling bad line: {e}, Line: {bad_line}")
        return None  # Or choose to skip the line entirely


# Read the file with skipping problematic lines and applying custom error handling

cleaned_data = pd.read_csv(problematic_file_path, encoding='utf-8', on_bad_lines=handle_bad_lines, engine='python')

# Save the cleaned file
cleaned_data.to_csv('Datafiniti_Cleaned_May19.csv', index=False)
print("Cleaned data saved as 'Datafiniti_Cleaned_May19.csv'")


Problematic chunks identified: []
Cleaned data saved as 'Datafiniti_Cleaned_May19.csv'


Next Steps:
Now that the problematic file has been cleaned and saved as Datafiniti_Cleaned_May19.csv, l

1.  inspect the cleaned file to ensure it's ready for analysis:
Check for missing values.
Display an overview of the data.

2. Merge Cleaned Data with Other Files
Combine the cleaned file with the previously processed datasets to create a unified dataset.

3. Sentiment Classification and Clustering


In [8]:
import pandas as pd

# Paths to datasets
file1_path = '1429_1.csv'
file2_path = 'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv'
cleaned_file_path = 'Datafiniti_Cleaned_May19.csv'

# Load datasets
try:
    file1_data = pd.read_csv(file1_path, encoding='utf-8', on_bad_lines='skip')
    file2_data = pd.read_csv(file2_path, encoding='utf-8', on_bad_lines='skip')
    cleaned_data = pd.read_csv(cleaned_file_path, encoding='utf-8')

    # Standardize column names across datasets
    file1_data.rename(columns=lambda x: x.strip(), inplace=True)
    file2_data.rename(columns=lambda x: x.strip(), inplace=True)
    cleaned_data.rename(columns=lambda x: x.strip(), inplace=True)

    # Find common columns for merging
    common_columns = list(set(file1_data.columns) & set(file2_data.columns) & set(cleaned_data.columns))

    # Select only common columns
    file1_data = file1_data[common_columns]
    file2_data = file2_data[common_columns]
    cleaned_data = cleaned_data[common_columns]

    # Concatenate datasets
    combined_data = pd.concat([file1_data, file2_data, cleaned_data], ignore_index=True)

    # Drop duplicates
    combined_data.drop_duplicates(inplace=True)

    # Reset index
    combined_data.reset_index(drop=True, inplace=True)

    # Save the merged dataset
    combined_data.to_csv('Merged_Reviews_Dataset.csv', index=False)

    print("Merged dataset saved as 'Merged_Reviews_Dataset.csv'")

except Exception as e:
    print(f"An error occurred: {e}")


  file1_data = pd.read_csv(file1_path, encoding='utf-8', on_bad_lines='skip')


Merged dataset saved as 'Merged_Reviews_Dataset.csv'


**Sentinent analysis**

In [9]:
# Load dataset with low_memory=False
combined_data = pd.read_csv("Merged_Reviews_Dataset.csv", low_memory=False)


In [11]:
#Goal is to classify customer reviews into three sentiment categories
from transformers import pipeline
from tqdm import tqdm
import pandas as pd

# Load dataset with low_memory=False
combined_data = pd.read_csv("Merged_Reviews_Dataset.csv", low_memory=False)

# Load sentiment analysis pipeline with GPU acceleration
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0
)

# Define a batch size for processing
BATCH_SIZE = 64

# Drop missing reviews
combined_data = combined_data.dropna(subset=['reviews.text'])

# Function to truncate reviews to 512 tokens
def truncate_text(text, max_length=512):
    return text[:max_length]

# Apply truncation to the review text
combined_data['reviews.text'] = combined_data['reviews.text'].apply(lambda x: truncate_text(x, max_length=512))

# Perform sentiment analysis in batches
sentiments = []
for i in tqdm(range(0, len(combined_data), BATCH_SIZE), desc="Processing Batches"):
    batch = combined_data['reviews.text'].iloc[i:i + BATCH_SIZE].tolist()
    results = sentiment_analyzer(batch)
    sentiments.extend([result['label'] for result in results])

# Add sentiment results to the DataFrame
combined_data['Sentiment'] = sentiments

# Save the dataset with sentiment labels
combined_data.to_csv("Dataset_with_Sentiment.csv", index=False)
print("Sentiment classification complete. Dataset saved as 'Dataset_with_Sentiment.csv'")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data['reviews.text'] = combined_data['reviews.text'].apply(lambda x: truncate_text(x, max_length=512))
Processing Batches: 100%|██████████| 1053/1053 [06:28<00:00,  2.71it/s]


Sentiment classification complete. Dataset saved as 'Dataset_with_Sentiment.csv'
