In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset (assuming it's in CSV format)
file_path = '/content/neutrosophic_reviews.txt'

# Parse the text file into a DataFrame
reviews = []
with open(file_path, 'r') as file:
    lines = file.readlines()
    for line in lines:
        parts = line.strip().split(',')
        review_id = int(parts[0])
        truth = float(parts[1])
        indeterminacy = float(parts[2])
        falsity = float(parts[3])
        reviews.append({
            'id': review_id,
            'truth': truth,
            'indeterminacy': indeterminacy,
            'falsity': falsity
        })

# Convert the list of dictionaries to a DataFrame
reviews_df = pd.DataFrame(reviews)

# Split the data into training and optimization sets
train_data, optimization_data = train_test_split(reviews_df, test_size=0.3, random_state=42)

# Display the size of each dataset
print(f'Training data size: {len(train_data)}')
print(f'Optimization data size: {len(optimization_data)}')

# Save the split datasets to CSV files (optional)
train_data.to_csv('/content/train_data.csv', index=False)
optimization_data.to_csv('/content/optimization_data.csv', index=False)


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = '/content/amazon_uk_shoes_products_dataset_2021_12.csv'
data = pd.read_csv(file_path)

In [5]:
print(data.head())
print(data.info())

                                      url  \
0  https://www.amazon.co.uk/dp/B07SBX32T5   
1  https://www.amazon.co.uk/dp/B07SBX32T5   
2  https://www.amazon.co.uk/dp/B07SBX32T5   
3  https://www.amazon.co.uk/dp/B07SBX32T5   
4  https://www.amazon.co.uk/dp/B08SW434MG   

                                        product_name     reviewer_name  \
0  Klasified Women's Transparent Clear Sneaker Sh...  Jocelyn McSayles   
1  Klasified Women's Transparent Clear Sneaker Sh...      Kenia Rivera   
2  Klasified Women's Transparent Clear Sneaker Sh...       Chris Souza   
3  Klasified Women's Transparent Clear Sneaker Sh...   Amazon Customer   
4  GUESS Women's Bradly Gymnastics Shoe, White, 7 UK         Graziella   

         review_title                                        review_text  \
0             Love em  Love these. Was looking for converses and thes...   
1  The plastic ripped  The shoes are very cute, but after the 2nd day...   
2        Good quality                                   

In [6]:
columns = ['product_name', 'reviewer_name', 'review_title', 'review_text', 'review_rating',
           'verified_purchase', 'review_date', 'helpful_count', 'uniq_id']
data = data[columns]

In [7]:
# Remove rows with missing values
data.dropna(inplace=True)
# Handle missing values
data['review_title'].fillna('No Title', inplace=True)
data['review_text'].fillna('No Review', inplace=True)
data['helpful_count'].fillna('0 people found this helpful', inplace=True)
data.dropna(subset=['review_text', 'review_rating'], inplace=True)
data.drop_duplicates(subset=['uniq_id'], inplace=True)


In [8]:
import re
def extract_date(review_date):
    match = re.search(r'on (\\d{1,2} \\w+ \\d{4})', review_date)
    if match:
        return pd.to_datetime(match.group(1), format='%d %B %Y')
    return None

data['parsed_date'] = data['review_date'].apply(extract_date)

In [9]:
data.dropna(subset=['review_text', 'review_rating'], inplace=True)
data.drop_duplicates(subset=['uniq_id'], inplace=True)

In [10]:
import csv

input_file = '/content/amazon_uk_shoes_products_dataset_2021_12.csv'

def read_csv(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            yield row

def make_hashable(item):
    if isinstance(item, dict):
        return tuple((k, make_hashable(v)) for k, v in item.items())
    elif isinstance(item, list):
        return tuple(make_hashable(i) for i in item)
    else:
        return item

seen = set()
duplicates = []

for item in read_csv(input_file):
    item_tuple = make_hashable(item)
    if item_tuple in seen:
        duplicates.append(item)
    else:
        seen.add(item_tuple)

if duplicates:
    print(f"Found {len(duplicates)} duplicates in the file.")
else:
    print("No duplicates found in the file.")


No duplicates found in the file.


In [11]:
# Remove rows with NaN values in 'review_text'
data = data.dropna(subset=['review_text'])

# Then proceed with TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(data['review_text'])
# Fill NaN values with an empty string or other placeholder text
data['review_text'].fillna('', inplace=True)

# Then proceed with TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(data['review_text'])


In [12]:
# Encode review text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(data['review_text'])

In [13]:
# Create additional features
data['review_length'] = data['review_text'].apply(len)

In [16]:
import pandas as pd

# Assuming 'data' is your cleaned DataFrame
# Replace 'cleaned_data.csv' with your desired file name and path
file_path = 'cleaned_data.csv'

# Save DataFrame to CSV file
data.to_csv(file_path, index=False)

print(f"Cleaned dataset saved to {file_path}")


Cleaned dataset saved to cleaned_data.csv


In [17]:
# Display basic information about the cleaned dataset
data_info_cleaned = data.info()
data_head_cleaned = data.head()

data_info_cleaned, data_head_cleaned

<class 'pandas.core.frame.DataFrame'>
Index: 1890 entries, 0 to 6816
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_name       1890 non-null   object 
 1   reviewer_name      1890 non-null   object 
 2   review_title       1890 non-null   object 
 3   review_text        1890 non-null   object 
 4   review_rating      1890 non-null   float64
 5   verified_purchase  1890 non-null   bool   
 6   review_date        1890 non-null   object 
 7   helpful_count      1890 non-null   object 
 8   uniq_id            1890 non-null   object 
 9   parsed_date        0 non-null      object 
 10  review_length      1890 non-null   int64  
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 164.3+ KB


(None,
                                          product_name     reviewer_name  \
 0   Klasified Women's Transparent Clear Sneaker Sh...  Jocelyn McSayles   
 4   GUESS Women's Bradly Gymnastics Shoe, White, 7 UK         Graziella   
 6   GUESS Women's Bradly Gymnastics Shoe, White, 7 UK           Cliente   
 14  adidas Women's Retrorun Shoes Running, Core Bl...           Lindsay   
 17     Aravon Women's Betty-AR Oxfords, Stone, 5.5 UK   Amazon Customer   
 
                    review_title  \
 0                       Love em   
 4                    PERFETTE!!   
 6                   Molto belle   
 14  Perfect right outta the box   
 17   Comfortable and attractive   
 
                                           review_text  review_rating  \
 0   Love these. Was looking for converses and thes...            5.0   
 4   Ho scelto il modello bianco con rifinitura die...            5.0   
 6   Le scarpe sono molto belle, calzano perfettamente            5.0   
 14  True to size. If bet

Now Convert the user-generated content into neutrosophic sets. Each review will be represented as a triplet T(I,F,N) where:
T is the degree of truth (positive feedback).
I is the degree of indeterminacy (neutral or ambiguous feedback).
F is the degree of falsity (negative feedback).

In [18]:
import pandas as pd
import numpy as np
from textblob import TextBlob

# Function to classify review text and rating into neutrosophic sets with sentiment analysis
def classify_review_rating(review_text, review_rating):
    review_rating = float(review_rating)

    # Ensure review_text is a string
    review_text = str(review_text)

    # Perform sentiment analysis on review text
    sentiment = TextBlob(review_text).sentiment
    polarity = sentiment.polarity

    # Define thresholds for sentiment polarity
    positive_threshold = 0.1
    negative_threshold = -0.1

    # Define membership functions for truth (T), indeterminacy (I), and falsity (F)
    if review_rating >= 4 and polarity > positive_threshold:
        return (1.0, 0.0, 0.0)  # High truth, low indeterminacy, low falsity
    elif review_rating <= 2 and polarity < negative_threshold:
        return (0.0, 0.0, 1.0)  # Low truth, low indeterminacy, high falsity
    elif review_rating == 3 or (polarity >= -0.3 and polarity <= 0.3):
        return (0.0, 1.0, 0.0)  # Low truth, high indeterminacy, low falsity
    else:
        return (0.5, 0.5, 0.0)  # Moderate truth, moderate indeterminacy, low falsity

# Load your dataset (assuming 'data' is your DataFrame)
data = pd.read_csv('/content/amazon_uk_shoes_products_dataset_2021_12.csv')

# Apply the classification function with sentiment analysis to each review
data[['truth', 'indeterminacy', 'falsity']] = data.apply(
    lambda row: pd.Series(classify_review_rating(row['review_text'], row['review_rating'])), axis=1)

# Handle contradictory information
def handle_contradictions(truth, indeterminacy, falsity):
    # Example rule: If both truth and falsity are high, adjust indeterminacy
    if truth > 0.5 and falsity > 0.5:
        indeterminacy = max(indeterminacy, 0.5)
    return truth, indeterminacy, falsity

data[['truth', 'indeterminacy', 'falsity']] = data.apply(
    lambda row: pd.Series(handle_contradictions(row['truth'], row['indeterminacy'], row['falsity'])), axis=1)

# Quantify uncertainty (if needed)
def quantify_uncertainty(review_text, review_rating):
    # Define thresholds and probabilistic approaches here if needed
    uncertainty_score = np.random.uniform(0.1, 0.9)
    return uncertainty_score

data['uncertainty'] = data.apply(lambda row: quantify_uncertainty(row['review_text'], row['review_rating']), axis=1)

# Define thresholds for classification
positive_threshold = 0.6
negative_threshold = 0.4

# Function to classify reviews
def classify_reviews(truth, indeterminacy, falsity):
    if truth >= positive_threshold:
        return 'Positive'
    elif falsity >= negative_threshold:
        return 'Negative'
    else:
        return 'Neutral'

# Apply classification function to each review
data['review_sentiment'] = data.apply(lambda row: classify_reviews(row['truth'], row['indeterminacy'], row['falsity']), axis=1)

# Define the output file path as CSV
output_file_path = '/content/neutrosophic_reviews.csv'

# Write reviews and neutrosophic values to the output CSV file
data.to_csv(output_file_path, index=False)

print("Neutrosophic reviews have been written to:", output_file_path)


Neutrosophic reviews have been written to: /content/neutrosophic_reviews.csv


total count of positive, negative and neutral reviews

In [19]:
# Assuming 'data' is your DataFrame containing neutrosophic reviews

# Define thresholds for classification
positive_threshold = 0.6  # Adjust as needed based on your neutrosophic logic framework
negative_threshold = 0.4  # Adjust as needed based on your neutrosophic logic framework

# Function to classify reviews
def classify_reviews(truth, indeterminacy, falsity):
    if truth >= positive_threshold:
        return 'Positive'
    elif falsity >= negative_threshold:
        return 'Negative'
    else:
        return 'Neutral'

# Apply classification function to each review
data['review_sentiment'] = data.apply(lambda row: classify_reviews(row['truth'], row['indeterminacy'], row['falsity']), axis=1)

# Count positive, negative, and neutral reviews
positive_count = data[data['review_sentiment'] == 'Positive'].shape[0]
negative_count = data[data['review_sentiment'] == 'Negative'].shape[0]
neutral_count = data[data['review_sentiment'] == 'Neutral'].shape[0]

print(f"Positive Reviews: {positive_count}")
print(f"Negative Reviews: {negative_count}")
print(f"Neutral Reviews: {neutral_count}")


Positive Reviews: 2870
Negative Reviews: 177
Neutral Reviews: 3776


count of positive, negative, neutral reviews by product

In [20]:
# Assuming 'data' is your DataFrame containing neutrosophic reviews

# Group by product and count positive, negative, and neutral reviews
product_sentiment_counts = data.groupby('product_name')['review_sentiment'].value_counts().unstack(fill_value=0)

# Display the counts
print("Count of Positive, Negative, and Neutral Reviews by Product:")
print(product_sentiment_counts)


Count of Positive, Negative, and Neutral Reviews by Product:
review_sentiment                                    Negative  Neutral  \
product_name                                                            
': 'ZAPATILLA NEW BALANCE KV220, Navy/White, 9....         2        4   
ACE Constructor High S3 Work Boots - Mens Leath...         0       10   
ALLY UNION MAKE FORCE Mens Womens Walking Shoes...         1        2   
ANNE KLEIN Women's Anne Kleon Onthego Sneaker, ...         0        0   
ANNE KLEIN Women's Terri Sneaker, Grey Heathere...         1        5   
...                                                      ...      ...   
victoria Unisex 106500-women Hi-Top Trainers, W...         0        9   
victoria Unisex Adults JUEGOS Slip ON LONA Mult...         0        1   
victoria Unisex Deportivo Laser Estrellas Train...         0        4   
victoria Unisex Kids Deportivo Lurex Trainers, ...         1        5   
victoria Women's Basket Terciopelo Trainers, Gr...         0   

Aggregation Using Neutrosophic Logic

In [21]:
def aggregate_neutrosophic_sets(reviews):
    # Placeholder function to demonstrate structure
    # Assuming 'reviews' is a DataFrame with columns 'truth', 'indeterminacy', 'falsity'

    total_truth = reviews['truth'].mean()
    total_indeterminacy = reviews['indeterminacy'].mean()
    total_falsity = reviews['falsity'].mean()

    return pd.Series({'Aggregated Truth': total_truth, 'Aggregated Indeterminacy': total_indeterminacy, 'Aggregated Falsity': total_falsity})


In [22]:
def aggregate_neutrosophic_sets(group):
    # Calculate the mean of truth, indeterminacy, and falsity for the group
    aggregated_truth = group['truth'].mean()
    aggregated_indeterminacy = group['indeterminacy'].mean()
    aggregated_falsity = group['falsity'].mean()

    # Return a Series with the aggregated values
    return pd.Series({
        'Aggregated Truth': aggregated_truth,
        'Aggregated Indeterminacy': aggregated_indeterminacy,
        'Aggregated Falsity': aggregated_falsity
    })

# Apply the aggregation for each product or vendor
product_aggregates = data.groupby('product_name').apply(aggregate_neutrosophic_sets).reset_index()

# Save the aggregated results
output_aggregate_path = 'aggregated_neutrosophic_reviews.csv'
product_aggregates.to_csv(output_aggregate_path, index=False)
print("Aggregated neutrosophic reviews have been written to:", output_aggregate_path)


Aggregated neutrosophic reviews have been written to: aggregated_neutrosophic_reviews.csv


In [23]:
# Aggregate neutrosophic sets for the entire dataset
total_truth = data['truth'].mean()
total_indeterminacy = data['indeterminacy'].mean()
total_falsity = data['falsity'].mean()

# Output the aggregated results
print(f"Aggregated Truth: {total_truth}")
print(f"Aggregated Indeterminacy: {total_indeterminacy}")
print(f"Aggregated Falsity: {total_falsity}")


# Save the aggregated results to a CSV file
aggregate_results = pd.DataFrame({
    'Aggregated Truth': [total_truth],
    'Aggregated Indeterminacy': [total_indeterminacy],
    'Aggregated Falsity': [total_falsity]
})



Aggregated Truth: 0.42657188919829986
Aggregated Indeterminacy: 0.5474864429136743
Aggregated Falsity: 0.025941667888025793


pso optimization

In [27]:
pip install pyswarm


Collecting pyswarm
  Downloading pyswarm-0.6.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyswarm
  Building wheel for pyswarm (setup.py) ... [?25l[?25hdone
  Created wheel for pyswarm: filename=pyswarm-0.6-py3-none-any.whl size=4464 sha256=4f798a65e267162378de447bb601acf44a20676f88dadfdd15348cfa919d3427
  Stored in directory: /root/.cache/pip/wheels/71/67/40/62fa158f497f942277cbab8199b05cb61c571ab324e67ad0d6
Successfully built pyswarm
Installing collected packages: pyswarm
Successfully installed pyswarm-0.6


In [28]:
import pandas as pd
import numpy as np
import random
from pyswarm import pso

# Load the aggregated neutrosophic reviews dataset
aggregated_reviews_df = pd.read_csv('/content/aggregated_neutrosophic_reviews.csv')

# Prepare the reviews
reviews = []
for index, row in aggregated_reviews_df.iterrows():
    reviews.append({
        'id': index,
        'truth': row['Aggregated Truth'],
        'indeterminacy': row['Aggregated Indeterminacy'],
        'falsity': row['Aggregated Falsity']
    })

# Define the customized objective function for PSO
def objective_function(solution, reviews):
    total_error = 0
    weights = [0.6, 0.2, 0.2]  # Custom weights for T, I, F, emphasizing T
    penalty = 0.1  # Penalty factor for indeterminacy (I)

    for review in reviews:
        truth_error = abs(review['truth'] - solution[0])
        indeterminacy_error = abs(review['indeterminacy'] - solution[1])
        falsity_error = abs(review['falsity'] - solution[2])

        total_error += (weights[0] * truth_error +
                        weights[1] * indeterminacy_error +
                        weights[2] * falsity_error +
                        penalty * review['indeterminacy'])

    return total_error

# PSO parameters
num_particles = 50  # Number of particles for better search diversity
num_iterations = 150  # More iterations for better optimization
w = 0.7  # Inertia weight
c1 = 1.5  # Cognitive parameter
c2 = 1.5  # Social parameter

# Initialize particles and velocities
particles = [np.random.rand(3) for _ in range(num_particles)]
velocities = [np.random.rand(3) for _ in range(num_particles)]
personal_best_positions = particles.copy()
personal_best_scores = [objective_function(p, reviews) for p in particles]
global_best_position = personal_best_positions[np.argmin(personal_best_scores)]

# PSO algorithm
for _ in range(num_iterations):
    for i, particle in enumerate(particles):
        # Update velocity
        r1, r2 = random.random(), random.random()
        velocities[i] = (w * velocities[i] +
                         c1 * r1 * (personal_best_positions[i] - particle) +
                         c2 * r2 * (global_best_position - particle))

        # Update position
        particles[i] = particles[i] + velocities[i]

        # Ensure particles stay within bounds [0, 1]
        particles[i] = np.clip(particles[i], 0, 1)

        # Evaluate new position
        current_score = objective_function(particles[i], reviews)
        if current_score < personal_best_scores[i]:  # Minimizing error
            personal_best_positions[i] = particles[i]
            personal_best_scores[i] = current_score

        # Update global best position
        if current_score < min(personal_best_scores):
            global_best_position = particles[i]

print("Optimized Aggregation Method:", global_best_position)

# Apply the optimal weights to evaluate the reviews
optimized_reviews = []
for review in reviews:
    weighted_sum = (global_best_position[0] * review['truth'] +
                    global_best_position[1] * review['indeterminacy'] +
                    global_best_position[2] * review['falsity'])

    optimized_review = review.copy()
    optimized_review['weighted_sum'] = weighted_sum
    optimized_reviews.append(optimized_review)

# Convert to DataFrame for further analysis
optimized_reviews_df = pd.DataFrame(optimized_reviews)
print(optimized_reviews_df)


Optimized Aggregation Method: [0.22053602 0.57753986 0.01154213]
        id  truth  indeterminacy   falsity  weighted_sum
0        0   0.40       0.400000  0.200000      0.321539
1        1   0.00       1.000000  0.000000      0.577540
2        2   0.75       0.150000  0.100000      0.253187
3        3   1.00       0.000000  0.000000      0.220536
4        4   0.40       0.500000  0.100000      0.378139
...    ...    ...            ...       ...           ...
1081  1081   0.10       0.900000  0.000000      0.541839
1082  1082   0.50       0.500000  0.000000      0.399038
1083  1083   0.20       0.800000  0.000000      0.506139
1084  1084   0.00       0.833333  0.166667      0.483207
1085  1085   0.00       1.000000  0.000000      0.577540

[1086 rows x 5 columns]
