## Parallel Logistic Regression Model for Spam Detection of Amazon "Sports and Outdoors" Product Reviews

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
import multiprocessing as mp
import time


In [2]:
import warnings

# Suppress all FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Location of the raw review dump (read as JSON Lines: one record per line)
reviews_path = '~/Documents/Sports_and_Outdoors/Sports_and_Outdoors.json'

# Load the reviews into a DataFrame and preview the first rows
data = pd.read_json(reviews_path, lines=True)
data.head()

Unnamed: 0,_id,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,class
0,{'$oid': '5a132768741a2384e847b8ed'},A2PAVURT4NOHE1,31852,Leah,"[0, 0]",Bought it for a ballet tutu but it is being wo...,5,Super cute,1388361600,"12 30, 2013",Sports_and_Outdoors,1
1,{'$oid': '5a132768741a2384e847b8e9'},A1SNLWGLFXD70K,31852,DEVA,"[0, 0]",I origonally didn't get the item I ordered. W...,4,Happy with purchase even though it came a lot ...,1392940800,"02 21, 2014",Sports_and_Outdoors,1
2,{'$oid': '5a132768741a2384e847b8ee'},A3URQ0LXLV46E9,31852,shortyvee,"[0, 0]",My daughter and her friends love the colors an...,4,zebralisous,1400544000,"05 20, 2014",Sports_and_Outdoors,1
3,{'$oid': '5a132768741a2384e847b8ea'},A1KJ4CVG87QW09,31852,Donna Carter-Scott,"[0, 0]","Arrived very timely, cute grandbaby loves it. ...",4,Cute Tutu,1389657600,"01 14, 2014",Sports_and_Outdoors,1
4,{'$oid': '5a132768741a2384e847b8eb'},AA9ITO6ZLZW6,31852,Jazzy77,"[0, 0]",My little girl just loves to wear this tutu be...,5,Versatile,1399507200,"05 8, 2014",Sports_and_Outdoors,1


Here, the input features will be: `reviewText`, `overall`, `summary`, and `helpful`
The target (label) to predict will be `class`, which indicates whether the review is spam (1) or not spam (0).

The first element of the `helpful` feature is extracted, indicating the number of users that found that review helpful.

In [4]:
# Extract the relevant columns.
# The explicit .copy() makes `data` an independent frame, so the column
# assignment below cannot raise pandas' SettingWithCopyWarning.
data = data[['reviewText', 'overall', 'summary', 'helpful', 'class']].copy()


def _helpful_votes(value):
    """Return the number of helpful votes stored in one `helpful` cell.

    Args:
        value: Raw cell content. A list yields its first element; an
            integer is treated as an already-extracted count.

    Returns:
        The helpful-vote count, or 0 for an empty list or any other value.
    """
    if isinstance(value, list):
        return value[0] if value else 0
    # Already an integer count: happens when this cell is re-run on the
    # cleaned frame. Returning it unchanged keeps the cell idempotent
    # instead of silently resetting every count to 0.
    if isinstance(value, (int, np.integer)):
        return value
    return 0


# Clean the 'helpful' column: extract the first element of the list - num of helpful votes
data['helpful'] = data['helpful'].apply(_helpful_votes)

# Check cleaned data
data.head()

Unnamed: 0,reviewText,overall,summary,helpful,class
0,Bought it for a ballet tutu but it is being wo...,5,Super cute,0,1
1,I origonally didn't get the item I ordered. W...,4,Happy with purchase even though it came a lot ...,0,1
2,My daughter and her friends love the colors an...,4,zebralisous,0,1
3,"Arrived very timely, cute grandbaby loves it. ...",4,Cute Tutu,0,1
4,My little girl just loves to wear this tutu be...,5,Versatile,0,1


In [5]:
# Columns used as model inputs; `class` is the label
feature_columns = ['reviewText', 'overall', 'summary', 'helpful']

# Hold out 20% of the shuffled rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data['class'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

We must convert text features (`reviewText` & `summary`) into numerical vectors suitable for ML training.

In [6]:
# Time the preprocessing with perf_counter(): unlike time.time() it is a
# monotonic clock, so the difference of two readings is a reliable duration
# even if the system wall clock is adjusted while the cell runs.
start_time = time.perf_counter()

# TF-IDF for 'reviewText' (vocabulary capped at 5000 terms).
# The vectorizer is fitted on the training split only and then reused to
# transform the test split.
vectorizer_review = TfidfVectorizer(max_features=5000)
X_train_review_tfidf = vectorizer_review.fit_transform(X_train['reviewText'])
X_test_review_tfidf = vectorizer_review.transform(X_test['reviewText'])

# TF-IDF for 'summary' (vocabulary capped at 1000 terms), same fit/transform split
vectorizer_summary = TfidfVectorizer(max_features=1000)
X_train_summary_tfidf = vectorizer_summary.fit_transform(X_train['summary'])
X_test_summary_tfidf = vectorizer_summary.transform(X_test['summary'])

# Standardize the numerical features ('overall' and 'helpful');
# the scaler is fitted on the training split and applied to the test split.
scaler = StandardScaler()
X_train_overall_helpful = scaler.fit_transform(X_train[['overall', 'helpful']])
X_test_overall_helpful = scaler.transform(X_test[['overall', 'helpful']])

# Stop timer
end_time = time.perf_counter()
print(f"Sequential preprocessing completed in: {end_time - start_time:.2f} seconds")



Sequential preprocessing completed in: 106.79 seconds


In [7]:

# Check the shapes of each feature set to ensure consistency
# (all three blocks must have the same number of rows to be stacked side by side)
print(f"Shape of X_train_review_tfidf: {X_train_review_tfidf.shape}")
print(f"Shape of X_train_summary_tfidf: {X_train_summary_tfidf.shape}")
print(f"Shape of X_train_overall_helpful: {X_train_overall_helpful.shape}")

# Combine all features into one training and testing set.
# format="csr" requests CSR output for both matrices, so the train and test
# sets share the same sparse format and both support row slicing.
X_train_combined = hstack(
    [X_train_review_tfidf, X_train_summary_tfidf, X_train_overall_helpful],
    format="csr",
)
X_test_combined = hstack(
    [X_test_review_tfidf, X_test_summary_tfidf, X_test_overall_helpful],
    format="csr",
)

# Check the final shapes
print(f"Shape of X_train_combined: {X_train_combined.shape}")
print(f"Shape of y_train: {y_train.shape}")

Shape of X_train_review_tfidf: (2410604, 5000)
Shape of X_train_summary_tfidf: (2410604, 1000)
Shape of X_train_overall_helpful: (2410604, 2)
Shape of X_train_combined: (2410604, 6002)
Shape of y_train: (2410604,)


In [8]:
# Convert the combined matrix to CSR format for slicing:
# the chunking step below cuts X_train_combined into contiguous row ranges.
X_train_combined = X_train_combined.tocsr()

Here, we will define `num_chunks` and split the training data into that many chunks; `num_chunks` can be set to the number of CPU cores or chosen manually.

In [9]:
# Define number of data chunks (one per CPU core reported by the OS)
num_chunks = mp.cpu_count()
print(f"Number of chunks utilized: {num_chunks}")

# Calculate chunk size (rows per chunk, rounded down)
n_train_rows = X_train_combined.shape[0]
chunk_size = n_train_rows // num_chunks

# Row boundaries shared by the feature and label chunks, so the two lists
# cannot drift out of alignment. The first num_chunks-1 chunks hold exactly
# chunk_size rows; the last chunk absorbs the remainder when the split is
# not even.
chunk_bounds = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_chunks - 1)]
chunk_bounds.append(((num_chunks - 1) * chunk_size, n_train_rows))

X_train_chunks = [X_train_combined[start:stop] for start, stop in chunk_bounds]
# y_train keeps the shuffled original row labels as its index, so slice it
# explicitly by position with .iloc rather than relying on how plain []
# interprets integer slices.
y_train_chunks = [y_train.iloc[start:stop] for start, stop in chunk_bounds]

Number of chunks utilized: 16


The function defined below trains a Logistic Regression model on a single chunk of the data; it is applied to every chunk in parallel in the next step.

In [10]:
# Define a function to train Logistic Regression on a chunk of data
def train_on_chunk(X_chunk, y_chunk, *, max_iter=1000, solver='lbfgs'):
    """Fit an independent LogisticRegression model on one chunk of data.

    Args:
        X_chunk: Feature rows of the chunk; must provide `.copy()` and be
            accepted by `LogisticRegression.fit`.
        y_chunk: Labels for the rows of `X_chunk`; must provide `.copy()`.
        max_iter: Iteration limit passed to `LogisticRegression`
            (keyword-only; defaults to the previously hard-coded 1000).
        solver: Solver name passed to `LogisticRegression`
            (keyword-only; defaults to the previously hard-coded 'lbfgs').

    Returns:
        The fitted `LogisticRegression` instance.
    """
    # Ensure data is writable: fit on private copies of the inputs
    X_chunk = X_chunk.copy()
    y_chunk = y_chunk.copy()

    model = LogisticRegression(max_iter=max_iter, solver=solver)
    model.fit(X_chunk, y_chunk)
    return model

Below, we train the models in parallel and measure the training time.

In [11]:
# Use perf_counter() for the timing: it is monotonic, so the measured
# duration cannot be skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()


# Create a multiprocessing pool with one worker process per chunk
with mp.Pool(processes=num_chunks) as pool:
    # Train models in parallel on each chunk of data.
    # starmap unpacks every (X_chunk, y_chunk) pair into train_on_chunk's
    # arguments and returns the fitted models in chunk order.
    models = pool.starmap(train_on_chunk, zip(X_train_chunks, y_train_chunks))

end_time = time.perf_counter()

In [12]:
# Calculate the parallel training time:
# elapsed seconds between the two timestamps taken around the pool run
parallel_training_time = end_time - start_time
print(f"Parallel Training Time: {parallel_training_time:.2f} seconds")

Parallel Training Time: 8.19 seconds


: 