# Project Reviews Aggregator

## Import libraries

In [2]:
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from nltk.stem.wordnet import WordNetLemmatizer



## Data cleaning

### Load Kaggle data

In [3]:
# Loading data from kaggle

import kagglehub

# Download latest version
path_kaggle = kagglehub.dataset_download("datafiniti/consumer-reviews-of-amazon-products")

print("Path to dataset files:", path_kaggle)

Path to dataset files: C:\Users\aurel\.cache\kagglehub\datasets\datafiniti\consumer-reviews-of-amazon-products\versions\5


In [4]:
import pandas as pd
import os


# List all CSV files in the directory
csv_files = [file for file in os.listdir(path_kaggle) if file.endswith('.csv')]

# Iterate through each CSV file and print its headers
for file in csv_files:
    file_path = os.path.join(path_kaggle, file)
    df = pd.read_csv(file_path, low_memory=False)
    print(f"Headers for {file}:")
    for header in df.columns:
        print(header)
    print("\n" + "-" * 50 + "\n")  # Separator for better readability


Headers for 1429_1.csv:
id
name
asins
brand
categories
keys
manufacturer
reviews.date
reviews.dateAdded
reviews.dateSeen
reviews.didPurchase
reviews.doRecommend
reviews.id
reviews.numHelpful
reviews.rating
reviews.sourceURLs
reviews.text
reviews.title
reviews.userCity
reviews.userProvince
reviews.username

--------------------------------------------------

Headers for Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv:
id
dateAdded
dateUpdated
name
asins
brand
categories
primaryCategories
imageURLs
keys
manufacturer
manufacturerNumber
reviews.date
reviews.dateAdded
reviews.dateSeen
reviews.doRecommend
reviews.id
reviews.numHelpful
reviews.rating
reviews.sourceURLs
reviews.text
reviews.title
reviews.username
sourceURLs

--------------------------------------------------

Headers for Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv:
id
dateAdded
dateUpdated
name
asins
brand
categories
primaryCategories
imageURLs
keys
manufacturer
manufacturerNumber
reviews.date
rev

In [5]:
######## Create Kaggle dataframe with the 3 review csv ########

"""
# List all CSV files in the directory
csv_files = [file for file in os.listdir(path_kaggle) if file.endswith('.csv')]

# Combine all CSV files into a single DataFrame
dataframes = []
for file in csv_files:
    file_path = os.path.join(path_kaggle, file)
    df = pd.read_csv(file_path, low_memory=False)  # Prevent dtype warnings
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame, aligning columns
kag_comb = pd.concat(dataframes, ignore_index=True, sort=True)

# Define the project directory
project_dir = os.getcwd()  # Gets the current working directory

# Path to the "data" folder in the project directory
data_dir = os.path.join(project_dir, "data")

# Ensure the "data" folder exists
os.makedirs(data_dir, exist_ok=True)

# Save the combined DataFrame to a new CSV file in the "data" folder
kag_comb_path = os.path.join(data_dir, "kag_comb.csv")
kag_comb.to_csv(kag_comb_path, index=False)

print(f"Combined CSV saved to {kag_comb_path}")
"""


######## Load generated CSV ########

kag_comb = pd.read_csv("data/kag_comb.csv")


  kag_comb = pd.read_csv("data/kag_comb.csv")


### Clean Kaggle data

In [6]:
#Check df heads

# List of columns to drop
columns_to_drop = [
    'dateAdded',
    'dateUpdated',
    'reviews.didPurchase',
    'reviews.userCity',
    'reviews.userProvince',
    'asins',
    'imageURLs',
    'manufacturerNumber',
    'primaryCategories',
    'sourceURLs',
    'reviews.sourceURLs',
    'keys',
    'reviews.date',
    'reviews.dateAdded',
    'reviews.dateSeen',
    'reviews.numHelpful'
]

# Drop the columns from the dataframe
kag_comb = kag_comb.drop(columns=columns_to_drop)


# Get a list of all column headers
headers = kag_comb.columns.tolist()

# Display the headers
for header in headers:
    print(header)


brand
categories
id
manufacturer
name
reviews.doRecommend
reviews.id
reviews.rating
reviews.text
reviews.title
reviews.username


In [7]:
kag_comb.head()

Unnamed: 0,brand,categories,id,manufacturer,name,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.username
0,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,This product so far has not disappointed. My c...,Kindle,Adapter
1,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,great for beginner or experienced person. Boug...,very fast,truman
2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,DaveZ
3,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,4.0,I've had my Fire HD 8 two weeks now and I love...,Good!!!,Shacks
4,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,explore42


In [8]:
# Check for duplicate rows based on 'reviews.text', 'reviews.username', 'reviews.id' columns
duplicates = kag_comb.duplicated(subset=['reviews.text', 'reviews.username', 'reviews.id'])

# Count the number of duplicate rows
num_duplicates = duplicates.sum()
print(f"Number of duplicate rows based on 'reviews.text', 'reviews.username', 'reviews.id': {num_duplicates}")

# If you want to display the duplicate rows
if num_duplicates > 0:
    duplicate_rows = kag_comb[duplicates]
    print("Duplicate rows based on 'reviews.text', 'reviews.username', 'reviews.id':")
    print(duplicate_rows)


Number of duplicate rows based on 'reviews.text', 'reviews.username', 'reviews.id': 19898
Duplicate rows based on 'reviews.text', 'reviews.username', 'reviews.id':
        brand                                         categories  \
34660  Amazon  Computers,Electronics Features,Tablets,Electro...   
34661  Amazon  Computers,Electronics Features,Tablets,Electro...   
34665  Amazon  Computers,Electronics Features,Tablets,Electro...   
34667  Amazon  Computers,Electronics Features,Tablets,Electro...   
34670  Amazon  Computers,Electronics Features,Tablets,Electro...   
...       ...                                                ...   
67987  Amazon  Fire Tablets,Tablets,All Tablets,Amazon Tablet...   
67988  Amazon  Fire Tablets,Tablets,All Tablets,Amazon Tablet...   
67989  Amazon  Fire Tablets,Tablets,All Tablets,Amazon Tablet...   
67990  Amazon  Fire Tablets,Tablets,All Tablets,Amazon Tablet...   
67991  Amazon  Fire Tablets,Tablets,All Tablets,Amazon Tablet...   

                   

In [9]:
# Create a new dataframe with duplicates removed based on 'reviews.text', 'reviews.username' and 'id'
kag_comb_clean = kag_comb.drop_duplicates(subset=['reviews.text', 'reviews.username', 'reviews.id'])

# Remove rows with NaN in 'reviews.rating'
kag_comb_clean = kag_comb_clean.dropna(subset=['reviews.rating'])

# Remove rows with NaN in 'reviews.title'
kag_comb_clean = kag_comb_clean.dropna(subset=['reviews.title'])

# Verify the new dataframe
print(f"New dataframe shape: {kag_comb_clean.shape}")


New dataframe shape: (48044, 11)


In [10]:
# Check for NaN values in the 'reviews.ratingtext' column
nan_reviews_ratingtext = kag_comb_clean['reviews.rating'].isna()

# Count the number of NaN values
num_nan_reviews_ratingtext = nan_reviews_ratingtext.sum()
print(f"Number of NaN values in 'reviews.rating': {num_nan_reviews_ratingtext}")

# Display rows with NaN values in 'reviews.ratingtext'
if num_nan_reviews_ratingtext > 0:
    rows_with_nan = kag_comb_clean[nan_reviews_ratingtext]
    print("Rows with NaN in 'reviews.rating':")
    print(rows_with_nan)

Number of NaN values in 'reviews.rating': 0


### Add missing data for classification training

In [11]:
# Fill NaN values in 'reviews.doRecommend' based on 'reviews.rating'
kag_comb_clean['reviews.doRecommend'] = kag_comb_clean['reviews.doRecommend'].fillna(
    kag_comb_clean['reviews.rating'] > 3
)

# Verify the changes
print(kag_comb_clean['reviews.doRecommend'].isna().sum())

0


  kag_comb_clean['reviews.doRecommend'] = kag_comb_clean['reviews.doRecommend'].fillna(


In [12]:
# Check for NaN values in the 'reviews.doRecommend' column
nan_reviews_text = kag_comb_clean['reviews.doRecommend'].isna()

# Count the number of NaN values
num_nan_reviews_text = nan_reviews_text.sum()
print(f"Number of NaN values in 'reviews.doRecommend': {num_nan_reviews_text}")

# Display rows with NaN values in 'reviews.text'
if num_nan_reviews_text > 0:
    rows_with_nan = kag_comb_clean[nan_reviews_text]
    print("Rows with NaN in 'reviews.doRecommend':")
    print(rows_with_nan)

Number of NaN values in 'reviews.doRecommend': 0


## Preprocessing for sentiment training

### Adding sentiment traning targets

In [13]:
def classify_sentiment(rating):
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"

kag_comb_clean['sentiment'] = kag_comb_clean['reviews.rating'].apply(classify_sentiment)

kag_comb_clean.head()

Unnamed: 0,brand,categories,id,manufacturer,name,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.username,sentiment
0,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,This product so far has not disappointed. My c...,Kindle,Adapter,positive
1,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,great for beginner or experienced person. Boug...,very fast,truman,positive
2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,DaveZ,positive
3,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,4.0,I've had my Fire HD 8 two weeks now and I love...,Good!!!,Shacks,positive
4,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",AVqkIhwDv8e3D1O-lebb,Amazon,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",True,,5.0,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,explore42,positive


### Split into test and train

In [14]:
""" DROPPED

# Combining 'reviews.text' and 'reviews.title' into a single column
kag_comb_clean['combined_reviews'] = kag_comb_clean['reviews.text'].fillna('') + ' ' + kag_comb_clean['reviews.title'].fillna('')

"""

" DROPPED\n\n# Combining 'reviews.text' and 'reviews.title' into a single column\nkag_comb_clean['combined_reviews'] = kag_comb_clean['reviews.text'].fillna('') + ' ' + kag_comb_clean['reviews.title'].fillna('')\n\n"

In [15]:
# FEATURES AND TARGET DEFINITION

# Features
X = kag_comb_clean[['reviews.text', 'reviews.title']]
## X = kag_comb_clean['combined_reviews']


# Targets
y_sentiment = kag_comb_clean['sentiment']  # For sentiment analysis
y_recommend = kag_comb_clean['reviews.doRecommend']  # For recommendation prediction

In [16]:
# Split the dataset into training and testing sets
X_train, X_test, y_train_sentiment, y_test_sentiment = train_test_split(
    X, y_sentiment, test_size=0.2, random_state=42, stratify=y_sentiment
)

X_train, X_test, y_train_recommend, y_test_recommend = train_test_split(
    X, y_recommend, test_size=0.2, random_state=42
)

In [17]:
# Verify the split
print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)
print("Training sentiment target shape:", y_train_sentiment.shape)
print("Test sentiment target shape:", y_test_sentiment.shape)
print("Training recommend target shape:", y_train_recommend.shape)
print("Test recommend target shape:", y_test_recommend.shape)

Training data shape: (38435, 2)
Test data shape: (9609, 2)
Training sentiment target shape: (38435,)
Test sentiment target shape: (9609,)
Training recommend target shape: (38435,)
Test recommend target shape: (9609,)


In [18]:
print(X_train)

                                            reviews.text  \
44470                                            Perfect   
16574  It's a great tablet for a destructive 10 year ...   
17157  Right after we bought it the charging port bro...   
24097  This speaker is fun and intuitive. Alexia keep...   
43584  Does the job but I would like to be able to re...   
...                                                  ...   
11308  Price was reasonable and it is easy to use. It...   
48743                 Excellent batteries for the price!   
41771                                              GREAT   
860    I traded this in after receiving a 7" at Chris...   
15824  My daugther used to have a leap frog tablet, b...   

                      reviews.title  
44470                    Good value  
16574                   Good tablet  
17157  Good warranty and protection  
24097                           Fun  
43584                    Four Stars  
...                             ...  
11308        

### Functions for text cleaning

In [19]:
# Clean text for titles and comments function

import re

def clean_artefacts(sentence):
    """
    Cleans a single sentence by removing artefacts and unwanted characters.
    """
    if not isinstance(sentence, str):
        sentence = str(sentence)  # Ensure it's a string
    
    # Remove JS/CSS
    sentence = re.sub(
        r'<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>|<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>',
        '',
        sentence,
    )

    # Remove HTML comments
    sentence = re.sub(r'<!--.*?-->', '', sentence)

    # Remove HTML tags
    sentence = re.sub(r'<[^>]+>', '', sentence)

    # Remove all special characters except spaces
    sentence = re.sub(r'[^a-zA-Z\s]', ' ', sentence)

    # Remove numbers
    sentence = re.sub(r'\d+', ' ', sentence)

    # Remove all single characters
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # Substitute multiple spaces with a single space
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    # Convert to lowercase
    sentence = sentence.lower()

    return sentence



In [20]:
# Replace NaN values with empty strings in X_train and X_test
X_train['reviews.text'] = X_train['reviews.text'].fillna("").astype(str)
X_train['reviews.title'] = X_train['reviews.title'].fillna("").astype(str)

X_test['reviews.text'] = X_test['reviews.text'].fillna("").astype(str)
X_test['reviews.title'] = X_test['reviews.title'].fillna("").astype(str)

# Apply preprocessing to the text and title columns
X_train['reviews.text'] = X_train['reviews.text'].apply(clean_artefacts)
X_train['reviews.title'] = X_train['reviews.title'].apply(clean_artefacts)

X_test['reviews.text'] = X_test['reviews.text'].apply(clean_artefacts)
X_test['reviews.title'] = X_test['reviews.title'].apply(clean_artefacts)

# Verify the results
print("Preprocessed X_train sample:")
print(X_train.head())

print("Preprocessed X_test sample:")
print(X_test.head())


Preprocessed X_train sample:
                                            reviews.text  \
44470                                            perfect   
16574  it great tablet for destructive year old it st...   
17157  right after we bought it the charging port bro...   
24097  this speaker is fun and intuitive alexia keep ...   
43584  does the job but would like to be able to rech...   

                      reviews.title  
44470                    good value  
16574                   good tablet  
17157  good warranty and protection  
24097                           fun  
43584                    four stars  
Preprocessed X_test sample:
                                            reviews.text  \
50163  lasts long good price and keeps my kids toys r...   
15220  bought the tablet and it is very nice really l...   
11996  battery life great cameras touring aps music a...   
47742  needed some and the price is good liked the pa...   
44437  they work well so far cheaper than other batte.

In [21]:
# Verify the preprocessing
X_train.head()

print("Preprocessed X_test sample:")
X_test.head()

Preprocessed X_test sample:


Unnamed: 0,reviews.text,reviews.title
50163,lasts long good price and keeps my kids toys r...,good price and keeps my kids toys running
15220,bought the tablet and it is very nice really l...,alright to do amazon or play games watch videos
11996,battery life great cameras touring aps music a...,supports gps maps apps great add for mc use
47742,needed some and the price is good liked the pa...,work as well as other brands so
44437,they work well so far cheaper than other batte...,good so far


In [22]:
# Word splitter

def split_sentences_to_words(sentences_list):

    if not isinstance(sentences_list, str):
        sentences_list = str(sentences_list)  # Ensure it's a string

    # Split the sentence into words
    return sentences_list.split()

In [23]:
# Apply preprocessing to the text and title columns
X_train['reviews.text'] = X_train['reviews.text'].apply(split_sentences_to_words)
X_train['reviews.title'] = X_train['reviews.title'].apply(split_sentences_to_words)

X_test['reviews.text'] = X_test['reviews.text'].apply(split_sentences_to_words)
X_test['reviews.title'] = X_test['reviews.title'].apply(split_sentences_to_words)

# Verify the results
print("Preprocessed X_train sample:")
print(X_train.head())

print("Preprocessed X_test sample:")
X_test.head()


Preprocessed X_train sample:
                                            reviews.text  \
44470                                          [perfect]   
16574  [it, great, tablet, for, destructive, year, ol...   
17157  [right, after, we, bought, it, the, charging, ...   
24097  [this, speaker, is, fun, and, intuitive, alexi...   
43584  [does, the, job, but, would, like, to, be, abl...   

                           reviews.title  
44470                      [good, value]  
16574                     [good, tablet]  
17157  [good, warranty, and, protection]  
24097                              [fun]  
43584                      [four, stars]  
Preprocessed X_test sample:


Unnamed: 0,reviews.text,reviews.title
50163,"[lasts, long, good, price, and, keeps, my, kid...","[good, price, and, keeps, my, kids, toys, runn..."
15220,"[bought, the, tablet, and, it, is, very, nice,...","[alright, to, do, amazon, or, play, games, wat..."
11996,"[battery, life, great, cameras, touring, aps, ...","[supports, gps, maps, apps, great, add, for, m..."
47742,"[needed, some, and, the, price, is, good, like...","[work, as, well, as, other, brands, so]"
44437,"[they, work, well, so, far, cheaper, than, oth...","[good, so, far]"


In [24]:
# Lemmatize

def lemma_builder(data):
    lemmatizer = WordNetLemmatizer()
    x_lemma = []  # Initialize list to store lemmatized sentences
    for sentence in data:
        lemmatized_list = []  # To store lemmatized words for the current sentence
        for word in sentence.split():  # Split the sentence into words
            lemmatized_word = lemmatizer.lemmatize(word)  # Lemmatize the word
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences


In [25]:
# Apply preprocessing to the text and title columns
X_train['reviews.text'] = X_train['reviews.text'].apply(lemma_builder)
X_train['reviews.title'] = X_train['reviews.title'].apply(lemma_builder)

X_test['reviews.text'] = X_test['reviews.text'].apply(lemma_builder)
X_test['reviews.title'] = X_test['reviews.title'].apply(lemma_builder)

# Verify the results
print("Preprocessed X_train sample:")
print(X_train.head())

print("Preprocessed X_test sample:")
X_test.head()

Preprocessed X_train sample:
                                            reviews.text  \
44470                                          [perfect]   
16574  [it, great, tablet, for, destructive, year, ol...   
17157  [right, after, we, bought, it, the, charging, ...   
24097  [this, speaker, is, fun, and, intuitive, alexi...   
43584  [doe, the, job, but, would, like, to, be, able...   

                           reviews.title  
44470                      [good, value]  
16574                     [good, tablet]  
17157  [good, warranty, and, protection]  
24097                              [fun]  
43584                       [four, star]  
Preprocessed X_test sample:


Unnamed: 0,reviews.text,reviews.title
50163,"[last, long, good, price, and, keep, my, kid, ...","[good, price, and, keep, my, kid, toy, running]"
15220,"[bought, the, tablet, and, it, is, very, nice,...","[alright, to, do, amazon, or, play, game, watc..."
11996,"[battery, life, great, camera, touring, aps, m...","[support, gps, map, apps, great, add, for, mc,..."
47742,"[needed, some, and, the, price, is, good, like...","[work, a, well, a, other, brand, so]"
44437,"[they, work, well, so, far, cheaper, than, oth...","[good, so, far]"


In [26]:
def tfidf_transformer(data, ngram_range=(1, 3), min_df=2, max_df=0.5, max_features=None):
    print(f"Data type: {type(data)}, Length: {len(data)}")  # Debugging
    print(f"Sample data: {data[:3]}")  # Show a few examples for debugging
    
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features
    )
    
    # Fit and transform the data
    tfidf_matrix = vectorizer.fit_transform(data)
    
    return tfidf_matrix, vectorizer


In [27]:
# Convert lists of words back into single strings
X_train['reviews.text'] = X_train['reviews.text'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
X_test['reviews.text'] = X_test['reviews.text'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)

X_train['reviews.title'] = X_train['reviews.title'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
X_test['reviews.title'] = X_test['reviews.title'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)


# Apply TF-IDF transformation to the 'reviews.text' column
X_train_text_tfidf, text_tfidf_vectorizer = tfidf_transformer(X_train['reviews.text'])

# Use the same vectorizer to transform the test data
X_test_text_tfidf = text_tfidf_vectorizer.transform(X_test['reviews.text'])

# Apply TF-IDF transformation to the 'reviews.title' column
X_train_title_tfidf, title_tfidf_vectorizer = tfidf_transformer(X_train['reviews.title'])

# Use the same vectorizer to transform the test data
X_test_title_tfidf = title_tfidf_vectorizer.transform(X_test['reviews.title'])

# Check the shapes of the TF-IDF matrices
print(f"X_train_text TF-IDF shape: {X_train_text_tfidf.shape}")
print(f"X_test_text TF-IDF shape: {X_test_text_tfidf.shape}")
print(f"X_train_title TF-IDF shape: {X_train_title_tfidf.shape}")
print(f"X_test_title TF-IDF shape: {X_test_title_tfidf.shape}")


Data type: <class 'pandas.core.series.Series'>, Length: 38435
Sample data: 44470                                              perfect
16574    it great tablet for destructive year old it st...
17157    right after we bought it the charging port bro...
Name: reviews.text, dtype: object
Data type: <class 'pandas.core.series.Series'>, Length: 38435
Sample data: 44470                      good value
16574                     good tablet
17157    good warranty and protection
Name: reviews.title, dtype: object
X_train_text TF-IDF shape: (38435, 168074)
X_test_text TF-IDF shape: (9609, 168074)
X_train_title TF-IDF shape: (38435, 13722)
X_test_title TF-IDF shape: (9609, 13722)


In [28]:
print(X_train_text_tfidf)

  (0, 105051)	1.0
  (1, 70257)	0.08813900609609394
  (1, 55423)	0.055664096204052216
  (1, 127502)	0.06855693200949807
  (1, 46284)	0.09033649767583456
  (1, 34260)	0.24837336756098574
  (1, 165894)	0.09887320537498577
  (1, 97764)	0.10004072823786457
  (1, 125378)	0.11404328542266125
  (1, 53582)	0.13151201482916178
  (1, 126364)	0.16986736529097995
  (1, 86205)	0.12866909093759912
  (1, 95032)	0.10583998198232838
  (1, 71606)	0.12582021326461182
  (1, 56732)	0.12417880943446727
  (1, 127834)	0.10212153260765859
  (1, 46842)	0.2611281179752643
  (1, 166011)	0.11175781694725338
  (1, 97935)	0.20050993169491496
  (1, 73384)	0.1723550121434387
  (1, 125453)	0.19773275069713786
  (1, 53617)	0.19978614498426162
  (1, 126371)	0.2539450150491946
  (1, 47684)	0.18942545628195862
  (1, 86273)	0.20535779247247044
  :	:
  (38434, 125868)	0.09370540716664666
  (38434, 121856)	0.09112776217909671
  (38434, 121857)	0.09112776217909671
  (38434, 66768)	0.09370540716664666
  (38434, 111063)	0.0891283

In [29]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
clf_svm = SVC(kernel='rbf', gamma='scale', C=100.0, random_state=42)  # You can adjust the kernel and C parameter


# Train the SVM model
clf_svm.fit(X_train_text_tfidf, y_train_sentiment)

In [32]:
# Make predictions
y_pred_svm = clf_svm.predict(X_test_text_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_sentiment, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test_sentiment, y_pred_svm))

Accuracy: 0.9169528566968467

Classification Report:
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00       369
     neutral       0.00      0.00      0.00       415
    positive       0.92      1.00      0.96      8825

    accuracy                           0.92      9609
   macro avg       0.31      0.33      0.32      9609
weighted avg       0.84      0.92      0.88      9609



In [33]:
import pickle

# Save the model to a file
with open("svm_sentiment_model.pkl", "wb") as file:
    pickle.dump(clf_svm, file)

print("Model saved successfully.")

Model saved successfully.


: 

## Pre-processing for later entries

In [30]:
# Preprocessing function for later entries

def preprocess_input(text, title):
    """
    Preprocesses input text and title during inference.
    """
    processed_text = clean_artefacts(text)
    processed_title = clean_artefacts(title)
    return processed_text, processed_title

In [31]:
# Run function on 'reviews.text' and 'reviews.title'

if isinstance(input, list):
    input = pd.DataFrame(input, columns=['reviews.text', 'reviews.title'])

# *****FILE DROPPED*****

Following reading this article and choosing to use PySentimiento which requires almost raw text.

Plus strategy to train a svm model for sentiment analysis will probably be less efficient than using a trained model.