**IMPORTING REQUIRED LIBRARIES**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import re
import warnings
warnings.simplefilter("ignore")

**LOADING THE AMAZON PRODUCTS DATASET**

In [3]:
dt = pd.read_csv(r"C:\Users\HP\Downloads\archive (14)\amazon.csv")
print(dt.info())
print(dt.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

**DATA PREPROCESSING**

1) Handling Missing Values

In [4]:
if dt.isna().sum().sum() == 0:
  print("No Missing Values in the Dataset")
else:
  print("Shape of Dataset before Removing Missing Values",dt.shape)
  print("Null Count is: ",dt.isna().sum().sum())
  dt = dt.dropna(axis=0, how="any")
  print("Shape of Dataset after Removing Missing Values",dt.shape)
  print("Null Count is: ",dt.isna().sum().sum())

Shape of Dataset before Removing Missing Values (1465, 16)
Null Count is:  2
Shape of Dataset after Removing Missing Values (1463, 16)
Null Count is:  0


2) Removing Unnecessary Features

In [5]:
columns_to_drop = ["product_id","user_id","user_name","review_id","img_link","product_link"]
print("Shape of the Dataset before Removing unnecessary features: ",dt.shape)
dt = dt.drop(columns=columns_to_drop, axis=1)
print("Shape of the Dataset after Removing unnecessary features: ",dt.shape)
print("Null Count is: ",dt.isna().sum().sum())

Shape of the Dataset before Removing unnecessary features:  (1463, 16)
Shape of the Dataset after Removing unnecessary features:  (1463, 10)
Null Count is:  0


In [6]:
print(dt.info())
print(dt.dtypes)
print("Null Count is: ",dt.isna().sum().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 1463 entries, 0 to 1464
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_name         1463 non-null   object
 1   category             1463 non-null   object
 2   discounted_price     1463 non-null   object
 3   actual_price         1463 non-null   object
 4   discount_percentage  1463 non-null   object
 5   rating               1463 non-null   object
 6   rating_count         1463 non-null   object
 7   about_product        1463 non-null   object
 8   review_title         1463 non-null   object
 9   review_content       1463 non-null   object
dtypes: object(10)
memory usage: 125.7+ KB
None
product_name           object
category               object
discounted_price       object
actual_price           object
discount_percentage    object
rating                 object
rating_count           object
about_product          object
review_title          

3) Converting Necessary Features into Numerical

In [7]:
print("Shape of the Dataset",dt.shape)
print("Null Count is: ",dt.isna().sum().sum())


# Remove unwanted characters and convert to numeric
dt["discounted_price"] = pd.to_numeric(dt["discounted_price"].str.replace('[^0-9.]', '', regex=True), errors='coerce')
dt["actual_price"] = pd.to_numeric(dt["actual_price"].str.replace('[^0-9.]', '', regex=True), errors='coerce')
dt["discount_percentage"] = pd.to_numeric(dt["discount_percentage"].str.replace('%', '', regex=False), errors='coerce')
dt["rating"] = pd.to_numeric(dt["rating"], errors='coerce')
dt["rating_count"] = pd.to_numeric(dt["rating_count"].str.replace(',', ''), errors='coerce')

print("Data Types are: ",dt.dtypes)
print("Null Values Count: ",dt.isna().sum().sum())
print("Shape of the Dataset",dt.shape)
dt = dt.dropna(axis=0)
print("Null Values Count: ",dt.isna().sum().sum())

Shape of the Dataset (1463, 10)
Null Count is:  0
Data Types are:  product_name            object
category                object
discounted_price       float64
actual_price           float64
discount_percentage      int64
rating                 float64
rating_count             int64
about_product           object
review_title            object
review_content          object
dtype: object
Null Values Count:  1
Shape of the Dataset (1463, 10)
Null Values Count:  0


**PROCESSING THE DATA OF ABOUT_PRODUCT, REVIEW_TITLE AND REVIEW CONTENT**

1) Import Required Libraries

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

2) Downloading the nltk Resources

In [9]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

3) Considering stop words

In [10]:
# Tokenization and removing stop words
stop_words = set(stopwords.words('english'))

4) Defining the function to process the text data like converting into lower cases

In [11]:
def preprocess_text(text):
    words = word_tokenize(text.lower())  # Tokenization and convert to lowercase
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)

5) Applying the function to three features about_product, review_title, review_content

In [12]:
dt['product_name'] = dt['product_name'].apply(preprocess_text)
dt['category'] = dt['category'].apply(preprocess_text)
dt['about_product'] = dt['about_product'].apply(preprocess_text)
dt['review_title'] = dt['review_title'].apply(preprocess_text)
dt['review_content'] = dt['review_content'].apply(preprocess_text)

6) Creating the Object for PorterStemmer and Defining the function to find the root words, and then applying the function to the same three attributes

In [13]:
# Using Porter Stemmer for stemming
stemmer = PorterStemmer()

def stem_text(text):
    return ' '.join([stemmer.stem(word) for word in word_tokenize(text)])

dt['product_name'] = dt['product_name'].apply(stem_text)
dt['category'] = dt['category'].apply(stem_text)
dt['about_product'] = dt['about_product'].apply(stem_text)
dt['review_title'] = dt['review_title'].apply(stem_text)
dt['review_content'] = dt['review_content'].apply(stem_text)

7) Vectorization , Convertion the text into Numerical Values and concatination

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed

# Apply vectorization to each text column
about_product_tfidf = tfidf_vectorizer.fit_transform(dt['about_product']).toarray()
review_title_tfidf = tfidf_vectorizer.fit_transform(dt['review_title']).toarray()
review_content_tfidf = tfidf_vectorizer.fit_transform(dt['review_content']).toarray()

# Create new DataFrames for the vectorized text columns
about_product_df = pd.DataFrame(about_product_tfidf, columns=[f'about_product_{i}' for i in range(about_product_tfidf.shape[1])])
review_title_df = pd.DataFrame(review_title_tfidf, columns=[f'review_title_{i}' for i in range(review_title_tfidf.shape[1])])
review_content_df = pd.DataFrame(review_content_tfidf, columns=[f'review_content_{i}' for i in range(review_content_tfidf.shape[1])])

# Overwrite the original columns with vectorized values
dt['about_product'] = about_product_df.values.tolist()
dt['review_title'] = review_title_df.values.tolist()
dt['review_content'] = review_content_df.values.tolist()


In [15]:
print(dt.isna().sum().sum())
print(dt.dtypes)

0
product_name            object
category                object
discounted_price       float64
actual_price           float64
discount_percentage      int64
rating                 float64
rating_count             int64
about_product           object
review_title            object
review_content          object
dtype: object


**HANDLING THE LIST DATATYPE TO SUM VALUE**

In [16]:
df = dt["about_product"]

for i in df.index:
    if isinstance(df[i], list):
        df[i] = sum(df[i])
    else:
        df[i] = dt["about_product"].mean()
dt["about_product"] = pd.to_numeric(df, errors="coerce")
print(dt["about_product"])


df = dt["review_title"]

for i in df.index:
    if isinstance(df[i], list):
        df[i] = sum(df[i])
    else:
        df[i] = dt["review_title"].mean()
dt["review_title"] = pd.to_numeric(df, errors="coerce")
print(dt["review_title"])

df = dt["review_content"]

for i in df.index:
    if isinstance(df[i], list):
        df[i] = sum(df[i])
    else:
        df[i] = dt["review_content"].mean()
dt["review_content"] = pd.to_numeric(df, errors="coerce")
print(dt["review_content"])

0       6.406571
1       6.061835
2       7.701186
3       7.213303
4       5.997835
          ...   
1460    3.826340
1461    1.638767
1462    4.024213
1463    5.815984
1464    4.679313
Name: about_product, Length: 1462, dtype: float64
0       3.205097
1       3.454594
2       3.098753
3       3.645471
4       3.454400
          ...   
1460    3.254151
1461    4.678732
1462    2.821205
1463    2.855877
1464    3.159813
Name: review_title, Length: 1462, dtype: float64
0       5.079742
1       6.691733
2       2.946653
3       5.483776
4       7.155735
          ...   
1460    5.550665
1461    6.906350
1462    6.410972
1463    3.816183
1464    6.898724
Name: review_content, Length: 1462, dtype: float64


**CONVERTING PRODUCT NAME AND CATEGORY TO NUMERICAL**

In [17]:
print(dt.dtypes)
# Label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
dt["product_name"] = label_encoder.fit_transform(dt['product_name'])
dt["product_name"] = pd.to_numeric(dt["product_name"], errors="coerce")

dt["category"] = label_encoder.fit_transform(dt['category'])
dt["category"] = pd.to_numeric(dt["category"], errors="coerce")
print(dt.dtypes)

product_name            object
category                object
discounted_price       float64
actual_price           float64
discount_percentage      int64
rating                 float64
rating_count             int64
about_product          float64
review_title           float64
review_content         float64
dtype: object
product_name             int32
category                 int32
discounted_price       float64
actual_price           float64
discount_percentage      int64
rating                 float64
rating_count             int64
about_product          float64
review_title           float64
review_content         float64
dtype: object


**SPLITTING THE DATASET INTO DEPENDENT AND INDEPENDENT FEATURES**

In [18]:
x = dt.drop(columns=["actual_price"])
y = dt["actual_price"]

**DEFINING TRAINING AND TESTING DATA**

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)
print(len(x_train),len(x_test),len(y_train),len(y_test))

1023 439 1023 439


**CREATING MODEL WITH XGBOOST REGRESSORt**

In [20]:
xgb_regr = XGBRegressor()

**FITTING THE MODEL TO THE TRAINING DATA**

In [21]:
xgb_regr.fit(x_train,y_train)

**PREDICTIONS ON THE TEST SET**

In [22]:
y_pred = xgb_regr.predict(x_test)

**PERFORMANCE MEASURE CALCULATION**

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}, MSE: {mse}, R-squared: {r2}")


MAE: 297.19721931822477, MSE: 698440.5713543842, R-squared: 0.9934717944494367


In [24]:
def calculate_price(pn,cat,dpri,dperc,rate,rate_c,abp,rev_t,rev_c):
    user_input = {
        'product_name': pn,
        'category': cat,
        'discounted_price': dpri,
        'discount_percentage': dperc,
        'rating': rate,
        'rating_count': rate_c,
        'about_product': abp,
        'review_title' : rev_t,
        'review_content' : rev_c
    }

    # Convert user input to a DataFrame
    user_dt = pd.DataFrame([user_input])
    print(user_dt)
    user_dt['product_name'] = user_dt['product_name'].apply(preprocess_text)
    user_dt['product_name'] = user_dt['product_name'].apply(stem_text)
    user_dt["product_name"] = label_encoder.fit_transform(user_dt['product_name'])
    user_dt["product_name"] = pd.to_numeric(user_dt["product_name"], errors="coerce")

    user_dt['category'] = user_dt['category'].apply(preprocess_text)
    user_dt['category'] = user_dt['category'].apply(stem_text)
    user_dt["category"] = label_encoder.fit_transform(user_dt['category'])
    user_dt["category"] = pd.to_numeric(user_dt["category"], errors="coerce")

    about_product_tfidf = tfidf_vectorizer.fit_transform(user_dt['about_product']).toarray()
    about_product_df = pd.DataFrame(about_product_tfidf, columns=[f'about_product_{i}' for i in range(about_product_tfidf.shape[1])])
    user_dt['about_product'] = about_product_df.values.tolist()
    df = user_dt["about_product"]

    for i in df.index:
        if isinstance(df[i], list):
            df[i] = sum(df[i])
        else:
            df[i] = user_dt["about_product"].mean()
    user_dt["about_product"] = pd.to_numeric(df, errors="coerce")

    user_dt['review_title'] = user_dt['review_title'].apply([preprocess_text])
    user_dt['review_title'] = user_dt['review_title'].apply(stem_text)
    review_title_tfidf = tfidf_vectorizer.fit_transform(user_dt['review_title']).toarray()
    review_title_df = pd.DataFrame(review_title_tfidf, columns=[f'review_title_{i}' for i in range(review_title_tfidf.shape[1])])
    user_dt['review_title'] = review_title_df.values.tolist()
    df = user_dt["review_title"]
    for i in df.index:
        if isinstance(df[i], list):
            df[i] = sum(df[i])
        else:
            df[i] = user_dt["review_title"].mean()
    user_dt["review_title"] = pd.to_numeric(df, errors="coerce")


    user_dt['review_content'] = user_dt['review_content'].apply(preprocess_text)
    user_dt['review_content'] = user_dt['review_content'].apply(stem_text)
    review_content_tfidf = tfidf_vectorizer.fit_transform(user_dt['review_content']).toarray()
    review_content_df = pd.DataFrame(review_content_tfidf, columns=[f'review_content_{i}' for i in range(review_content_tfidf.shape[1])])
    user_dt['review_content'] = review_content_df.values.tolist()
    df = user_dt["review_content"]
    for i in df.index:
        if isinstance(df[i], list):
            df[i] = sum(df[i])
        else:
            df[i] = user_dt["review_content"].mean()
    user_dt["review_content"] = pd.to_numeric(df, errors="coerce")

    #print(user_dt)
    #print(user_dt.dtypes)

    user_pred = xgb_regr.predict(user_dt)

    print("Predicted Price:", user_pred[0])
    print(type(user_pred[0]))
    return float(user_pred[0])

In [25]:
import tkinter as tk
from tkinter import ttk
from tkinter import Text
from tkinter import *

def predict_price():
    # Retrieve values from text fields
    product_name = product_name_entry.get("1.0", tk.END).strip()
    category = category_entry.get("1.0", tk.END).strip()
    discounted_price = float(discounted_price_entry.get("1.0", tk.END).strip())
    discount_percentage = float(discount_percentage_entry.get("1.0", tk.END).strip())
    rating = float(rating_entry.get("1.0", tk.END).strip())
    rating_count = int(rating_count_entry.get("1.0", tk.END).strip())
    about_product = about_product_entry.get("1.0", tk.END).strip()
    review_title = review_title_entry.get("1.0", tk.END).strip()
    review_content = review_content_entry.get("1.0", tk.END).strip()

    # Include the code for prediction here
    # You can use the retrieved values for prediction or any other processing

    # For demonstration, printing the retrieved values
    print("Product Name:", product_name)
    print("Category:", category)
    print("Discounted Price:", discounted_price)
    print("Discount Percentage:", discount_percentage)
    print("Rating:", rating)
    print("Rating Count:", rating_count)
    print("About Product:", about_product)
    print("Review Title:", review_title)
    print("Review Content:", review_content)
    final_price = calculate_price(product_name,category,discounted_price,discount_percentage,rating,rating_count,about_product,review_title, review_content)
    predicted_price_var.set(f"Recommended Price(INR): {final_price:.2f}")

# GUI Setup
root = tk.Tk()
root.title("Product Price Recommendation")
root.geometry("3000x3000")  # Adjusted size
root.configure(bg='#FFD700')  # Set background color to gold

# Fonts
font_style = ("cursive", 13, "bold")

heading_label = ttk.Label(root, text="PRICE RECOMMENDATION FOR ONLINE SELLERS", font=("arial", 18, "bold"), background="green", foreground="white")
heading_label.place(relx=0.5, rely=0.01, anchor=tk.CENTER)

# Product Name
product_name_label = ttk.Label(root, text="Product Name:", font=font_style, background='#FFD700')
product_name_label.grid(row=0, column=0, pady=(50, 10), padx=(200, 0), sticky="e")

product_name_entry = Text(root, font=font_style, width="80", height="3")
product_name_entry.grid(row=0, column=1, pady=(50, 10), padx=(50, 10), sticky="w", columnspan=2)

# Category
category_label = ttk.Label(root, text="Category:", font=font_style, background='#FFD700')
category_label.grid(row=1, column=0, pady=10, padx=(200, 0), sticky="e")

category_entry = Text(root, font=font_style,width="80", height="3")
category_entry.grid(row=1, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Discounted Price
discounted_price_label = ttk.Label(root, text="Discounted Price:", font=font_style, background='#FFD700')
discounted_price_label.grid(row=2, column=0, pady=10, padx=(200, 0), sticky="e")

discounted_price_entry = Text(root, font=font_style,width="50", height="2")
discounted_price_entry.grid(row=2, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Discount Percentage
discount_percentage_label = ttk.Label(root, text="Discount Percentage:", font=font_style, background='#FFD700')
discount_percentage_label.grid(row=3, column=0, pady=10, padx=(200, 0), sticky="e")

discount_percentage_entry = Text(root, font=font_style,width="50", height="2")
discount_percentage_entry.grid(row=3, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Rating
rating_label = ttk.Label(root, text="Rating:", font=font_style, background='#FFD700')
rating_label.grid(row=4, column=0, pady=10, padx=(200, 0), sticky="e")

rating_entry = Text(root, font=font_style, width="30", height="2")
rating_entry.grid(row=4, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Rating Count
rating_count_label = ttk.Label(root, text="Rating Count:", font=font_style, background='#FFD700')
rating_count_label.grid(row=5, column=0, pady=10, padx=(200, 0), sticky="e")

rating_count_entry = Text(root, font=font_style,width="30", height="2")
rating_count_entry.grid(row=5, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# About Product
about_product_label = ttk.Label(root, text="About Product:", font=font_style, background='#FFD700')
about_product_label.grid(row=6, column=0, pady=10, padx=(200, 0), sticky="e")

about_product_entry = Text(root, font=font_style,width="80", height="4")
about_product_entry.grid(row=6, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Review Title
review_title_label = ttk.Label(root, text="Review Title:", font=font_style, background='#FFD700')
review_title_label.grid(row=7, column=0, pady=10, padx=(200, 0), sticky="e")

review_title_entry = Text(root, font=font_style, width="30", height="2")
review_title_entry.grid(row=7, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Review Content
review_content_label = ttk.Label(root, text="Review Content:", font=font_style, background='#FFD700')
review_content_label.grid(row=8, column=0, pady=10, padx=(200, 0), sticky="e")

review_content_entry = Text(root, font=font_style, width="80", height="3")
review_content_entry.grid(row=8, column=1, pady=10, padx=(50, 10), sticky="w", columnspan=2)

# Predict Button
predict_button = ttk.Button(root, text="Recommend Price", command=predict_price, style='TButton')
predict_button.grid(row=9, column=0, columnspan=3, pady=20)

predicted_price_var = tk.StringVar()
predicted_price_entry = ttk.Entry(root, textvariable=predicted_price_var, font=font_style, width="40",state="readonly", foreground="green", background="white")
predicted_price_entry.grid(row=9, column=1, pady=20, padx=(700, 10), sticky="w")

# Run the GUI
root.mainloop()
