In [4]:
import pandas as pd
import re

# Load your dataset
df = pd.read_csv('electronics_product.csv')

# Function to clean price columns
def clean_price(price):
    """
    This function takes a price string, removes non-numeric characters,
    and converts it into a float value for further processing.
    """
    # Convert price to string before applying regex
    price = str(price)
    clean_price = re.sub(r'[^\d.]', '', price)  # Remove any non-numeric characters
    return float(clean_price) if clean_price else None

# Apply the function to both price columns
df['discount_price'] = df['discount_price'].apply(clean_price)
df['actual_price'] = df['actual_price'].apply(clean_price)

# Check for successful conversion
df[['discount_price', 'actual_price']].head()

Unnamed: 0,discount_price,actual_price
0,10999.0,18999.0
1,18999.0,19999.0
2,1999.0,2299.0
3,15999.0,24999.0
4,18999.0,19999.0


In [5]:
# Check for missing values
df.isnull().sum()


Unnamed: 0          0
name                0
main_category       0
sub_category        0
image               0
link                0
ratings            95
no_of_ratings      95
discount_price    484
actual_price       70
dtype: int64

In [6]:

df = df.drop('Unnamed: 0', axis=1)

In [7]:
# Create a copy of the DataFrame to avoid SettingWithCopyWarning
df_cleaned = df.copy()

# Fill missing ratings and no_of_ratings with 0
df_cleaned['ratings'] = df_cleaned['ratings'].fillna(0)
df_cleaned['no_of_ratings'] = df_cleaned['no_of_ratings'].fillna(0)

# Fill missing discount prices with the mean discount price
df_cleaned['discount_price'] = df_cleaned['discount_price'].fillna(df_cleaned['discount_price'].mean())

# Fill missing actual prices with the mean actual price
df_cleaned['actual_price'] = df_cleaned['actual_price'].fillna(df_cleaned['actual_price'].mean())

# Check if any missing values remain
missing_values = df_cleaned.isnull().sum()
print(missing_values)
df_cleaned.shape

name              0
main_category     0
sub_category      0
image             0
link              0
ratings           0
no_of_ratings     0
discount_price    0
actual_price      0
dtype: int64


(9600, 9)

In [9]:

df_cleaned = df_cleaned.drop_duplicates()
df_cleaned.shape

df_cleaned['name'] = df_cleaned['name'].str.lower()
df_cleaned.shape

print(df_cleaned[['discount_price', 'actual_price']].describe())

print(df_cleaned.dtypes)

df_cleaned['ratings'] = pd.to_numeric(df_cleaned['ratings'], errors='coerce')
df_cleaned['no_of_ratings'] = pd.to_numeric(df_cleaned['no_of_ratings'], errors='coerce')

print(df_cleaned[['ratings', 'no_of_ratings']].isnull().sum())


       discount_price   actual_price
count     9600.000000    9600.000000
mean      2976.275592    5009.698254
std       8639.082346   12164.404252
min         10.000000      20.000000
25%        299.000000     849.000000
50%        620.500000    1499.000000
75%       1895.500000    3499.000000
max     134999.000000  161999.000000
name               object
main_category      object
sub_category       object
image              object
link               object
ratings            object
no_of_ratings      object
discount_price    float64
actual_price      float64
dtype: object
ratings            24
no_of_ratings    5012
dtype: int64


In [10]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
df_cleaned[['ratings', 'no_of_ratings']] = imputer.fit_transform(df_cleaned[['ratings', 'no_of_ratings']])

print(df_cleaned[['ratings', 'no_of_ratings']].isnull().sum())

df_cleaned.shape

ratings          0
no_of_ratings    0
dtype: int64


(9600, 9)

In [11]:

import numpy as np

# Create new features in the cleaned DataFrame
df_cleaned['name_length'] = df_cleaned['name'].str.len()
df_cleaned['price_difference'] = df_cleaned['actual_price'] - df_cleaned['discount_price']
df_cleaned['rating_ratio'] = df_cleaned['ratings'] / df_cleaned['no_of_ratings']
df_cleaned['log_actual_price'] = np.log1p(df_cleaned['actual_price'])
df_cleaned['discount_percentage'] = (df_cleaned['price_difference'] / df_cleaned['actual_price']) * 100

# Check the shape and columns of the updated DataFrame
print(df_cleaned.shape)

df_cleaned.shape


(9600, 14)


(9600, 14)

In [12]:

# Display rows with negative discount_price
negative_discounts = df_cleaned[df_cleaned['discount_price'] < 0]
print("Negative discount_price rows:")
print(negative_discounts)

# Display rows with negative actual_price
negative_actuals = df_cleaned[df_cleaned['actual_price'] < 0]
print("Negative actual_price rows:")
print(negative_actuals)

# Check for non-numeric values in the relevant columns
for col in ['discount_price', 'actual_price', 'ratings', 'no_of_ratings',
            'discount_percentage', 'name_length', 'price_difference',
            'rating_ratio', 'log_actual_price']:
    non_numeric = df_cleaned[col][pd.to_numeric(df_cleaned[col], errors='coerce').isnull()]
    if not non_numeric.empty:
        print(f"Non-numeric values found in column '{col}':")
        print(non_numeric)

Negative discount_price rows:
Empty DataFrame
Columns: [name, main_category, sub_category, image, link, ratings, no_of_ratings, discount_price, actual_price, name_length, price_difference, rating_ratio, log_actual_price, discount_percentage]
Index: []
Negative actual_price rows:
Empty DataFrame
Columns: [name, main_category, sub_category, image, link, ratings, no_of_ratings, discount_price, actual_price, name_length, price_difference, rating_ratio, log_actual_price, discount_percentage]
Index: []
Non-numeric values found in column 'rating_ratio':
365    NaN
507    NaN
1090   NaN
1245   NaN
1292   NaN
        ..
9059   NaN
9188   NaN
9472   NaN
9482   NaN
9541   NaN
Name: rating_ratio, Length: 95, dtype: float64


In [14]:

for col in ['discount_price', 'actual_price', 'ratings', 'no_of_ratings',
            'discount_percentage', 'name_length', 'price_difference',
            'rating_ratio', 'log_actual_price']:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')  # Convert non-numeric to NaN

# Impute NaN values (you can choose a different strategy if needed)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
df_cleaned[['discount_price', 'actual_price', 'ratings', 'no_of_ratings',
             'discount_percentage', 'name_length', 'price_difference',
             'rating_ratio', 'log_actual_price']] = imputer.fit_transform(
    df_cleaned[['discount_price', 'actual_price', 'ratings', 'no_of_ratings',
                 'discount_percentage', 'name_length', 'price_difference',
                 'rating_ratio', 'log_actual_price']])

df_cleaned.shape

(9600, 14)

In [15]:

# Display rows with negative discount_price
negative_discounts = df_cleaned[df_cleaned['discount_price'] < 0]
print("Negative discount_price rows:")
print(negative_discounts)

# Display rows with negative actual_price
negative_actuals = df_cleaned[df_cleaned['actual_price'] < 0]
print("Negative actual_price rows:")
print(negative_actuals)


Negative discount_price rows:
Empty DataFrame
Columns: [name, main_category, sub_category, image, link, ratings, no_of_ratings, discount_price, actual_price, name_length, price_difference, rating_ratio, log_actual_price, discount_percentage]
Index: []
Negative actual_price rows:
Empty DataFrame
Columns: [name, main_category, sub_category, image, link, ratings, no_of_ratings, discount_price, actual_price, name_length, price_difference, rating_ratio, log_actual_price, discount_percentage]
Index: []


In [16]:

df_cleaned = pd.get_dummies(df_cleaned, columns=['main_category', 'sub_category'], drop_first=True)

df_cleaned.shape

# Get the actual dummy column names created by pd.get_dummies
main_category_dummy_cols = [col for col in df_cleaned.columns if col.startswith('main_category_')]

# Print the first two dummy column names for example
print(main_category_dummy_cols[:2])


[]


In [17]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import joblib



# Function to preprocess the data
def preprocess_data(df):
    # Remove outliers using IQR method for the discount price
    Q1 = df['discount_price'].quantile(0.25)
    Q3 = df['discount_price'].quantile(0.75)
    IQR = Q3 - Q1
    df = df[(df['discount_price'] >= (Q1 - 1.5 * IQR)) & (df['discount_price'] <= (Q3 + 1.5 * IQR))]

    # Yeo-Johnson Transformation for the target variable (discount price)
    transformer = PowerTransformer(method='yeo-johnson')
    df['yeojohnson_discount'] = transformer.fit_transform(df[['discount_price']])

    return df, transformer

# Preprocess the dataset and get the transformer
df_cleaned, transformer = preprocess_data(df_cleaned)

# Define features and target variable
features = ['ratings', 'no_of_ratings', 'actual_price', 
        'rating_ratio', 'log_actual_price']
X = df_cleaned[features]
y = df_cleaned['yeojohnson_discount']  # Using transformed discount price as the target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
model = xgb.XGBRegressor(random_state=42)
model.fit(X_train, y_train)

# Save the model and the transformer for later use
joblib.dump(model, 'xgboost_model.pkl')
joblib.dump(transformer, 'yeo_johnson_transformer.pkl')

print("Model and transformer saved successfully!")

# Function to make predictions and inverse transform the output
def predict_and_inverse_transform(features):
    # Load the model and transformer
    model = joblib.load('xgboost_model.pkl')
    transformer = joblib.load('yeo_johnson_transformer.pkl')

    # Make predictions
    predictions = model.predict(features)

    # Inverse transform the predictions to get original discount price
    original_predictions = transformer.inverse_transform(predictions.reshape(-1, 1))

    return original_predictions.flatten()

# Calculate RMSE on the test set
test_predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

# Inverse transform the test predictions for interpretation
original_test_predictions = transformer.inverse_transform(test_predictions.reshape(-1, 1)).flatten()

print(f"Root Mean Squared Error (RMSE): {rmse}")


Model and transformer saved successfully!
Root Mean Squared Error (RMSE): 0.6340970870987891


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['yeojohnson_discount'] = transformer.fit_transform(df[['discount_price']])


In [29]:
df_cleaned.to_csv('your_data.csv', index=False)


In [34]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error
import joblib


# Function to preprocess the data
def preprocess_data(df):
    # Remove outliers using IQR method for the discount price
    Q1 = df['discount_price'].quantile(0.25)
    Q3 = df['discount_price'].quantile(0.75)
    IQR = Q3 - Q1
    df = df[(df['discount_price'] >= (Q1 - 1.5 * IQR)) & (df['discount_price'] <= (Q3 + 1.5 * IQR))]

    # Yeo-Johnson Transformation for the target variable (discount price)
    transformer = PowerTransformer(method='yeo-johnson')
    df['yeojohnson_discount'] = transformer.fit_transform(df[['discount_price']])

    return df, transformer

# Preprocess the dataset and get the transformer
df_cleaned, transformer = preprocess_data(df_cleaned)

# Define features and target variable
features = ['ratings', 'no_of_ratings', 'actual_price'
            , 'rating_ratio', 'log_actual_price']
X = df_cleaned[features]
y = df_cleaned['yeojohnson_discount']  # Using transformed discount price as the target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Save the model and the transformer for later use
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(transformer, 'yeo_johnson_transformer.pkl')

print("Model and transformer saved successfully!")

# Function to make predictions and inverse transform the output
def predict_and_inverse_transform(features):
    # Load the model and transformer
    model = joblib.load('random_forest_model.pkl')
    transformer = joblib.load('yeo_johnson_transformer.pkl')

    # Make predictions
    predictions = model.predict(features)

    # Inverse transform the predictions to get original discount price
    original_predictions = transformer.inverse_transform(predictions.reshape(-1, 1))

    return original_predictions.flatten()

# Calculate RMSE on the test set
test_predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

# Inverse transform the test predictions for interpretation
original_test_predictions = transformer.inverse_transform(test_predictions.reshape(-1, 1)).flatten()

print(f"Root Mean Squared Error (RMSE): {rmse}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['yeojohnson_discount'] = transformer.fit_transform(df[['discount_price']])


Model and transformer saved successfully!
Root Mean Squared Error (RMSE): 0.6143130286609501


