In [81]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
# Read the phone-search CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where this exact file location exists.
csv_path = 'C:/Users/study/Downloads/vscode/phone search.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Print a summary of the raw frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

### Pre-processing

In [76]:
# Columns excluded from the analysis, with the reason for each.
columns_to_drop = [
    'unit_price',      # too many missing values
    'unit_count',      # too many missing values
    'coupon_text',     # irrelevant for analysis
    'product_photo',   # not relevant for analysis
    'product_url',     # not relevant for analysis
]

# drop() returns a new frame, so the raw `df` stays untouched.
df_cleaned = df.drop(columns=columns_to_drop)

In [None]:
# Missing values per column after dropping the unwanted columns.
df_cleaned.isna().sum()

In [78]:
# Convert data types.
# The regex patterns are raw strings: in a normal string literal "\$" and "\d"
# are invalid Python escape sequences, which newer Python versions warn about
# (the warning is hidden here by warnings.filterwarnings("ignore")).
CURRENCY_CHARS = r'[\$,]'  # dollar sign and thousands separator

# product_price: strip currency characters, then cast. astype(float) raises on
# anything that still is not numeric.
df_cleaned['product_price'] = (
    df_cleaned['product_price']
    .replace(CURRENCY_CHARS, '', regex=True)
    .astype(float)
)

# product_original_price: same cleanup, but unparseable values become NaN.
df_cleaned['product_original_price'] = pd.to_numeric(
    df_cleaned['product_original_price'].replace(CURRENCY_CHARS, '', regex=True),
    errors='coerce',
)

# Convert 'sales_volume' to numeric by cleaning "4K+" and other text:
# take the first number in the text and scale it by 1000 when the text has a "K".
volume_number = (
    df_cleaned['sales_volume']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)
# na=False makes rows with a missing sales_volume an explicit False rather than
# a NaN that np.where would treat as truthy.
has_k_suffix = df_cleaned['sales_volume'].str.contains('K', na=False)
df_cleaned['sales_volume_cleaned'] = volume_number * np.where(has_k_suffix, 1000, 1)

In [79]:
# Fill missing values.
# Each result is assigned back to the column instead of calling
# df_cleaned[col].fillna(..., inplace=True): that chained in-place form acts on
# a temporary object under pandas Copy-on-Write and leaves df_cleaned unchanged.

# Numeric columns: impute with the column median.
for col in ('product_price', 'product_original_price', 'sales_volume_cleaned'):
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())

# Minimum offer price falls back to the (already imputed) product price.
# NOTE(review): no visible cell parses 'product_minimum_offer_price' to a
# number, so this column may end up mixing text and floats; confirm.
df_cleaned['product_minimum_offer_price'] = (
    df_cleaned['product_minimum_offer_price'].fillna(df_cleaned['product_price'])
)

# Text columns: fill with a fixed placeholder / default.
text_defaults = {
    'delivery': 'Not Available',
    'product_availability': 'Not Available',
    'currency': 'USD',
}
for col, default in text_defaults.items():
    df_cleaned[col] = df_cleaned[col].fillna(default)

# Drop rows where 'product_star_rating' or 'product_num_ratings' is missing
df_cleaned = df_cleaned.dropna(subset=['product_star_rating', 'product_num_ratings'])

# Drop the original 'sales_volume' column as we now have 'sales_volume_cleaned'
df_cleaned = df_cleaned.drop(columns=['sales_volume'])

In [None]:
# Missing values per column after imputation and row removal.
df_cleaned.isna().sum()

### Visualization

In [None]:
# Histogram of product prices (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['product_price'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Product Prices')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Price vs Rating with Prime and Non-Prime categories.
# One line per is_prime value, plotted against star rating.
plt.figure(figsize=(10, 6))
sns.lineplot(
    x='product_star_rating',
    y='product_price',
    hue='is_prime',
    data=df_cleaned,
    palette='Set1',
    # `markers=True` only takes effect together with a `style` variable, so it
    # drew no markers here; `marker='o'` is forwarded to matplotlib and marks
    # every plotted point.
    marker='o',
)
plt.title('Price vs. Rating (Prime and Non-Prime)')
plt.xlabel('Product Star Rating')
plt.ylabel('Product Price ($)')
plt.legend(title='Prime Eligible')
plt.show()

In [None]:
# Histogram of star ratings (10 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['product_star_rating'], bins=10, kde=True, color='green', ax=ax)
ax.set_title('Distribution of Product Star Ratings')
ax.set_xlabel('Star Rating')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Bar plot of the average price at each star rating, split by Prime flag.
# NOTE(review): `ci=None` turns off error bars; newer seaborn releases deprecate
# `ci` in favour of `errorbar=None`. Switch once the installed version is confirmed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_cleaned,
    x='product_star_rating',
    y='product_price',
    hue='is_prime',
    palette='Set1',
    ci=None,
    ax=ax,
)
ax.set_title('Average Price by Rating (Prime and Non-Prime)')
ax.set_xlabel('Product Star Rating')
ax.set_ylabel('Average Product Price ($)')
ax.legend(title='Prime Eligible')
plt.show()

In [None]:
# Heatmap of product counts by (star rating, Prime flag) and availability.
# seaborn is already imported as `sns` in the notebook's first cell.

# NOTE(review): no visible cell creates 'availability_status', so on a fresh
# kernel this grouping would raise KeyError. Use it when present, otherwise
# fall back to the raw 'product_availability' column; confirm which was intended.
availability_col = (
    'availability_status'
    if 'availability_status' in df_cleaned.columns
    else 'product_availability'
)

# Grouping the data for the heatmap: count rows per key combination, then
# unstack() moves the last key (availability) into the columns.
heatmap_data = (
    df_cleaned
    .groupby(['product_star_rating', 'is_prime', availability_col])
    .size()
    .unstack(fill_value=0)
)

# Plotting a heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu', cbar=True)

plt.title('Product Availability by Star Rating and Prime Status')
# Columns are availability values and rows are (rating, Prime) pairs, so the
# axis labels describe exactly that.
plt.xlabel('Availability Status')
plt.ylabel('Product Star Rating / Prime Status')
plt.show()


### Prepare Data for Machine Learning

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures

In [99]:
# Predictors and target for the price model
features = ['product_star_rating', 'product_num_ratings', 'is_prime', 'is_best_seller', 'is_amazon_choice']
y = df_cleaned['product_price']

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['is_prime', 'is_best_seller', 'is_amazon_choice']
X = pd.get_dummies(df_cleaned[features], columns=categorical_cols, drop_first=True)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Linear Regression model on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and score the predictions
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared Score:", r2)

Mean Squared Error (MSE): 39342.80246867711
R-squared Score: -0.10498973367943343


In [105]:
# Predictors and target for the price model
features = ['product_star_rating', 'product_num_ratings', 'is_prime', 'is_best_seller', 'is_amazon_choice']
y = df_cleaned['product_price']

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['is_prime', 'is_best_seller', 'is_amazon_choice']
X = pd.get_dummies(df_cleaned[features], columns=categorical_cols, drop_first=True)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Decision Tree Regressor with a fixed seed (fit() returns the estimator)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out split and score the predictions
y_pred_dt = dt_model.predict(X_test)
mse_dt, r2_dt = mean_squared_error(y_test, y_pred_dt), r2_score(y_test, y_pred_dt)

# Report the evaluation metrics
print("Decision Tree Regressor Mean Squared Error (MSE):", mse_dt)
print("Decision Tree Regressor R-squared Score:", r2_dt)

Decision Tree Regressor Mean Squared Error (MSE): 82310.93483382354
Decision Tree Regressor R-squared Score: -1.3118012000630652


In [111]:
# Select relevant columns for prediction
features = ['product_star_rating', 'product_num_ratings', 'is_prime', 'is_best_seller', 'is_amazon_choice']
y = df_cleaned['product_price']

# Convert categorical variables to numerical
X = pd.get_dummies(
    df_cleaned[features],
    columns=['is_prime', 'is_best_seller', 'is_amazon_choice'],
    drop_first=True,
)

# Split the dataset into training and testing sets (80% train, 20% test).
# The split happens BEFORE the feature transformer is fitted, so nothing is
# ever fitted on held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create polynomial features: fit on the training rows only, then apply the
# same expansion to the test rows.
poly = PolynomialFeatures(degree=2)  # You can adjust the degree
X_train = poly.fit_transform(X_train_raw)
X_test = poly.transform(X_test_raw)

# Initialize and train the Linear Regression model
poly_model = LinearRegression()
poly_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = poly_model.predict(X_test)

# Evaluate the model's performance
mse_poly = mean_squared_error(y_test, y_pred)
r2_poly = r2_score(y_test, y_pred)

# Display model evaluation results
print("Polynomial Regression Mean Squared Error (MSE):", mse_poly)
print("Polynomial Regression R-squared Score:", r2_poly)

Polynomial Regression Mean Squared Error (MSE): 44322.01793450486
Polynomial Regression R-squared Score: -0.24483696433611946
