# Ecommerce Customers

# Problem Statement
An e-commerce company sells clothing online and also offers in-store style
and clothing advice sessions. Customers come in to the store, have sessions/meetings with
a personal stylist, then go home and order the clothes they want through either a mobile app
or the website. The company is trying to decide whether to focus its efforts on the
mobile app experience or on the website, and has asked us to help figure it out.

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
#for decision tree
from sklearn.tree import DecisionTreeRegressor
#for KNN
from sklearn.neighbors import KNeighborsRegressor
# for gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
# for Random forest
from sklearn.ensemble import RandomForestRegressor
#for ridge regression model
from sklearn.linear_model import Ridge
#for Support Vector machine
from sklearn.svm import SVR

In [None]:
# import datasets
data=pd.read_csv('Ecommerce_Customers.csv')

In [None]:
data

In [None]:
data.shape

In [None]:
data.size

In [None]:
# Check null values in the dataset
data.isnull().sum()

In [None]:
# Summary statistics and info about the dataset
data.describe()

In [None]:
data.info()

In [None]:
# Check Duplicates
data.duplicated().any()

In [None]:
# Drop the columns that are not required for prediction
data1 = data.drop(['Email','Address','Avatar'],axis=1)

In [None]:
data1

In [None]:
# Check outliers
data1.boxplot('Avg Session Length')
plt.show()

In [None]:
data1.boxplot('Time on App')
plt.show()

In [None]:
data1.boxplot('Time on Website')
plt.show()

In [None]:
data1.boxplot('Length of Membership')
plt.show()

In [None]:
data1.boxplot()
plt.show()

In [None]:
# Function to remove outliers using IQR method
def remove_outliers(data1):
    """Drop rows that hold an IQR outlier in any numeric column.

    For every numeric column the Tukey fences ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed from the frame exactly as it was passed
    in, so the result does not depend on the order of the columns. A row is
    kept only when each of its numeric values lies on or inside the fences
    of its column.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Input frame; it is not modified. Non-numeric columns play no part in
        the filtering but are carried through to the result.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels and columns.
    """
    # Positional mask, so duplicated index labels cannot cause misalignment.
    keep = np.ones(len(data1), dtype=bool)

    for col in data1.select_dtypes(include='number').columns:
        # Calculate Q1, Q3, and IQR
        q1 = data1[col].quantile(0.25)
        q3 = data1[col].quantile(0.75)
        iqr = q3 - q1

        # Define lower and upper bounds for outliers
        lower_extreme = q1 - 1.5 * iqr
        upper_extreme = q3 + 1.5 * iqr

        # Inclusive bounds: a value sitting exactly on a fence is not an
        # outlier, and a constant column (IQR == 0) keeps its rows instead of
        # emptying the whole frame. NaN compares False, so rows with a
        # missing numeric value are dropped.
        keep &= data1[col].between(lower_extreme, upper_extreme).to_numpy()

    return data1[keep]

In [None]:
# Remove outliers from all columns
new_data = remove_outliers(data1)
print(new_data)

In [None]:
# Box plots after removing the outliers
new_data.boxplot()
plt.show()

In [None]:
# Compute the correlation matrix
corr_matrix=new_data.corr()
corr_matrix

In [None]:
# Draw an annotated heatmap of the correlation matrix
sns.heatmap(corr_matrix,annot=True)
plt.show()

# Visualization 

In [None]:
# Pairplot of new_data (the whole frame is passed; no subset of variables is selected)
sns.pairplot(new_data)
plt.show()

In [None]:
# Correlation heatmap
sns.heatmap(new_data.corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Univariate Analysis
# Histogram of Avg Session Length
plt.figure(figsize=(8, 6))
sns.histplot(new_data['Avg Session Length'], bins=20, kde=True)
plt.title('Distribution of Avg Session Length')
plt.xlabel('Avg Session Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Histogram of Time on App
plt.figure(figsize=(8, 6))
sns.histplot(new_data['Time on App'], bins=20, kde=True)
plt.title('Distribution of Time on App')
plt.xlabel('Time on App')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Histogram of Time on Website
plt.figure(figsize=(8, 6))
sns.histplot(new_data['Time on Website'], bins=20, kde=True)
plt.title('Distribution of Time on Website')
plt.xlabel('Time on Website')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Histogram of Length of Membership
plt.figure(figsize=(8, 6))
sns.histplot(new_data['Length of Membership'], bins=20, kde=True)
plt.title('Distribution of Length of Membership')
plt.xlabel('Length of Membership')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Scatter plot of Avg Session Length vs. Yearly Amount Spent
plt.figure(figsize=(8, 6))
sns.scatterplot(data=new_data, x='Avg Session Length', y='Yearly Amount Spent')
plt.title('Scatter plot of Avg Session Length vs. Yearly Amount Spent')
plt.xlabel('Avg Session Length')
plt.ylabel('Yearly Amount Spent')
plt.show()

In [None]:
# Scatter plot of Time on App vs. Yearly Amount Spent
plt.figure(figsize=(8, 6))
sns.scatterplot(data=new_data, x='Time on App', y='Yearly Amount Spent')
plt.title('Scatter plot of Time on App vs. Yearly Amount Spent')
plt.xlabel('Time on App')
plt.ylabel('Yearly Amount Spent')
plt.show()

In [None]:
# Scatter plot of Time on Website vs. Yearly Amount Spent
plt.figure(figsize=(8, 6))
sns.scatterplot(data=new_data, x='Time on Website', y='Yearly Amount Spent')
plt.title('Scatter plot of Time on Website vs. Yearly Amount Spent')
plt.xlabel('Time on Website')
plt.ylabel('Yearly Amount Spent')
plt.show()

In [None]:
# Scatter plot of Length of Membership vs. Yearly Amount Spent
plt.figure(figsize=(8, 6))
sns.scatterplot(data=new_data, x='Length of Membership', y='Yearly Amount Spent')
plt.title('Scatter plot of Length of Membership vs. Yearly Amount Spent')
plt.xlabel('Length of Membership')
plt.ylabel('Yearly Amount Spent')
plt.show()

# Auto EDA

In [None]:
# Generate the EDA report using sweetviz
import sweetviz as sv

In [None]:
report = sv.analyze(data)

In [None]:
report.show_html('sweetviz_report.html')

# Data Processing

In [None]:
# Split the data into features and target

In [None]:
# Positional split: the first four columns are the features, the fifth is the target.
x = new_data.iloc[:, :4]
y = new_data.iloc[:, 4]

In [None]:
# Import train_test_split to split the data
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing and fit on the remaining 70%, the same
# proportion (test_size=0.30) the cross-validation loops in this notebook use.
# The previous train_size=0.3 fitted on only 30% of the rows.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
train_accuracy = []
test_accuracy = []

#  Model Building

# MODEL 1:  Linear Regression

In [None]:
print("Linear Regression Model:")
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

In [None]:
#train and test predictions
y_train_pred = linear_model.predict(x_train)
y_test_pred = linear_model.predict(x_test)

### MODEL EVALUATION

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
print("Test Set Performance:")
print('---------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

### Cross validation

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Fresh linear regression for the repeated-split check of Model 1; the loop
# below refits it on every split. (A KNN regressor was bound to this name
# before, so the linear model itself was never cross-validated.)
linear_model = LinearRegression()

In [None]:
training_error = []
test_error = []

In [None]:
# Repeated hold-out validation: refit on 99 differently seeded 70/30 splits and
# average the RMSE. random_state follows the loop index so every iteration
# draws a different split (a fixed seed repeats one split 99 times).
# The splits are bound to their own names so that the x_train/x_test/
# y_train/y_test split used to fit the other models is left untouched.
for i in range(1,100):
    X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.30,random_state=i)
    linear_model.fit(X_train,Y_train)
    Y_pred_train = linear_model.predict(X_train)
    Y_pred_test  = linear_model.predict(X_test)
    training_error.append(np.sqrt(mean_squared_error(Y_train,Y_pred_train)))
    test_error.append(np.sqrt(mean_squared_error(Y_test,Y_pred_test)))
print("Cross validation training Error:",np.mean(training_error).round(2))
print("Cross validation test Error:",np.mean(test_error).round(2))
# Reported under the label "variance": mean test RMSE minus mean training RMSE.
print("variance:",(np.mean(test_error)-np.mean(training_error)).round(2))

# MODEL 2: Ridge Regression Model

In [None]:
ridge_model = Ridge(alpha = 1.0)
ridge_model.fit(x_train,y_train)

In [None]:
# training and test predictions
y_train_pred = ridge_model.predict(x_train)
y_test_pred = ridge_model.predict(x_test)


### Model Evaluation

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
print("\nTest Set Performance:")
print('-----------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

# MODEL 3: Decision Tree Model

In [None]:
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(x_train, y_train)

In [None]:
# test prediction and training prediction
y_train_pred = dt_model.predict(x_train)
y_test_pred = dt_model.predict(x_test)

### Model Evaluation

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
print("\nTest Set Performance:")
print('-----------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

### Cross Validation

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Estimator refitted on every split by the loop below. This section belongs to
# Model 3, so it must be a decision tree (it was a KNN regressor before).
gb_model = DecisionTreeRegressor(random_state=42)

In [None]:
training_error = []
test_error = []

In [None]:
# Repeated hold-out validation over 99 differently seeded 70/30 splits;
# the RMSE on each split is recorded and averaged afterwards.
for i in range(1,100):
    X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.30,random_state=i)
    gb_model.fit(X_train,Y_train)
    # Predict with the estimator that was just fitted on this split. Using the
    # earlier dt_model here scored a model that had never seen this split's
    # training rows as such and may have been trained on its test rows.
    Y_pred_train = gb_model.predict(X_train)
    Y_pred_test  = gb_model.predict(X_test)
    training_error.append(np.sqrt(mean_squared_error(Y_train,Y_pred_train)))
    test_error.append(np.sqrt(mean_squared_error(Y_test,Y_pred_test)))
print("Cross validation training Error:",np.mean(training_error).round(2))
print("Cross validation test Error:",np.mean(test_error).round(2))
# Reported under the label "variance": mean test RMSE minus mean training RMSE.
print("variance:",(np.mean(test_error)-np.mean(training_error)).round(2))

# Model 4: Random Forest Model

In [None]:
rf_model = RandomForestRegressor(n_estimators = 100, max_samples=0.8, max_features=4, random_state = 42)
rf_model.fit(x_train,y_train)

In [None]:
# training and test predictions
y_train_pred = rf_model.predict(x_train)
y_test_pred = rf_model.predict(x_test)

### Model Evaluation

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test  = r2_score(y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
# Report the held-out metrics for the random forest (this cell previously
# repeated the training-set figures).
print("\nTest Set Performance:")
print('-----------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

# Model 5: Gradient Boosting Regressor Model

In [None]:
print("\nGradient Boosting Regressor Model:")
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(x_train, y_train)

In [None]:
# training and test predictions
y_train_pred = gb_model.predict(x_train)
y_test_pred = gb_model.predict(x_test)

### Model Evaluation

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
print("\nTest Set Performance:")
print('------------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

### Cross Validation

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Fresh gradient boosting regressor for the repeated-split check of Model 5;
# the loop below refits it on every split. (A KNN regressor was bound to this
# name before, so gradient boosting itself was never cross-validated.)
gb_model = GradientBoostingRegressor(random_state=42)

In [None]:
training_error = []
test_error = []

In [None]:
# Repeated hold-out validation: 99 seeded 70/30 splits, one RMSE pair per split.
for seed in range(1, 100):
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=0.30, random_state=seed
    )
    gb_model.fit(X_train, Y_train)
    rmse_train = np.sqrt(mean_squared_error(Y_train, gb_model.predict(X_train)))
    rmse_test = np.sqrt(mean_squared_error(Y_test, gb_model.predict(X_test)))
    training_error.append(rmse_train)
    test_error.append(rmse_test)

# Average the per-split errors; the last figure is the test-minus-train gap.
mean_train_rmse = np.mean(training_error)
mean_test_rmse = np.mean(test_error)
print("Cross validation training Error:", mean_train_rmse.round(2))
print("Cross validation test Error:", mean_test_rmse.round(2))
print("variance:", (mean_test_rmse - mean_train_rmse).round(2))

# Model 6: K - NEAREST NEIGHBOR REGRESSOR MODEL

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
std=StandardScaler()

In [None]:
# Standardise every column of new_data with the StandardScaler created above.
# NOTE(review): the scaler is fitted on all rows and on every column, including
# the column later taken as the target (sy), and this happens before the
# train/test split. Confirm this is intended; fitting on the training features
# only would keep test-set statistics out of the model inputs, and the MSE
# values of the models below are on the scaled target, not in original units.
sdata=std.fit_transform(new_data)

In [None]:
# convert array into dataframes 
std_data=pd.DataFrame(sdata,columns=new_data.columns)

In [None]:
sx =std_data.iloc[:, 0:4]
sy =std_data.iloc[:, 4]
# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(sx, sy, test_size=0.2, random_state=42)

In [None]:
# Create and train the KNN regressor model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors (K) here
knn_model.fit(X_train, Y_train)

In [None]:
# training and test predictions
y_train_pred = knn_model.predict(X_train)
y_test_pred = knn_model.predict(X_test)

### Model Evaluation

In [None]:
mse_train = mean_squared_error(Y_train, y_train_pred)
mse_test = mean_squared_error(Y_test, y_test_pred)
r2_train = r2_score(Y_train, y_train_pred)
r2_test = r2_score(Y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
print("\nTest Set Performance:")
print('-----------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

# Model 7 - Support Vector Machine Model

In [None]:
svm_model = SVR(kernel='linear', C = 1.0)
svm_model.fit(X_train,Y_train)

In [None]:
# training and test predictions
y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

## Model Evaluation

In [None]:
mse_train = mean_squared_error(Y_train, y_train_pred)
mse_test = mean_squared_error(Y_test, y_test_pred)
r2_train = r2_score(Y_train, y_train_pred)
r2_test = r2_score(Y_test, y_test_pred)
train_accuracy.append(r2_train)
test_accuracy.append(r2_test)

In [None]:
print("Training Set Performance:")
print('-------------------------')
print("Mean Squared Error:", mse_train)
print("R-squared Score:", r2_train)

In [None]:
print("\nTest Set Performance:")
print('-----------------------')
print("Mean Squared Error:", mse_test)
print("R-squared Score:", r2_test)

In [None]:
# Bar chart of the training R-squared recorded for each model, in the order
# the scores were appended to train_accuracy.
# The seaborn style sheets were renamed in Matplotlib 3.6 and the old names
# removed in 3.8, so use whichever spelling this installation provides.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
# The style is selected before the figure is created so it applies to it too.
plt.figure(figsize=(15,8))
# One bar per recorded score, numbered from 1, instead of a hard-coded count.
model_ids = range(1, len(train_accuracy) + 1)
plt.bar(model_ids,train_accuracy,color="blue")
plt.plot(model_ids,train_accuracy,color="black")
plt.title('Training Accuracy',size =25)
plt.ylim(0.8,1)
plt.xticks(model_ids)
plt.xlabel("Model",size=20)
plt.ylabel("Accuracy",size=20)
plt.show()

In [None]:
# Bar chart of the test R-squared recorded for each model, in the order
# the scores were appended to test_accuracy.
# The seaborn style sheets were renamed in Matplotlib 3.6 and the old names
# removed in 3.8, so use whichever spelling this installation provides.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
# The style is selected before the figure is created so it applies to it too.
plt.figure(figsize=(15,8))
plt.title('Testing Accuracy',size = 25)
# One bar per recorded score, numbered from 1, instead of a hard-coded count.
model_ids = range(1, len(test_accuracy) + 1)
plt.bar(model_ids,test_accuracy,color="blue")
plt.plot(model_ids,test_accuracy,color="black")
plt.ylim(0.8,1)
plt.xticks(model_ids)
plt.xlabel("Model",size=20)
plt.ylabel("Accuracy",size=20)
plt.show()