In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the raw sales records; the path is relative to the notebook's working directory.
data = pd.read_csv('Amazon Sales data.csv')

In [None]:
# Preview the freshly loaded frame.
data

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the frame's columns (pandas defaults).
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row.
data.duplicated().sum()

In [None]:
# Column labels, for reference when selecting columns below.
data.columns

In [None]:
# Parse both date columns from their CSV representation into datetime64 values.
for date_col in ('Ship Date', 'Order Date'):
    data[date_col] = pd.to_datetime(data[date_col])

In [None]:
# Re-inspect the frame after the date conversion.
data

In [None]:
# Force the descriptive columns to plain Python strings.
for text_col in ('Region', 'Country', 'Item Type', 'Sales Channel', 'Order Priority'):
    data[text_col] = data[text_col].astype(str)

In [None]:
# Summary statistics restricted to the quantity / price / money columns.
data.loc[:, [
    'Units Sold',
    'Unit Price',
    'Unit Cost',
    'Total Revenue',
    'Total Cost',
    'Total Profit',
]].describe()

In [None]:
# Derive month, year and 'YYYY-MM' features from the order date, then drop the raw column.
# NOTE: 'Order Date' no longer exists after this cell, so re-running it on the same `data` raises KeyError.
data = data.assign(**{
    'Order Month': data['Order Date'].dt.month,
    'Order Year': data['Order Date'].dt.year,
    'Order Date MonthYear': data['Order Date'].dt.strftime('%Y-%m'),
}).drop(columns=['Order Date'])

In [None]:
# Inspect the frame with the derived order-date columns.
data

In [None]:
# Total profit per calendar month number; months from different years are summed together.
sales_trend_month = data.groupby('Order Month')['Total Profit'].sum()

In [None]:
# Display the per-month profit totals.
sales_trend_month

In [None]:
# Bar chart of total profit per order month.
# The former `x=`/`y=` keywords were dropped: they select DataFrame columns and are
# not used when plotting a Series (the index is the x axis, the values the bars).
sales_trend_month.plot(kind='bar')
plt.title('Total Profit by Order Month')
plt.xlabel('Order Month')
plt.ylabel('Total Profit')
plt.show()

In [None]:
# Total profit per order year.
sales_trend_year = data.groupby('Order Year')['Total Profit'].sum()

In [None]:
# Display the per-year profit totals.
sales_trend_year

In [None]:
# Bar chart of total profit per order year.
# The former `x=`/`y=` keywords were dropped: they select DataFrame columns and are
# not used when plotting a Series (the index is the x axis, the values the bars).
sales_trend_year.plot(kind='bar')
plt.title('Total Profit by Order Year')
plt.xlabel('Order Year')
plt.ylabel('Total Profit')
plt.show()

In [None]:
# Total profit per 'YYYY-MM' period; the zero-padded string keys sort chronologically.
sales_trend_yearmonth = data.groupby('Order Date MonthYear')['Total Profit'].sum()

In [None]:
# Display the per-period profit totals.
sales_trend_yearmonth

In [None]:
# Wide figure so the many 'YYYY-MM' tick labels stay readable.
plt.figure(figsize=(16, 6))
# `x=`/`y=` were dropped: they are DataFrame-only and not used when plotting a Series.
sales_trend_yearmonth.plot(kind='bar')
# The plotted quantity is Total Profit, so the title says so.
plt.title('Total Profit by Order Year-Month')
plt.xlabel('Order MonthYear')
plt.ylabel('Total Profit')
plt.show()


In [None]:
# Show every country in the frequency table, but lift the row limit only for this
# one display. A global pd.set_option('display.max_rows', None) would stay active for
# the rest of the session and make every later frame display render all of its rows.
with pd.option_context('display.max_rows', None):
    # `display` is provided by the IPython/Jupyter kernel.
    display(data['Country'].value_counts())

In [None]:
# Order counts per country, most frequent first (value_counts sorts descending).
country_counts = data['Country'].value_counts()
country_names = country_counts.index
country_val = country_counts.values

# Pie of the 15 most frequent countries; the percentages are relative to these 15 only.
fig, ax = plt.subplots(figsize=(9, 9))
ax.pie(country_val[:15], labels=country_names[:15], autopct='%1.2f%%')
plt.show()

In [None]:
# Collapse the order-level rows to one row per country first. Passing the raw frame
# gives the map many values for the same location, so it would not show the
# per-country total that the title promises.
profit_by_country = data.groupby('Country', as_index=False)['Total Profit'].sum()

fig = px.choropleth(
    data_frame=profit_by_country,
    locations='Country',
    locationmode='country names',  # match on the country name strings in the 'Country' column
    color='Total Profit',
    title='Total Profit by Country',
    color_continuous_scale='Viridis',
)
fig.show()

In [None]:
# Horizontal box plot of the Total Profit distribution.
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x='Total Profit')
plt.gca().set(
    xlabel='Total Profit',
    ylabel='Value',
    title='Box Plot of Total Profit',
)
plt.show()

In [None]:
def detect_outliers(dataframe, column_name, threshold=2):
    """Find rows whose value in ``column_name`` is a z-score outlier.

    A value is an outlier when ``abs((value - mean) / std) > threshold``, where
    ``mean`` and ``std`` are computed over the whole column (``np.std`` default,
    i.e. population standard deviation).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column to inspect.
    column_name : str
        Label of a numeric column in ``dataframe``.
    threshold : float, default 2
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    outliers : list of int
        Positional (0-based) row indices of the outliers.
    outlier_rows : pandas.DataFrame
        The corresponding rows, selected with ``dataframe.iloc``.

    Raises
    ------
    ValueError
        If ``column_name`` is not a column of ``dataframe``.
    """
    if column_name not in dataframe.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the dataframe.")

    column = dataframe[column_name].to_numpy()

    outliers = []
    # An empty column has no mean; nothing can be flagged.
    if column.size > 0:
        mean = np.mean(column)
        std = np.std(column)
        # A constant column has zero spread: no value deviates from the mean, and
        # dividing by std would only produce 0/0 warnings.
        if std != 0:
            z_scores = (column - mean) / std
            # NaN z-scores compare False, so missing values are never flagged.
            outliers = np.flatnonzero(np.abs(z_scores) > threshold).tolist()

    outlier_rows = dataframe.iloc[outliers]

    return outliers, outlier_rows

In [None]:
# detect_outliers returns a (row positions, matching rows) tuple; outliers1 holds both parts.
outliers1 = detect_outliers(data, 'Total Profit')
outliers1

In [None]:
# Horizontal box plot of the Total Cost distribution.
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['Total Cost'])
plt.xlabel('Total Cost')
plt.ylabel('Value')
# Title corrected: it previously read 'Total Profit' although Total Cost is plotted.
plt.title('Box Plot of Total Cost')
plt.show()

In [None]:
# (row positions, matching rows) of the Total Cost outliers.
outliers2 = detect_outliers(data, 'Total Cost')
outliers2

In [None]:
# Horizontal box plot of the Total Revenue distribution.
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x='Total Revenue')
plt.gca().set(
    title='Box Plot of Total Revenue',
    xlabel='Total Revenue',
    ylabel='Value',
)
plt.show()

In [None]:
# (row positions, matching rows) of the Total Revenue outliers.
outliers3 = detect_outliers(data, 'Total Revenue')
outliers3

In [None]:
# Horizontal box plot of the Unit Cost distribution.
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x='Unit Cost')
plt.gca().set(
    title='Box Plot of Unit Cost',
    xlabel='Unit Cost',
    ylabel='Value',
)
plt.show()

In [None]:
# Horizontal box plot of the Unit Price distribution.
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x='Unit Price')
plt.gca().set(
    title='Box Plot of Unit Price',
    xlabel='Unit Price',
    ylabel='Value',
)
plt.show()

In [None]:
# Horizontal box plot of the Units Sold distribution.
plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x='Units Sold')
plt.gca().set(
    title='Box Plot of Units Sold',
    xlabel='Units Sold',
    ylabel='Value',
)
plt.show()

In [None]:
# Total revenue per item type, largest first.
revenue_by_category = (
    data.groupby('Item Type')['Total Revenue']
    .sum()
    .sort_values(ascending=False)
)
revenue_by_category

In [None]:
# Total profit per item type, largest first.
profit_by_category = (
    data.groupby('Item Type')['Total Profit']
    .sum()
    .sort_values(ascending=False)
)
profit_by_category

In [None]:
# Bar chart of profit per item type, in descending order of profit.
plt.figure(figsize=(16, 6))
profit_by_category.plot.bar()
plt.gca().set(
    title='Profit by Item Type',
    xlabel='Item Type',
    ylabel='Total Profit',
)
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Same per-item-type profit totals drawn as a marked line.
plt.figure(figsize=(16, 6))
profit_by_category.plot.line(marker='o')
plt.gca().set(
    title='Profit by Item Type',
    xlabel='Item Type',
    ylabel='Total Profit',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlation between the three monetary totals, drawn as an annotated heatmap.
corr_matrix = data.loc[:, ['Total Revenue', 'Total Cost', 'Total Profit']].corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.gca().set_title('Correlation Heatmap')
plt.show()

In [None]:
# Preview only the first rows before encoding instead of rendering the whole frame.
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.
# The single encoder is re-fitted for every column, so afterwards `label` only
# remembers the class mapping of the last column ("Order Priority").
label = LabelEncoder()
for categorical_col in ("Item Type", "Sales Channel", "Order Priority"):
    data[categorical_col] = label.fit_transform(data[categorical_col])

In [None]:
# Preview the first rows to confirm the categorical columns now hold integer codes.
data.head()

In [None]:
# Start the modelling frame from `data`; the column drops below rebind `df` and leave `data` untouched.
df = pd.DataFrame(data)

In [None]:
# Remove the identifier, geography and date columns that are not used as model features.
df = df.drop(
    columns=["Region", "Country", "Order Date MonthYear", "Order ID", "Ship Date"]
)

In [None]:
# Inspect the modelling frame after the column drops.
df

In [None]:
# First five rows of the modelling frame.
df.head(5)

In [None]:
# Last ten rows of the modelling frame.
df.tail(10)

In [None]:
# Plain assignment: df_auto_ml is another name for the same object as df, not a copy.
df_auto_ml = df

In [None]:
!pip install --pre pycaret

In [None]:
# Import only the PyCaret functions this notebook calls instead of a wildcard, so the
# origin of each name is visible and nothing else is pulled into the namespace.
from pycaret.regression import (
    compare_models,
    create_model,
    plot_model,
    predict_model,
    setup,
    tune_model,
)

In [None]:
# Initialise the PyCaret regression experiment on df_auto_ml with 'Total Profit' as the target.
# NOTE(review): df_auto_ml still contains 'Total Revenue' and 'Total Cost'; if profit is
# revenue minus cost, these features leak the target — confirm before trusting the scores.
reg = setup(data = df_auto_ml, target = 'Total Profit', session_id=123)
# Compare the candidate regressors; presumably returns the top-ranked one — see PyCaret docs.
best_model = compare_models()

In [None]:
# Build the model registered under the PyCaret ID 'llar'; it is selected by hand here,
# and `best_model` from the comparison above is not referenced again in this notebook.
llar = create_model('llar')


In [None]:
# Hyper-parameter tuning of the 'llar' model with PyCaret's default tuning settings.
tuned_llar_model = tune_model(llar)

In [None]:
# Default diagnostic plot for the tuned model (no plot type specified).
plot_model(tuned_llar_model)

In [None]:
# Diagnostic plot of type "error" for the tuned model.
plot_model(tuned_llar_model, plot="error")

In [None]:
# Score the tuned model; no data is passed, so PyCaret chooses the evaluation set
# (presumably the hold-out split created by setup — confirm).
predict_model(tuned_llar_model)

In [None]:
# Feature matrix and target for the hand-built scikit-learn model.
# NOTE(review): 'Total Revenue' and 'Total Cost' are kept as features; if profit is
# revenue minus cost, the target is an exact linear function of them — confirm this is intended.
X = df[['Item Type', 'Sales Channel', 'Order Priority', 'Units Sold', 'Unit Price', 'Unit Cost', 'Total Revenue', 'Total Cost', 'Order Month', 'Order Year']]
y = df['Total Profit']

In [None]:
# Preview the first rows of the feature matrix instead of rendering all of it.
X.head()

In [None]:
# Preview the first target values instead of rendering the whole series.
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted transform to
# the test split. Without the second line the model is trained on standardized features
# but evaluated on raw ones, which invalidates every later predict/score on X_test.
# NOTE: both names are overwritten with scaled arrays, so run this cell once per split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# Ordinary least-squares model fitted on the scaled training split.
regression = LinearRegression()
regression.fit(X_train,y_train)

In [None]:
# 5-fold cross-validation on the training split, scored with negated mean squared error.
mse = cross_val_score(regression,X_train,y_train,scoring="neg_mean_squared_error",cv=5)
# Flip the sign and take the square root: despite its name, `mse` now holds per-fold RMSE values.
mse = np.sqrt(-mse)
mse

In [None]:
# Average of the per-fold RMSE values computed above.
mse.mean()

In [None]:
# Predict on the held-out split; X_test must be on the same scale the model was trained on.
reg_pred = regression.predict(X_test)

In [None]:
# Display the predicted profit values for the test split.
reg_pred

In [None]:
# Kernel density estimate of the residuals (prediction minus actual) on the test split.
sns.displot(reg_pred.flatten() - y_test,kind='kde', height=5, aspect=2)
plt.show()


In [None]:
# Empirical cumulative distribution of the same residuals (prediction minus actual).
sns.displot(reg_pred.flatten() - y_test,kind='ecdf', height=5, aspect=2)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted profit on the test split; points on the diagonal are perfect predictions.
plt.scatter(y_test, reg_pred)
plt.gca().set(xlabel='Actual Values', ylabel='Predicted Values')
plt.show()

In [None]:
import plotly.graph_objs as go
import pandas as pd
from sklearn.linear_model import LinearRegression

# Fit a fresh linear model on the training split and score it on the test split.
# NOTE: this rebinds `reg`, which earlier held the result of the PyCaret setup() call.
# X_test must be on the same scale as X_train for r2 to be meaningful.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
r2 = reg.score(X_test, y_test)

# Take the feature names from X itself, so this chart cannot drift out of sync with
# the feature selection the way a second hand-typed list of column names can.
feature_names = list(X.columns)
X_train_df = pd.DataFrame(X_train, columns=feature_names)

# "Importance" here is the signed regression coefficient of each feature.
coef = reg.coef_
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': coef.ravel()})  # ravel: coef as a 1D array
feature_importances = feature_importances.sort_values('Importance', ascending=False)

fig = go.Figure()
fig.add_trace(go.Bar(x=feature_importances['Feature'], y=feature_importances['Importance']))
fig.update_layout(title='Feature Importance (R-squared = {:.2f})'.format(r2),
                  xaxis_title='Feature',
                  yaxis_title='Importance',
                  xaxis_tickangle=-45)
fig.show()
