In [None]:
# Version 1. Descriptive Analysis
import io

from google.colab import files
import pandas as pd

# files.upload() returns a dict mapping each uploaded file's name to its bytes.
# When a file with the same name already exists, Colab stores the new upload under
# a de-duplicated name (e.g. "Sample - Superstore (22).csv"), so reading the
# hard-coded path would silently load an older copy. Parse the bytes that were
# just uploaded instead.
uploaded = files.upload()
if not uploaded:
    raise ValueError('No file was uploaded; expected the raw Superstore CSV.')
# Only the first uploaded file is used.
raw_name, raw_bytes = next(iter(uploaded.items()))
print('Loading uploaded file:', raw_name)
df = pd.read_csv(io.BytesIO(raw_bytes), encoding='latin1')

# Structural overview of the raw data. Only the last expression of a cell is
# rendered, so every summary is printed explicitly (df.info() prints by itself).
df.info()
print(df.shape)
print(df.isna().sum())
print(df.describe())

# Convert the dtype of Order Date & Ship Date from string to datetime
for date_col in ('Order Date', 'Ship Date'):
    df[date_col] = pd.to_datetime(df[date_col])
# df.info() writes its report directly and returns None, so wrapping it in
# print() would emit an extra "None" line.
df.info()

# Check outliers in Sales and Profit.
# Sales: the author reported 5 orders above the Sales threshold, most likely
# genuine data --> keep them, but use a log scale (or an outlier-removal mode)
# when visualising if the chart is skewed.
# Profit: orders below the Profit threshold are removed so they do not distort
# the data.
# NOTE(review): the original note described a single order with Profit < -5000,
# but the filter has always been -1000, which removes more rows than that (the
# cleaned file loaded later in this notebook has 9972 of the 9994 raw rows).
# The -1000 behaviour is kept here; confirm which cutoff is intended.
SALES_OUTLIER_THRESHOLD = 10000
PROFIT_OUTLIER_THRESHOLD = -1000

profit_outliers = df[df['Profit'] < PROFIT_OUTLIER_THRESHOLD]
print(df[df['Sales'] > SALES_OUTLIER_THRESHOLD])
print(profit_outliers)

# Remove the Profit outliers (returns a new frame instead of mutating in place)
df = df.drop(index=profit_outliers.index)
print(df.describe())
# Sanity check: the remaining count should be 0.
print(f"Đơn hàng Profit < {PROFIT_OUTLIER_THRESHOLD} còn lại:", df[df['Profit'] < PROFIT_OUTLIER_THRESHOLD].shape[0])

# Export the cleaned data and download it to the local machine
from google.colab import files

clean_csv_name = 'Sample - Superstore_clean.csv'
# index=True and encoding='utf-8' are the to_csv defaults, spelled out here:
# the row index becomes the first (unnamed) column of the file.
df.to_csv(clean_csv_name, index=True, encoding='utf-8')
files.download(clean_csv_name)

Saving Sample - Superstore.csv to Sample - Superstore (22).csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   ob

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Version 2. Predictive Analysis
import io

from google.colab import files
import pandas as pd

# files.upload() returns a dict mapping each uploaded file's name to its bytes.
# Colab saves a re-uploaded file under a de-duplicated name (e.g.
# "Sample - Superstore_clean (13).csv"), so a hard-coded path would not read the
# file that was just uploaded. Parse the uploaded bytes directly instead.
uploaded = files.upload()
if not uploaded:
    raise ValueError('No file was uploaded; expected the cleaned Superstore CSV.')
# Only the first uploaded file is used.
clean_name, clean_bytes = next(iter(uploaded.items()))
print('Loading uploaded file:', clean_name)

# The cleaned file is produced by DataFrame.to_csv in the Version 1 cell, which
# writes UTF-8; decoding it as latin1 would garble every non-ASCII character.
# The first column is the row index that to_csv wrote, hence index_col=0.
df = pd.read_csv(
    io.BytesIO(clean_bytes),
    encoding='utf-8',
    parse_dates=['Order Date', 'Ship Date'],
    index_col=0,
)

# Only the last expression of a cell is rendered, so print each summary
# explicitly (df.info() prints by itself).
print(df.shape)
df.info()
print(df.head())

# Choose the input variables (features) & the target variable (Profit)
target = 'Profit'
# Columns excluded from the feature matrix. The target is appended through the
# `target` variable so the two can never drift apart and leak Profit into x.
drop_cols = ['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Postal Code', 'State', 'City', 'Country', 'Customer Name', 'Customer ID', 'Product ID', 'Product Name', 'Sub-Category', target]
# errors='ignore' skips any listed column that is not present in df.
x = df.drop(columns=drop_cols, errors='ignore')
y = df[target]
# A bare expression in the middle of a cell is not displayed, so print the
# remaining feature names explicitly.
print(x.columns.tolist())

# Encode the categorical variables
## Check the number of distinct values in each object column
cat_cols = x.select_dtypes(include='object').columns.tolist()
print(cat_cols)
print(x[cat_cols].nunique().sort_values(ascending=False))
## One-hot encode (drop_first=True omits the first level of every column)
x_encoded = pd.get_dummies(x, columns=cat_cols, drop_first=True)
print(x_encoded.shape)
## Check that no NaN remain (per the original note: ML does not accept NaN).
## The counts used to be computed and discarded; now they are shown and enforced.
na_counts = x_encoded.isna().sum()
print(na_counts)
if na_counts.any():
    raise ValueError(
        'NaN values found in encoded features: '
        + ', '.join(na_counts[na_counts > 0].index.astype(str))
    )

# Split into Train/Test sets (80% / 20%, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size=0.2, random_state=42)
# A bare tuple in the middle of a cell is not displayed, so print the shapes.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Train with Linear Regression first
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# fit() returns the estimator itself, so construction and training are chained.
lr_model = LinearRegression().fit(x_train, y_train)
y_pred = lr_model.predict(x_test)

# Hold-out metrics; RMSE is the square root of the mean squared error.
MAE_lr = mean_absolute_error(y_test, y_pred)
RMSE_lr = np.sqrt(mean_squared_error(y_test, y_pred))
r2_lr = r2_score(y_test, y_pred)

for label, score in (
    ('MAE lr:', MAE_lr),
    ('RMSE lr:', RMSE_lr),
    ('R2 Score lr:', r2_lr),
):
    print(label, score)

# Next, train with Random Forest
from sklearn.ensemble import RandomForestRegressor

# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# Same hold-out metrics as for the linear model.
MAE_rf = mean_absolute_error(y_test, y_pred_rf)
RMSE_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

for label, score in (
    ('MAE rf:', MAE_rf),
    ('RMSE rf:', RMSE_rf),
    ('R2 Score rf:', r2_rf),
):
    print(label, score)

## Inspect the feature importances, largest first
feature_importances = pd.Series(
    rf_model.feature_importances_,
    index=x_train.columns,
).sort_values(ascending=False)
print('\n Top 10 Feature Importances:')
print(feature_importances.head(10))

# Save the static results (metrics, feature importances) so they can be visualised
# in the Power BI file created in the earlier Descriptive Analysis part
from google.colab import files

# One row per model, one column per metric.
metrics = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest'],
    'MAE': [MAE_lr, MAE_rf],
    'RMSE': [RMSE_lr, RMSE_rf],
    'R2 Score': [r2_lr, r2_rf],
})
metrics.to_csv('metrics_ver2.csv', index=False)

# Turn the importance Series into a two-column table (feature name, importance).
fi = feature_importances.reset_index().set_axis(['Feature', 'Importances'], axis=1)
fi.to_csv('feature_importances_ver2.csv', index=False)

# Download both result files, in the same order as they were written.
for result_csv in ('metrics_ver2.csv', 'feature_importances_ver2.csv'):
    files.download(result_csv)

Saving Sample - Superstore_clean.csv to Sample - Superstore_clean (13).csv
<class 'pandas.core.frame.DataFrame'>
Index: 9972 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9972 non-null   int64         
 1   Order ID       9972 non-null   object        
 2   Order Date     9972 non-null   datetime64[ns]
 3   Ship Date      9972 non-null   datetime64[ns]
 4   Ship Mode      9972 non-null   object        
 5   Customer ID    9972 non-null   object        
 6   Customer Name  9972 non-null   object        
 7   Segment        9972 non-null   object        
 8   Country        9972 non-null   object        
 9   City           9972 non-null   object        
 10  State          9972 non-null   object        
 11  Postal Code    9972 non-null   int64         
 12  Region         9972 non-null   object        
 13  Product ID     9972 non-null   object        
 14  Ca

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>