In [1]:
import pandas as pd
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Perceptron, LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB

warnings.filterwarnings('ignore')

In [2]:
path = '../../data/Perceptron/Analysis-Portfolio-Task-Data.csv'

df = pd.read_csv(path)
df.head()

Unnamed: 0,CustomerID,Shopping Basket,Gender,Age,Store Type,Value Products,Brand Products,Top Fresco Products
0,20358063,48.81,Male,26,Convenient Stores,8,2,1
1,24635139,33.44,Female,33,Superstore,6,5,1
2,27584479,131.57,Male,56,Online,35,8,12
3,28008212,20.02,Male,27,Convenient Stores,0,1,1
4,29130973,95.54,Female,55,Online,38,18,20


Chuẩn hóa data

In [3]:
df1 = df.drop(columns=['CustomerID'])

df1['Gender'] = df1['Gender'].map({'Male': 1, 'Female': 0})

df1

Unnamed: 0,Shopping Basket,Gender,Age,Store Type,Value Products,Brand Products,Top Fresco Products
0,48.81,1,26,Convenient Stores,8,2,1
1,33.44,0,33,Superstore,6,5,1
2,131.57,1,56,Online,35,8,12
3,20.02,1,27,Convenient Stores,0,1,1
4,95.54,0,55,Online,38,18,20
...,...,...,...,...,...,...,...
70,75.23,0,53,Superstore,9,14,5
71,97.21,0,43,Superstore,18,9,10
72,77.65,0,46,Superstore,10,10,9
73,128.67,0,48,Superstore,41,15,19


In [4]:
ohe = OneHotEncoder(sparse_output=False)

encoded_data = ohe.fit_transform(df1[['Store Type']])

encoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out(['Store Type']))

df2 = pd.concat([df1, encoded_df], axis=1)
df2.drop('Store Type', axis=1, inplace=True)

df2.head()

Unnamed: 0,Shopping Basket,Gender,Age,Value Products,Brand Products,Top Fresco Products,Store Type_Convenient Stores,Store Type_Online,Store Type_Superstore
0,48.81,1,26,8,2,1,1.0,0.0,0.0
1,33.44,0,33,6,5,1,0.0,0.0,1.0
2,131.57,1,56,35,8,12,0.0,1.0,0.0
3,20.02,1,27,0,1,1,1.0,0.0,0.0
4,95.54,0,55,38,18,20,0.0,1.0,0.0


In [5]:
X = df2.drop('Shopping Basket', axis=1)
y = df2['Shopping Basket']

Chia train, test. 60 dữ liệu đầu là tập train còn lại là tập test

In [7]:
X_train = X[:60]
y_train = y[:60]

X_test = X[60:]
y_test = y[60:]

Linear Regression

In [8]:
linR = LinearRegression()

linR.fit(X_train, y_train)

y_pred = linR.predict(X_test)

print('MSE:', mean_squared_error(y_test, y_pred))

MSE: 307.26306108613477


Hệ số của Hồi quy tuyến tính

In [9]:
print('Weights:', linR.coef_)
print('Bias:', linR.intercept_)

Weights: [-2.98339794  0.11344949  0.96418484  0.98721574  2.42324084 -8.06933149
  9.63391355 -1.56458206]
Bias: 16.48606819782274


Coi các bản ghi có Shopping Basket > 50 là thuộc lớp 1 và còn lại là class 0

In [10]:
df3 = df2.copy()

df3['Shopping Basket'] = df3['Shopping Basket'].apply(lambda x: 1 if x > 50 else 0)
df3.head()

Unnamed: 0,Shopping Basket,Gender,Age,Value Products,Brand Products,Top Fresco Products,Store Type_Convenient Stores,Store Type_Online,Store Type_Superstore
0,0,1,26,8,2,1,1.0,0.0,0.0
1,0,0,33,6,5,1,0.0,0.0,1.0
2,1,1,56,35,8,12,0.0,1.0,0.0
3,0,1,27,0,1,1,1.0,0.0,0.0
4,1,0,55,38,18,20,0.0,1.0,0.0


In [11]:
X_n = df3.drop('Shopping Basket', axis=1)
y_n = df3['Shopping Basket']

X_train_n = X_n[:60]
y_train_n = y_n[:60]

X_test_n = X_n[60:]
y_test_n = y_n[60:]

Khởi tạo Perceptron dùng sklearn

In [12]:
perceptron = Perceptron(max_iter=6000, eta0=0.01, tol=1e-6)
logR = LogisticRegression()
nb = GaussianNB()

In [18]:
perceptron.fit(X_train_n, y_train_n)
perceptron.score(X_test_n, y_test_n)

0.6

In [14]:
print('Weights:', perceptron.coef_)
print('Bias:', perceptron.intercept_)

Weights: [[-0.18 -0.89  0.88  1.47  0.23 -0.87  0.35 -0.14]]
Bias: [-0.66]


In [19]:
logR.fit(X_train_n, y_train_n)
logR.score(X_test_n, y_test_n)

1.0

In [16]:
print('Weights:', logR.coef_)
print('Bias:', logR.intercept_)

Weights: [[ 0.26228639  0.10098251  0.34728351  0.56089289 -0.08545311 -0.7031885
   0.43840208  0.26390166]]
Bias: [-11.0415277]


In [20]:
nb.fit(X_train_n, y_train_n)
nb.score(X_test_n, y_test_n)

0.8666666666666667

Độ chính xác của Logistic Regression là cao nhất (1.0) tiếp đến là Naive Bayes (0.866666667) và cuối cùng là Perceptron (0.6)

Perceptron có độ chính xác thấp vì dữ liệu không có dạng tuyến tính.