In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 读取online_shoppers_intention.csv数据，完成数据预处理。
df = pd.read_csv('online_shoppers_intention.csv')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
# 数据预处理
# 1. 删除重复数据
df.drop_duplicates(inplace=True)
# 2. 删除缺失值
df.dropna(inplace=True)
# 3. 删除无关特征
df.drop(['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration'], axis=1, inplace=True)
# 4. 删除异常值
df = df[df['Revenue'] != True]

# 5. 将特征转换为数值型
df['Month'] = df['Month'].map({'Jan': 1, 'Feb': 2, 'Mar': 3, 'May': 5, 'June': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12})
df['VisitorType'] = df['VisitorType'].map({'Returning_Visitor': 1, 'New_Visitor': 0, 'Other': 0})
df['Weekend'] = df['Weekend'].map({True: 1, False: 0})
df['Revenue'] = df['Revenue'].map({True: 1, False: 0})

# 6. 将特征和标签分开
X = df.drop('Revenue', axis=1)
y = df['Revenue']

# 7. 将数据集划分为训练集和测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# 8. 标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [4]:
df.head()

Unnamed: 0,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0.2,0.2,0.0,0.0,2,1,1,1,1,1,0,0
1,0.0,0.1,0.0,0.0,2,2,2,1,2,1,0,0
2,0.2,0.2,0.0,0.0,2,4,1,9,3,1,0,0
3,0.05,0.14,0.0,0.0,2,3,2,2,4,1,0,0
4,0.02,0.05,0.0,0.0,2,3,3,1,4,1,1,0


In [5]:
# 去除revenue字段
df.drop('Revenue', axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0.2,0.2,0.0,0.0,2,1,1,1,1,1,0
1,0.0,0.1,0.0,0.0,2,2,2,1,2,1,0
2,0.2,0.2,0.0,0.0,2,4,1,9,3,1,0
3,0.05,0.14,0.0,0.0,2,3,2,2,4,1,0
4,0.02,0.05,0.0,0.0,2,3,3,1,4,1,1


In [7]:
# 查看其中有没有缺失值
df.isnull().sum()
#查看其中有没有nan
df.isna().sum()

BounceRates         0
ExitRates           0
PageValues          0
SpecialDay          0
Month               0
OperatingSystems    0
Browser             0
Region              0
TrafficType         0
VisitorType         0
Weekend             0
dtype: int64

In [8]:

# 筛选有价值字段
# 卡方检验
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# 选择K个最好的特征，返回选择特征后的数据
X_chi2 = SelectKBest(chi2, k=5).fit_transform(df, y)
X_chi2.shape

# 查看选择的字段
X_chi2


array([[ 1.,  1.,  1.,  1.,  0.],
       [ 2.,  1.,  2.,  1.,  0.],
       [ 1.,  9.,  3.,  1.,  0.],
       ...,
       [ 2.,  1., 13.,  1.,  1.],
       [ 2.,  3., 11.,  1.,  0.],
       [ 2.,  1.,  2.,  0.,  1.]])

In [None]:
#去除掉没有被选择的字段
df.drop(['OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType'], axis=1, inplace=True)

# 查看数据
df.head()

In [11]:
# 采用两种方法进行聚类
# 1. KMeans
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2)
kmeans.fit(df)
y_pred = kmeans.predict(df)


: 

: 

In [None]:
# 2. DBSCAN
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(df)
y_pred = dbscan.labels_