# 支持向量机预测黑色星期五花销

In [1]:
#导入相关包
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

In [2]:
# 导入数据
data=pd.read_csv("./data/BlackFriday.csv")
print("原始数据信息：")
data.info() 


原始数据信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     50000 non-null  int64  
 1   Product_ID                  50000 non-null  object 
 2   Gender                      50000 non-null  object 
 3   Age                         50000 non-null  object 
 4   Occupation                  50000 non-null  int64  
 5   City_Category               50000 non-null  object 
 6   Stay_In_Current_City_Years  50000 non-null  object 
 7   Marital_Status              50000 non-null  int64  
 8   Product_Category_1          50000 non-null  int64  
 9   Product_Category_2          34279 non-null  float64
 10  Product_Category_3          15183 non-null  float64
 11  Purchase                    50000 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 4.6+ MB


In [3]:
# 处理缺失数据

#婚姻状态只有3个缺失值，将其改为未婚状态
data['Marital_Status'].fillna(0, inplace = True)
print("\n填补婚姻状态的三个空值行")
data.info() 

#Product_Category_2 和Product_Category_3缺失信息过多，User_ID和Product_ID 对训练无用将其删去
data.drop(["Product_Category_2","Product_Category_3","User_ID","Product_ID"],axis=1,inplace=True)
print("\n删除无用列")
data.info() 


填补婚姻状态的三个空值行
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     50000 non-null  int64  
 1   Product_ID                  50000 non-null  object 
 2   Gender                      50000 non-null  object 
 3   Age                         50000 non-null  object 
 4   Occupation                  50000 non-null  int64  
 5   City_Category               50000 non-null  object 
 6   Stay_In_Current_City_Years  50000 non-null  object 
 7   Marital_Status              50000 non-null  int64  
 8   Product_Category_1          50000 non-null  int64  
 9   Product_Category_2          34279 non-null  float64
 10  Product_Category_3          15183 non-null  float64
 11  Purchase                    50000 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 4.6+ MB

删除无用列
<class 'pa

In [4]:
# 特征工程
data.describe(include="all")

# Occupation、Marital_Status和Product_Category_1 应该为类别数据
toObjFields=["Occupation","Marital_Status","Product_Category_1"]
data[toObjFields]=data[toObjFields].astype("object")
print("修改了Occupation、Marital_Status和Product_Category_1为obj类型")
data.info() 

# Stay_In_Current_City_Years 应该为数值类型数据
data["Stay_In_Current_City_Years"].replace("4+",4,inplace=True)
data["Stay_In_Current_City_Years"]=data["Stay_In_Current_City_Years"].astype("int")
print("\n修改了Stay_In_Current_City_Years为数值型")
data["Stay_In_Current_City_Years"].value_counts()

# 构造哑变量 
data=pd.get_dummies(data,drop_first=True)
# 这里采用独热码，但是对于独热码来说，一般都存在一个线性的推导关系，比如一个人的性别非女即男，也就是不需要两个列，只需要一个即可
print("\n用独热码构造哑变量")
data.head()

修改了Occupation、Marital_Status和Product_Category_1为obj类型
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Gender                      50000 non-null  object
 1   Age                         50000 non-null  object
 2   Occupation                  50000 non-null  object
 3   City_Category               50000 non-null  object
 4   Stay_In_Current_City_Years  50000 non-null  object
 5   Marital_Status              50000 non-null  object
 6   Product_Category_1          50000 non-null  object
 7   Purchase                    50000 non-null  int64 
dtypes: int64(1), object(7)
memory usage: 3.1+ MB

修改了Stay_In_Current_City_Years为数值型

用独热码构造哑变量


Unnamed: 0,Stay_In_Current_City_Years,Purchase,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,Occupation_1,...,Product_Category_1_9,Product_Category_1_10,Product_Category_1_11,Product_Category_1_12,Product_Category_1_13,Product_Category_1_14,Product_Category_1_15,Product_Category_1_16,Product_Category_1_17,Product_Category_1_18
0,2,8370,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,15200,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1422,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2,1057,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,4,7969,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# 分离自变量与因变量
Y=data["Purchase"]
Y.head()
X=data.drop(["Purchase"],axis=1)
X.head()

Unnamed: 0,Stay_In_Current_City_Years,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,Occupation_1,Occupation_2,...,Product_Category_1_9,Product_Category_1_10,Product_Category_1_11,Product_Category_1_12,Product_Category_1_13,Product_Category_1_14,Product_Category_1_15,Product_Category_1_16,Product_Category_1_17,Product_Category_1_18
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,4,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# 分离训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2)

In [7]:
# 特征缩放
scX=StandardScaler()
scY=StandardScaler()
x_train=scX.fit_transform(x_train)
x_test=scX.transform(x_test)
y_train=np.ravel(scY.fit_transform(y_train.values.reshape(-1,1)))
y_test=np.ravel(scY.fit_transform(y_test.values.reshape(-1,1)))
print("x_train",x_train.shape)
print("x_test",x_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)

x_train (40000, 48)
x_test (10000, 48)
y_train (40000,)
y_test (10000,)


## 构造支持向量机
```python
sklearn.svm.SVR（kernel ='rbf'，degree = 3，gamma ='auto_deprecated'，coef0 = 0.0，tol = 0.001，C = 1.0，epsilon = 0.1，shrinking = True，cache_size = 200，verbose = False，max_iter = -1 ）
```
[api参数详解](https://blog.csdn.net/qq_24852439/article/details/85305317)
> kernel ： string，optional（default ='rbf'） 指定要在算法中使用的内核类型。它必须是'linear'，'poly'，'rbf'，'sigmoid'，'precomputed'或者callable之一。如果没有给出，将使用'rbf'。如果给出了callable，则它用于预先计算内核矩阵。
> 
> degree： int，可选（默认= 3）多项式核函数的次数（'poly'）。被所有其他内核忽略。
> 
> gamma ： float，optional（默认='auto'）'rbf'，'poly'和'sigmoid'的核系数。当前默认值为'auto'，它使用1 / n_features，如果gamma='scale'传递，则使用1 /（n_features * X.std（））作为gamma的值。当前默认的gamma''auto'将在版本0.22中更改为'scale'。'auto_deprecated'，'auto'的弃用版本用作默认值，表示没有传递明确的gamma值。
> 
> coef0 ： float，optional（默认值= 0.0）核函数中的独立项。它只在'poly'和'sigmoid'中很重要。
> 
> tol ： float，optional（默认值= 1e-3）容忍停止标准。
> 
> C ： float，可选（默认= 1.0） 错误术语的惩罚参数C.
> 
> epsilon ： float，optional（默认值= 0.1） Epsilon在epsilon-SVR模型中。它指定了epsilon-tube，其中训练损失函数中没有惩罚与在实际值的距离epsilon内预测的点。
> 
> shrinking ：收缩  布尔值，可选（默认= True）是否使用收缩启发式。
> 
> cache_size ： float，可选指定内核缓存的大小（以MB为单位）。
> 
> verbose ： 详细说明 bool，默认值：False 启用详细输出。请注意，此设置利用libsvm中的每进程运行时设置，如果启用，则可能无法在多线程上下文中正常运行。
> 
> max_iter ： int，optional（默认值= -1）求解器内迭代的硬限制，或无限制的-1

In [11]:
### 模型1 
from sklearn.svm import SVR
reg = SVR(kernel = 'rbf', gamma='scale', C=1.0, epsilon=0.1, verbose=True)
reg.fit(x_train, y_train)


[LibSVM]

SVR(verbose=True)

In [12]:
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
r2_score(y_test,y_pre)

0.6210842977410912

In [13]:
### 模型2
from sklearn.svm import SVR
reg = SVR(kernel = 'poly', gamma='scale', C=1.0, epsilon=0.1,)
reg.fit(x_train, y_train)


SVR(kernel='poly')

In [14]:
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
r2_score(y_test,y_pre)

0.6067921021586556

In [None]:
### 模型3
from sklearn.svm import SVR
reg = SVR(kernel = 'linear', gamma='scale', C=1.0, epsilon=0.1,)
reg.fit(x_train, y_train)

In [None]:
# 对测试集生成预测结果
y_pre=reg.predict(x_test)
# 进行r2评分
r2_score(y_test,y_pre)