# 一、转换器

In [7]:
from sklearn.preprocessing import StandardScaler

In [2]:
transfer = StandardScaler()
a = [[1,2,3],[4,5,6]]

In [4]:
transfer.fit_transform(a)

array([[-1., -1., -1.],
       [ 1.,  1.,  1.]])

In [5]:
transfer.fit(a)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
transfer.transform(a)

array([[-1., -1., -1.],
       [ 1.,  1.,  1.]])

# 二、K-近邻算法 - 鸢尾花种类预测

In [28]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [29]:
iris = load_iris()

In [30]:
# 划分数据集
x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.3,random_state=8)

In [31]:
# 标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [32]:
# 训练模型
estimator = KNeighborsClassifier()
estimator.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [33]:
# 模型评估
# 方法一 比对真实值与预测值
y_predict = estimator.predict(x_test)
y_test == y_predict

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True])

In [34]:
# 模型评估
# 方法二 计算准确率
estimator.score(x_test,y_test)

0.9333333333333333

# 三、鸢尾花案例增加K值调优

In [36]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 加载数据
iris = load_iris()

# 划分数据集
x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.3,random_state=8)

# 标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 指定算法及模型选择与调优——网格搜索和交叉验证
estimator = KNeighborsClassifier()
param_dict = {"n_neighbors": [1, 3, 5]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)

# 训练模型
estimator.fit(x_train,y_train)

# 模型评估
# 方法一 比对真实值与预测值
y_predict = estimator.predict(x_test)
y_test == y_predict
# 方法二 计算准确率
estimator.score(x_test,y_test)

# 然后进行评估查看最终选择的结果和交叉验证的结果
print("在交叉验证中验证的最好结果：\n", estimator.best_score_)
print("最好的参数模型：\n", estimator.best_estimator_)
print("每次交叉验证后的准确率结果：\n", estimator.cv_results_)

在交叉验证中验证的最好结果：
 0.9714285714285714
最好的参数模型：
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
每次交叉验证后的准确率结果：
 {'mean_fit_time': array([0.00033243, 0.00033307, 0.00033299]), 'std_fit_time': array([0.00047013, 0.00047103, 0.00047092]), 'mean_score_time': array([0.00066598, 0.00133244, 0.00066622]), 'std_score_time': array([0.00047092, 0.00047143, 0.00047109]), 'param_n_neighbors': masked_array(data=[1, 3, 5],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 3}, {'n_neighbors': 5}], 'split0_test_score': array([0.97222222, 0.91666667, 0.91666667]), 'split1_test_score': array([1., 1., 1.]), 'split2_test_score': array([0.94117647, 0.91176471, 0.94117647]), 'mean_test_score': array([0.97142857, 0.94285714, 0.95238095]), 'std_test_score': array([0.02384684, 0.04045559, 0.03512587]), 'rank_test

# 四、案例：预测facebook签到位置

In [12]:
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
#读取数据
facebook=pd.read_csv("./data/FBlocation/train.csv")
facebook.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [None]:
# 数据预处理
# 1> 缩小数据集范围
facebook = facebook.query("x<1.5&x>1.25&y>2.25&y<2.5")
# 2> 时间特征提取
time_value = pd.to_datetime(facebook['time'],unit='s')
time_value = pd.DatetimeIndex(time_value)
facebook['day'] = time_value.day
facebook['hour'] = time_value.hour
facebook['weekday'] = time_value.weekday
# 3> 删除签到数少于n的位置
place_count = facebook.groupby(['place_id']).count()
place_count = place_count.query('row_id>3')
facebook = facebook[facebook['place_id'].isin(place_count.index)]

In [9]:
# 数据集划分
# 1> 拿取有用的特征数据
x=facebook[['x','y','accuracy','day','hour','weekday']]
# 2> 拿取目标值数据
y=facebook['place_id']
# 3> 数据集划分
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=8)

In [13]:
# 特征工程：标准化
# 1> 创建转换器
transfer = StandardScaler()
# 2> 计算并标准化训练集数据
x_train = transfer.fit_transform(x_train)
# 3> 计算并标准化测试集数据
x_test = transfer.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys


In [None]:
# 模型训练及参数优化
# 1> 实例化一个K-近邻估计器
estimator = KNeighborsClassifier()
# 2> 运用网络搜索参数优化KNN算法
param_dict = {"n_neighbors":[3,5,7,9]}  # K-近邻中分别选取这几个 K 值，最终经过交叉验证会返回各个取值的结果和最好的结果
estimator = GridSearchCV(estimator,param_grid=param_dict,cv=5)  # 返回优化后的估计器
# 3> 传入训练集，进行机器学习
estimator.fit(x_train,y_train)

In [None]:
# 模型评估
# 方法一：比较真实值与预测值
y_predict=estimator.predict(x_test)
print("预测值为:\n",y_predict)
print("比较真实值与预测值结果为:\n",y_predict==y_test)
# 方法二：计算模型准确率
print("模型准确率为:\n",estimator.score(x_test,y_test))
print("在交叉验证中最的结果:\n",estimator.best_score_)
print("最好的参数模型:\n",estimator.best_estimator_)
print("每次交叉验证后的结果准确率为/n",estimator.cv_results_)

# x、案例-朴素贝叶斯-20类新闻分类

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


# 获取数据
news = fetch_20newsgroups()

# 划分数据集
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3)

# 特征抽取 Tfidf
# 实例化一个转换器
transfer = TfidfVectorizer()
x_train = transfer.fit_transform(x_train)
# 必须使用transfrom因为要让测试数据和训练数据的特征值是一样的。
x_test = transfer.transform(x_test)

# 模型训练
# 实例化一个估计器
estimator = MultinomialNB()
estimator.fit(x_train, y_train)

#  模型评估
# 方法一：比较真实值与预测值
y_predict = estimator.predict(x_test)
print('预测值为:\n', y_predict)
print('比较真实值与预测值结果为:\n', y_predict==y_test)

# 方法二：计算模型准确率
print('模型准确率为:\n', estimator.score(x_test, y_test))

# 案例 - 决策树 - 决策树用于iris分类演示

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [7]:
# 获取数据集
iris=load_iris()

In [8]:
# 分割数据集
x_train,x_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.3,random_state=8)

In [9]:
# 特征工程：标准化
transfer=StandardScaler()
x_train=transfer.fit_transform(x_train)
x_test=transfer.transform(x_test)

In [10]:
# 模型训练
# 1> 实例化一个估计器
estimator=DecisionTreeClassifier(criterion='entropy',max_depth=3)
# 2> 传入训练数据集，进行机器学习
estimator.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [11]:
# 模型评估
# 方法1，比较真实值与预测值
y_predict=estimator.predict(x_test)
print("预测值为:\n",y_predict)
print("比较真实值与预测值结果为:\n",y_predict==y_test)
# 方法2, 计算模型准确率
print("模型准确率为:\n",estimator.score(x_test,y_test))

预测值为:
 [0 0 0 2 1 0 0 2 2 1 1 0 1 1 1 2 2 2 2 1 1 0 1 1 1 0 2 0 0 2 0 0 0 2 1 1 1
 1 0 1 1 0 1 1 2]
比较真实值与预测值结果为:
 [ True  True  True  True  True  True  True  True  True  True  True  True
 False  True False  True  True  True False False  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True]
模型准确率为:
 0.8888888888888888


# 案例 - 决策树：泰坦尼克号乘客生存预测¶

In [12]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [13]:
# 获取数据
tanic=pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
tanic.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [14]:
# 数据预处理，填充缺失值
tanic['age'].fillna(tanic['age'].mean(),inplace=True)

In [15]:
#提取特征值，目标值
x=tanic[['pclass','age','sex']]
y=tanic['survived']

In [16]:
x.head()

Unnamed: 0,pclass,age,sex
0,1st,29.0,female
1,1st,2.0,female
2,1st,30.0,male
3,1st,25.0,female
4,1st,0.9167,male


In [17]:
y.head()

0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64

In [18]:
# 特征工程，字典特征提取
transfer=DictVectorizer(sparse=False)
x=transfer.fit_transform(x.to_dict(orient="records"))

In [19]:
# 数据集划分
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

In [20]:
# 模型训练
# 1> 实例化一个转换器
estimator=DecisionTreeClassifier()
# 2> 进行机器学习
estimator.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
# 模型评估
# 方法1，比较真实值与预测值
y_predict=estimator.predict(x_test)
print("预测值为:\n",y_predict)
print("比较真实值与预测值结果为:\n",y_predict==y_test)
# 方法2,计算模型准确率
print("模型准确率为:\n",estimator.score(x_test,y_test))

预测值为:
 [0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0
 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 1 0 1 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0]
比较真实值与预测值结果为:
 440      True
1153     True
237      True
121     False
612      True
753      True
987      True
901      True
705      True
99       True
217      True
623      True
68       T

In [22]:
# 查询提取的特征名称
transfer.get_feature_names()

['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']

In [23]:
# 查看提取的特征数据
x

array([[29.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 2.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [30.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       ...,
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ]])

# 案例 - 随机森林 - 随机森林预测tanic生存状况¶

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


In [25]:
# 1> 实例化一个估计器
estimator=RandomForestClassifier()

In [26]:
# 2> 网格搜索优化随机森林模型
param_dict={"n_estimators":[120,200,300,500,800,1200],"max_depth":[5,8,15,25,30]}
estimator=GridSearchCV(estimator,param_grid=param_dict,cv=5)

In [None]:
# 3> 传入训练集，进行模型训练
estimator.fit(x_train,y_train)

In [None]:
# 4> 模型评估
# 方法1，比较真实值与预测值
y_predict=estimator.predict(x_test)
print("预测值为:\n",y_predict)
print("比较真实值与预测值结果为:\n",y_predict==y_test)
# 方法2,计算模型准确率
print("模型准确率为:\n",estimator.score(x_test,y_test))
print("在交叉验证中最的结果:\n",estimator.best_score_)
print("最好的参数模型:\n",estimator.best_estimator_)
print("每次交叉验证后的结果准确率为/n",estimator.cv_results_)