In [130]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris,fetch_20newsgroups,\
load_diabetes,load_boston,load_digits
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

## 1、练习特征预处理中的PCA

In [4]:
data = [[1,2,3],[4,5,6],[6,7,8],[1,8,9]]
pca = PCA(n_components=3) # 整数可以指定降到的特征数量
res = pca.fit_transform(data)
print(res)

[[ 5.32302226e+00  4.06735761e-01  1.11238986e-16]
 [ 3.70981892e-01 -1.16720711e+00  2.46635974e-17]
 [-2.93037836e+00 -2.21650236e+00  1.23606899e-16]
 [-2.76362580e+00  2.97697370e+00  8.65031610e-17]]


In [6]:
data = [[1,2,3],[4,5,6],[6,7,8],[1,8,9]]
pca = PCA(n_components=0.9)  # 小数是指定主成分所占比例，由pca类自动决定特征数量
res = pca.fit_transform(data)
print(res)

[[ 5.32302226  0.40673576]
 [ 0.37098189 -1.16720711]
 [-2.93037836 -2.21650236]
 [-2.7636258   2.9769737 ]]


## 2、完成对数据集的data，target，DESCR，feature_names，target_names的理解
* 鸢尾花数据集
* 数字数据集
* 新闻分类数据集
* 波士顿房价-回归数据集
* 糖尿病-回归数据集

In [17]:
li = load_iris()
print(type(li.data))
print(li.data.shape) # 数据集形状
print(li.target.shape) # 目标值形状
print('-'*30)
print(li.feature_names) # 特征值名称
print(li.target_names)  # 目标值名称
print('-' * 30)
print(li.DESCR) # 数据集介绍

<class 'numpy.ndarray'>
(150, 4)
(150,)
------------------------------
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']
------------------------------
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1

In [23]:
# 划分鸢尾花的训练集和测试集
# 选择一个随机种子，保证每次随机划分数据集的策略是一致的
x_train, x_test, y_train,y_test = train_test_split(
    li.data,li.target,test_size=0.25,random_state=1)
print(x_train.shape)  # 训练集形状
print(y_train.shape)  # 训练目标形状
print(x_test.shape)  # 测试集形状
print(y_test.shape)  # 测试标签形状

(112, 4)
(112,)
(38, 4)
(38,)


In [27]:
# 数字数据集
dig = load_digits()
print(dig.data.shape)
print(dig.target.shape)
print(dig.feature_names)
print(dig.target_names)

(1797, 64)
(1797,)
['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']
[0 1 2 3 4 5 6 7 8 9]


In [51]:
# 对数字数据集进行训练集和测试集的划分
x_train, x_test, y_tain,y_test = train_test_split(
    dig.data,dig.target,test_size=0.2,random_state=2)

In [52]:
print(x_train.shape)
print(y_tain.shape)
print(x_test.shape)
print(y_test.shape)

(1437, 64)
(1437,)
(360, 64)
(360,)


In [37]:
# 20新闻分类数据集
# 指定数据集的位置，没有就会下载到此，有则直接加载
news = fetch_20newsgroups(subset='all',data_home='../data')
print(news.data[1])  # 查看第一条数据
print(type(news.data[1]))  # 字符串格式
print('-'*30 )
print(news.target.shape) # 共有18846条数数据
print(news.target_names)
print('-'*30 )
print(news.DESCR)  # 数据集介绍，共20个类别

From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)
Subject: Which high-performance VLB video card?
Summary: Seek recommendations for VLB video card
Nntp-Posting-Host: midway.ecn.uoknor.edu
Organization: Engineering Computer Network, University of Oklahoma, Norman, OK, USA
Keywords: orchid, stealth, vlb
Lines: 21

  My brother is in the market for a high-performance video card that supports
VESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:

  - Diamond Stealth Pro Local Bus

  - Orchid Farenheit 1280

  - ATI Graphics Ultra Pro

  - Any other high-performance VLB card


Please post or email.  Thank you!

  - Matt

-- 
    |  Matthew B. Lawson <------------> (mblawson@essex.ecn.uoknor.edu)  |   
  --+-- "Now I, Nebuchadnezzar, praise and exalt and glorify the King  --+-- 
    |   of heaven, because everything he does is right and all his ways  |   
    |   are just." - Nebuchadnezzar, king of Babylon, 562 B.C.           |   

<class 'str'>
----------------------

In [42]:
# 回归数据集，波士顿房价
# 回归数据集的目标值是数值，没有目标名称
boston = load_boston()
print(boston.data.shape)
print(boston.target.shape)
print(boston.feature_names)
print('-'*30)
print(boston.DESCR)

(506, 13)
(506,)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
------------------------------
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
       


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [50]:
# 回归数据集，糖尿病
dia = load_diabetes()
print(dia.data.shape)
print(dia.target.shape)
print(dia.feature_names)
print('-'*30)
print(dia.DESCR)

(442, 10)
(442,)
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
------------------------------
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5  

## 3、完成对facebook数据的k近邻的机器学习，并通过网格搜索实现最近neighbors

In [54]:
# 加载数据集
data = pd.read_csv("../data/FBlocation/train.csv")

In [58]:
# 查看数据大致情况
print(data.head())
print(data.shape)

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
(29118021, 6)


In [59]:
# 由于数据量太大，所以适当缩小数据的范围，加快计算
data = data.query("x>2.0 & x<2.5 & y > 2.0 & y < 2.5")

In [65]:
print(data.shape)
# 由于time是字符串形式，不能从中看出特征
# 所以把time转换成年月日的形式，方便看出特征
time = pd.to_datetime(data.time,unit='s')
print(time)
# 把时间转换成字典格式
time = pd.DatetimeIndex(time)

(71664, 6)
163        1970-01-08 18:02:17
310        1970-01-03 17:11:59
658        1970-01-06 19:32:23
1368       1970-01-04 16:50:22
1627       1970-01-07 21:18:04
                   ...        
29116142   1970-01-03 12:31:26
29116267   1970-01-04 15:19:20
29116295   1970-01-01 20:49:14
29116475   1970-01-03 09:17:37
29117203   1970-01-02 20:34:43
Name: time, Length: 71664, dtype: datetime64[ns]


In [64]:
print(time)

DatetimeIndex(['1970-01-08 18:02:17', '1970-01-03 17:11:59',
               '1970-01-06 19:32:23', '1970-01-04 16:50:22',
               '1970-01-07 21:18:04', '1970-01-02 03:14:59',
               '1970-01-07 03:45:16', '1970-01-05 03:28:43',
               '1970-01-01 18:59:03', '1970-01-09 07:50:12',
               ...
               '1970-01-09 20:03:34', '1970-01-08 09:26:50',
               '1970-01-07 04:45:59', '1970-01-07 22:36:18',
               '1970-01-06 23:29:43', '1970-01-03 12:31:26',
               '1970-01-04 15:19:20', '1970-01-01 20:49:14',
               '1970-01-03 09:17:37', '1970-01-02 20:34:43'],
              dtype='datetime64[ns]', name='time', length=71664, freq=None)


In [66]:
# 在原数据中增加天数，小时和周数，因为年和月都是一样的，不用管
data['day'] = time.day
data['hour'] = time.hour
data['weekday'] = time.weekday
print(data.head())

      row_id       x       y  accuracy    time    place_id  day  hour  weekday
163      163  2.1663  2.3755        84  669737  3869813743    8    18        3
310      310  2.3695  2.2034         3  234719  2636621520    3    17        5
658      658  2.3236  2.1768        66  502343  7877745055    6    19        1
1368    1368  2.2613  2.3392        73  319822  9775192577    4    16        6
1627    1627  2.3331  2.0011        66  595084  6731326909    7    21        2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['day'] = time.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['hour'] = time.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weekday'] = time.weekday


In [70]:
# 删除原来的time列
data = data.drop(['time'],axis=1)
print(data.head())

      row_id       x       y  accuracy    place_id  day  hour  weekday
163      163  2.1663  2.3755        84  3869813743    8    18        3
310      310  2.3695  2.2034         3  2636621520    3    17        5
658      658  2.3236  2.1768        66  7877745055    6    19        1
1368    1368  2.2613  2.3392        73  9775192577    4    16        6
1627    1627  2.3331  2.0011        66  6731326909    7    21        2


In [72]:
# 按地点分组，并统计每个地点去了多少次
place_count = data.groupby('place_id').count()
print(place_count)

            row_id    x    y  accuracy  day  hour  weekday
place_id                                                  
1006234733       1    1    1         1    1     1        1
1008823061       4    4    4         4    4     4        4
1012580558       3    3    3         3    3     3        3
1025585791      21   21   21        21   21    21       21
1026507711     220  220  220       220  220   220      220
...            ...  ...  ...       ...  ...   ...      ...
9986101718       1    1    1         1    1     1        1
9993141712       1    1    1         1    1     1        1
9995108787      23   23   23        23   23    23       23
9998968845      99   99   99        99   99    99       99
9999851158       3    3    3         3    3     3        3

[2524 rows x 7 columns]


In [73]:
# 删除那些去较少的地点,留下去过5次以上的地点
place_count = place_count[place_count['row_id']>5]
data = data[data['place_id'].isin(place_count.index)]

In [75]:
print(data.shape)
print(data.head())

(68536, 8)
      row_id       x       y  accuracy    place_id  day  hour  weekday
163      163  2.1663  2.3755        84  3869813743    8    18        3
310      310  2.3695  2.2034         3  2636621520    3    17        5
658      658  2.3236  2.1768        66  7877745055    6    19        1
1368    1368  2.2613  2.3392        73  9775192577    4    16        6
1627    1627  2.3331  2.0011        66  6731326909    7    21        2


In [76]:
# 确定特征值和目标值
# 目标值是预测要去的地方，所有place_id是目标值，其他都是特征值
x = data.drop(['place_id'],axis=1)
x = x.drop(['row_id'],axis=1)

In [77]:
y = data.place_id

In [80]:
print(x.shape)
print(x.head())
print('-' * 30)
print(y.shape)
print(y.head())

(68536, 6)
           x       y  accuracy  day  hour  weekday
163   2.1663  2.3755        84    8    18        3
310   2.3695  2.2034         3    3    17        5
658   2.3236  2.1768        66    6    19        1
1368  2.2613  2.3392        73    4    16        6
1627  2.3331  2.0011        66    7    21        2
------------------------------
(68536,)
163     3869813743
310     2636621520
658     7877745055
1368    9775192577
1627    6731326909
Name: place_id, dtype: int64


In [81]:
# 进行数据集划分
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=2)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(51402, 6)
(51402,)
(17134, 6)
(17134,)


In [82]:
# 特征工程，进行数据标准化
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
x_test = stand.transform(x_test)

In [105]:
# 实例化k近邻分类器
knn = KNeighborsClassifier()
# 设置超参数
param = {"n_neighbors": [2, 4, 5, 8, 15]}
knn = GridSearchCV(knn,param_grid=param,cv=5)

In [106]:
# 开始训练
knn.fit(x_train,y_train)



In [107]:
# 模型评估
score = knn.score(x_test,y_test)
print(f'准确率为：{score}')

准确率为：0.3739348663476129


In [108]:
# 查看交叉验证结果
print(knn.best_score_)
print('-'*30)
print(knn.best_estimator_)

0.35854644603376634
------------------------------
KNeighborsClassifier()


## 4、完成对20类新闻数据集，通过贝叶斯分析，实现分类预测，理解贝叶斯公式计算方法，得出，精确率，召回率，F1-score,并理解计算某个分类的AUC值的操作

In [109]:
# 获取数据集，已下载本地了，可以直接从本地读取
news = fetch_20newsgroups(subset='all',data_home='../data')

In [116]:
# 查看数据情况
print(len(news.data))  # list类型，一共18846数据
print(news.target.shape)
print(news.target_names) # 目标值的名称
print(len(news.target_names))  # 共20个类别

18846
(18846,)
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20


In [117]:
# 进行训练集和测试集的划分
x_train,x_test,y_train,y_test = train_test_split(
    news.data,news.target,test_size=0.25,random_state=1)

In [119]:
# 对文本求Tf-idf，表示文本当中每个词的重要程度
tf_idf = TfidfVectorizer()
# 对训练集和测试集当中的词进行重要性统计
x_train = tf_idf.fit_transform(x_train)
x_test = tf_idf.transform(x_test)

In [125]:
# alpha是拉普拉斯平滑系数
mlt = MultinomialNB(alpha=1.0)

In [126]:
# 利用朴素贝叶斯算法开始训练
mlt.fit(x_train,y_train)

In [127]:
# 利用测试集进行模型评估
score = mlt.score(x_test,y_test)
print(f'准确率为：{score}')

准确率为：0.8518675721561969


In [128]:
# 预测文章的类别
y_predict = mlt.predict(x_test)
print(y_predict)

[16 19 18 ... 13  7 14]


In [129]:
# 计算每个类别的精确率和召回率
# 传入真实目标和预测目标
# 返回每个类别的精确率，召回率，f1-score和真实的样本有多少
target_rate=classification_report(
    y_test,y_predict,target_names=news.target_names)
print(target_rate)

                          precision    recall  f1-score   support

             alt.atheism       0.91      0.77      0.83       199
           comp.graphics       0.83      0.79      0.81       242
 comp.os.ms-windows.misc       0.89      0.83      0.86       263
comp.sys.ibm.pc.hardware       0.80      0.83      0.81       262
   comp.sys.mac.hardware       0.90      0.88      0.89       234
          comp.windows.x       0.92      0.85      0.88       230
            misc.forsale       0.96      0.67      0.79       257
               rec.autos       0.90      0.87      0.88       265
         rec.motorcycles       0.90      0.95      0.92       251
      rec.sport.baseball       0.89      0.96      0.93       226
        rec.sport.hockey       0.95      0.98      0.96       262
               sci.crypt       0.76      0.97      0.85       257
         sci.electronics       0.84      0.80      0.82       229
                 sci.med       0.97      0.86      0.91       249
         

In [131]:
# 计算第10类的AUC
y_test1 = np.where(y_test == 10, 1, 0)
y_predict1 = np.where(y_predict == 10, 1, 0)
print(f'AUC:{roc_auc_score(y_test1,y_predict1)}')

AUC:0.9870889441633072
