In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
weather = pd.read_csv("./weatherAUS5000.csv", index_col=0)

In [3]:
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
0,2015-03-24,Adelaide,12.3,19.3,0.0,5.0,,S,39.0,S,...,19.0,59.0,47.0,1022.2,1021.4,,,15.1,17.7,No
1,2011-07-12,Adelaide,7.9,11.4,0.0,1.0,0.5,N,20.0,NNE,...,7.0,70.0,59.0,1028.7,1025.7,,,8.4,11.3,No
2,2010-02-08,Adelaide,24.0,38.1,0.0,23.4,13.0,SE,39.0,NNE,...,19.0,36.0,24.0,1018.0,1016.0,,,32.4,37.4,No
3,2016-09-19,Adelaide,6.7,16.4,0.4,,,N,31.0,N,...,15.0,65.0,40.0,1014.4,1010.0,,,11.2,15.9,No
4,2014-03-05,Adelaide,16.7,24.8,0.0,6.6,11.7,S,37.0,S,...,24.0,61.0,48.0,1019.3,1018.9,,,20.8,23.7,No


In [4]:
X = weather.iloc[:, :-1]

In [5]:
Y = weather.iloc[:, -1]

In [6]:
X.shape

(5000, 21)

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5000 non-null   object 
 1   Location       5000 non-null   object 
 2   MinTemp        4979 non-null   float64
 3   MaxTemp        4987 non-null   float64
 4   Rainfall       4950 non-null   float64
 5   Evaporation    2841 non-null   float64
 6   Sunshine       2571 non-null   float64
 7   WindGustDir    4669 non-null   object 
 8   WindGustSpeed  4669 non-null   float64
 9   WindDir9am     4651 non-null   object 
 10  WindDir3pm     4887 non-null   object 
 11  WindSpeed9am   4949 non-null   float64
 12  WindSpeed3pm   4919 non-null   float64
 13  Humidity9am    4936 non-null   float64
 14  Humidity3pm    4880 non-null   float64
 15  Pressure9am    4506 non-null   float64
 16  Pressure3pm    4504 non-null   float64
 17  Cloud9am       3111 non-null   float64
 18  Cloud3pm

In [8]:
X.isnull().mean()

Date             0.0000
Location         0.0000
MinTemp          0.0042
MaxTemp          0.0026
Rainfall         0.0100
Evaporation      0.4318
Sunshine         0.4858
WindGustDir      0.0662
WindGustSpeed    0.0662
WindDir9am       0.0698
WindDir3pm       0.0226
WindSpeed9am     0.0102
WindSpeed3pm     0.0162
Humidity9am      0.0128
Humidity3pm      0.0240
Pressure9am      0.0988
Pressure3pm      0.0992
Cloud9am         0.3778
Cloud3pm         0.3976
Temp9am          0.0066
Temp3pm          0.0176
dtype: float64

In [9]:
# 标签分类
np.unique(Y)

array(['No', 'Yes'], dtype=object)

In [10]:
# 分集，描述性统计
# 先分集，再数据预处理
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=420)

# 恢复索引
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])

In [11]:
# 样本不均衡
Ytrain.value_counts()

No     2704
Yes     796
Name: RainTomorrow, dtype: int64

In [12]:
Ytest.value_counts()

No     1157
Yes     343
Name: RainTomorrow, dtype: int64

In [13]:
# 将标签编码
from sklearn.preprocessing import LabelEncoder

# 用训练集训练，在训练集和测试机上分别 transform
encoder = LabelEncoder().fit(Ytrain)
Ytrain = pd.DataFrame(encoder.transform(Ytrain))
Ytest = pd.DataFrame(encoder.transform(Ytest))

In [14]:
# 探索特征矩阵

Xtrain.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,3486.0,12.225645,6.396243,-6.5,-1.715,1.8,4.1,7.7,12.0,16.7,20.9,25.9,29.0
MaxTemp,3489.0,23.245543,7.201839,-3.7,8.888,12.84,14.5,18.0,22.5,28.4,33.0,40.4,46.4
Rainfall,3467.0,2.487049,7.949686,0.0,0.0,0.0,0.0,0.0,0.0,0.8,6.6,41.272,115.8
Evaporation,1983.0,5.619163,4.383098,0.0,0.4,0.8,1.4,2.6,4.8,7.4,10.2,20.6,56.0
Sunshine,1790.0,7.508659,3.805841,0.0,0.0,0.345,1.4,4.6,8.3,10.6,12.0,13.3,13.9
WindGustSpeed,3263.0,39.858413,13.219607,9.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,76.0,117.0
WindSpeed9am,3466.0,14.046163,8.670472,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,37.0,65.0
WindSpeed3pm,3437.0,18.55339,8.611818,0.0,2.0,6.0,7.0,13.0,19.0,24.0,30.0,43.0,65.0
Humidity9am,3459.0,69.069095,18.787698,2.0,18.0,35.0,45.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,3408.0,51.651995,20.697872,2.0,9.0,17.0,23.0,37.0,52.0,66.0,79.0,98.0,100.0


In [15]:
Xtest.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,1493.0,11.916812,6.375377,-8.5,-2.024,1.6,3.7,7.3,11.8,16.5,20.48,25.316,28.3
MaxTemp,1498.0,22.906809,6.986043,-0.8,9.134,13.0,14.5,17.8,22.4,27.8,32.6,38.303,45.1
Rainfall,1483.0,2.241807,7.988822,0.0,0.0,0.0,0.0,0.0,0.0,0.8,5.2,35.372,108.2
Evaporation,858.0,5.657809,4.105762,0.0,0.4,1.0,1.6,2.8,4.8,7.6,10.4,19.458,38.8
Sunshine,781.0,7.677465,3.862294,0.0,0.0,0.3,1.5,4.7,8.6,10.7,12.2,13.4,13.9
WindGustSpeed,1406.0,40.044097,14.027052,9.0,15.0,20.0,24.0,30.0,39.0,48.0,57.0,78.0,122.0
WindSpeed9am,1483.0,13.986514,9.124337,0.0,0.0,0.0,4.0,7.0,13.0,20.0,26.0,39.36,72.0
WindSpeed3pm,1482.0,18.601215,8.850446,0.0,2.0,6.0,7.0,13.0,19.0,24.0,31.0,43.0,56.0
Humidity9am,1477.0,68.688558,18.876448,4.0,20.0,36.0,44.0,57.0,69.0,82.0,95.0,100.0,100.0
Humidity3pm,1472.0,51.431386,20.459957,2.0,8.71,18.0,23.0,37.0,52.0,66.0,78.0,96.29,100.0


In [16]:
# 处理困难特征 -- 日期

Xtrainc = Xtrain.copy()

Xtrainc.sort_values(by="Location")

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
2796,2015-03-24,Adelaide,12.3,19.3,0.0,5.0,,S,39.0,S,...,13.0,19.0,59.0,47.0,1022.2,1021.4,,,15.1,17.7
2975,2012-08-17,Adelaide,7.8,13.2,17.6,0.8,,SW,61.0,SW,...,20.0,28.0,76.0,47.0,1012.5,1014.7,,,8.3,12.5
775,2013-03-16,Adelaide,17.4,23.8,,,9.7,SSE,46.0,S,...,9.0,19.0,63.0,57.0,1019.9,1020.5,,,19.1,20.7
861,2011-07-12,Adelaide,7.9,11.4,0.0,1.0,0.5,N,20.0,NNE,...,7.0,7.0,70.0,59.0,1028.7,1025.7,,,8.4,11.3
2906,2015-08-24,Adelaide,9.2,14.3,0.0,,,SE,48.0,SE,...,17.0,19.0,64.0,42.0,1024.7,1024.1,,,9.9,13.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2223,2009-05-08,Woomera,9.2,20.6,0.0,5.2,10.4,ESE,37.0,SE,...,19.0,19.0,64.0,34.0,1030.5,1026.9,0.0,1.0,13.7,20.1
1984,2014-05-26,Woomera,15.5,23.6,0.0,24.0,,NNW,43.0,NNE,...,9.0,26.0,49.0,37.0,1014.2,1010.3,7.0,7.0,18.0,21.5
1592,2012-01-10,Woomera,16.8,26.7,0.0,10.0,5.3,SW,46.0,S,...,20.0,22.0,52.0,33.0,1019.1,1016.8,4.0,6.0,18.3,24.9
2824,2015-11-03,Woomera,16.2,28.5,7.8,4.2,4.5,WSW,80.0,NE,...,26.0,50.0,76.0,53.0,1009.6,1006.8,6.0,7.0,20.5,26.2


In [17]:
Xtrain.iloc[:,0].value_counts()

2015-10-12    6
2014-05-16    6
2015-07-03    6
2009-06-29    5
2014-06-16    5
             ..
2016-11-12    1
2011-03-07    1
2013-03-11    1
2014-11-01    1
2009-04-28    1
Name: Date, Length: 2141, dtype: int64

In [18]:
Xtrain["Rainfall"].head(20)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.2
8      0.0
9      0.2
10     1.0
11     0.0
12     0.2
13     0.0
14     0.0
15     3.0
16     0.2
17     0.0
18    35.2
19     0.0
Name: Rainfall, dtype: float64

In [19]:
Xtrain.loc[Xtrain["Rainfall"] >= 1, "RainToday"] = "Yes"
Xtrain.loc[Xtrain["Rainfall"] < 1, "RainToday"] = "No"
Xtrain.loc[Xtrain["Rainfall"] == np.nan, "RainToday"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [20]:
Xtest.loc[Xtest["Rainfall"] >= 1, "RainToday"] = "Yes"
Xtest.loc[Xtest["Rainfall"] < 1, "RainToday"] = "No"
Xtest.loc[Xtest["Rainfall"] == np.nan, "RainToday"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [21]:
Xtrain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2015-08-24,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,2016-12-10,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,2010-04-18,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,2009-11-26,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,2014-04-25,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [22]:
Xtest.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2016-01-23,NorahHead,22.0,27.8,25.2,,,SSW,57.0,S,...,37.0,91.0,86.0,1006.6,1008.1,,,26.2,23.1,Yes
1,2009-03-05,MountGambier,12.0,18.6,2.2,3.0,7.8,SW,52.0,SW,...,28.0,88.0,62.0,1020.2,1019.9,8.0,7.0,14.8,17.5,Yes
2,2010-03-05,MountGinini,9.1,13.3,,,,NE,41.0,,...,,,,,,,,,,
3,2013-10-26,Wollongong,13.1,20.3,0.0,,,SW,33.0,W,...,24.0,40.0,51.0,1021.3,1019.5,,,16.8,19.6,No
4,2016-11-28,Sale,12.2,20.0,0.4,,,E,33.0,SW,...,19.0,92.0,69.0,1015.6,1013.2,8.0,4.0,13.6,19.0,No


In [23]:
# 提取月份信息

int(Xtrain.loc[0, "Date"].split("-")[1])

8

In [24]:
Xtrain.loc[:, "Date"] = Xtrain.loc[:, "Date"].apply(lambda x:int(x.split("-")[1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [25]:
Xtrain = Xtrain.rename(columns={"Date": "Month"})

In [26]:
Xtrain

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,9,NorahHead,16.4,23.9,0.0,,,NNE,50.0,N,...,30.0,77.0,80.0,1029.6,1024.4,,,20.1,20.3,No
3496,4,Wollongong,18.3,21.4,22.6,,,SSW,52.0,SSW,...,9.0,87.0,89.0,1018.0,1016.7,2.0,8.0,19.6,18.3,Yes
3497,11,Witchcliffe,12.4,32.2,0.0,,,N,61.0,N,...,30.0,29.0,25.0,1015.5,1014.7,,,27.4,32.1,No
3498,1,Watsonia,16.1,23.6,0.0,12.8,11.4,W,50.0,WSW,...,22.0,56.0,37.0,1007.7,1011.2,6.0,2.0,18.0,22.8,No


In [27]:
Xtest.loc[:, "Date"] = Xtest.loc[:, "Date"].apply( lambda x: x.split("-")[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [28]:
Xtest = Xtest.rename(columns={"Date": "Month"})

In [32]:
# 处理困难特征：地点

# 爬虫部分，爬需要的城市的经纬度

import time
from selenium import webdriver

In [33]:
cityname = X.loc[:, "Location"].unique().tolist()
cityname

['Adelaide',
 'Albany',
 'Albury',
 'AliceSprings',
 'BadgerysCreek',
 'Ballarat',
 'Bendigo',
 'Brisbane',
 'Cairns',
 'Canberra',
 'Cobar',
 'CoffsHarbour',
 'Dartmoor',
 'Darwin',
 'GoldCoast',
 'Hobart',
 'Katherine',
 'Launceston',
 'Melbourne',
 'MelbourneAirport',
 'Mildura',
 'Moree',
 'MountGambier',
 'MountGinini',
 'Newcastle',
 'Nhil',
 'NorahHead',
 'NorfolkIsland',
 'Nuriootpa',
 'PearceRAAF',
 'Penrith',
 'Perth',
 'PerthAirport',
 'Portland',
 'Richmond',
 'Sale',
 'SalmonGums',
 'Sydney',
 'SydneyAirport',
 'Townsville',
 'Tuggeranong',
 'Uluru',
 'WaggaWagga',
 'Walpole',
 'Watsonia',
 'Williamtown',
 'Witchcliffe',
 'Wollongong',
 'Woomera']

In [105]:
driver = webdriver.Chrome()

In [47]:
df = pd.DataFrame(index=range(len(cityname)))

In [118]:
city = cityname[3]
city

'AliceSprings'

In [114]:
driver.get("https://cn.bing.com/?ensearch=1&FORM=BEHPTB")
time.sleep(0.3)

In [115]:
search_box = driver.find_element_by_name("q")

In [116]:
search_box.send_keys("{} Latitude and longitude".format(cityname[3]))

In [117]:
search_box.submit()

In [120]:
# TODO: 如何处理搜索结果，暂时不清楚

In [127]:
# 最终结果

cityll = pd.read_csv("cityll.csv") # 城市对应的经纬度，爬虫的结果
city_climate = pd.read_csv("Cityclimate.csv") # 城市对应的气候

In [128]:
cityll.head()

Unnamed: 0.1,Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,0,Adelaide,34.9285°,138.6007°,"S,",E
1,1,Albany,35.0275°,117.8840°,"S,",E
2,2,Albury,36.0737°,146.9135°,"S,",E
3,3,Wodonga,36.1241°,146.8818°,"S,",E
4,4,AliceSprings,23.6980°,133.8807°,"S,",E


In [129]:
city_climate.head()

Unnamed: 0,City,Climate
0,Adelaide,Warm temperate
1,Albany,Mild temperate
2,Albury,"Hot dry summer, cool winter"
3,Wodonga,"Hot dry summer, cool winter"
4,AliceSprings,"Hot dry summer, warm winter"


In [130]:
# 去掉经纬度的度数符号
cityll.loc[:, "Latitude"] = cityll.loc[:, "Latitude"].apply(lambda x: x[:-1])
cityll.loc[:, "Longitude"] = cityll.loc[:, "Longitude"].apply(lambda x: x[:-1])

In [131]:
cityll.head()

Unnamed: 0.1,Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,0,Adelaide,34.9285,138.6007,"S,",E
1,1,Albany,35.0275,117.884,"S,",E
2,2,Albury,36.0737,146.9135,"S,",E
3,3,Wodonga,36.1241,146.8818,"S,",E
4,4,AliceSprings,23.698,133.8807,"S,",E


In [135]:
# 观察到所有的经纬度方向都是一致的，经纬度的方向可以舍弃

citylld = cityll.iloc[:, [1, 2, 3]]

In [166]:
citylld.iloc[:, 1] = citylld.iloc[:, 1].apply(lambda x: float(x))
citylld.iloc[:, 2] = citylld.iloc[:, 2].apply(lambda x: float(x))
citylld.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,City,Latitude,Longitude,climate
0,Adelaide,34.9285,138.6007,Warm temperate
1,Albany,35.0275,117.884,Mild temperate
2,Albury,36.0737,146.9135,"Hot dry summer, cool winter"
3,Wodonga,36.1241,146.8818,"Hot dry summer, cool winter"
4,AliceSprings,23.698,133.8807,"Hot dry summer, warm winter"


In [138]:
citylld.loc[:, "climate"] = city_climate.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [139]:
citylld.head()

Unnamed: 0,City,Latitude,Longitude,climate
0,Adelaide,34.9285,138.6007,Warm temperate
1,Albany,35.0275,117.884,Mild temperate
2,Albury,36.0737,146.9135,"Hot dry summer, cool winter"
3,Wodonga,36.1241,146.8818,"Hot dry summer, cool winter"
4,AliceSprings,23.698,133.8807,"Hot dry summer, warm winter"


In [141]:
city_climate.head()

Unnamed: 0,City,Climate
0,Adelaide,Warm temperate
1,Albany,Mild temperate
2,Albury,"Hot dry summer, cool winter"
3,Wodonga,"Hot dry summer, cool winter"
4,AliceSprings,"Hot dry summer, warm winter"


In [142]:
# 计算距离，所有地点
citylld.iloc[:, 1].value_counts().index.tolist()

['17.9614',
 '37.8284',
 '34.7240',
 '33.8688',
 '33.6800',
 '25.5233',
 '21.6924',
 '26.1509',
 '17.3179',
 '41.4332',
 '33.8608',
 '25.6406',
 '12.4634',
 '23.4422',
 '31.0582',
 '12.1825',
 '24.8840',
 '25.2335',
 '32.1306',
 '33.0945',
 '35.6531',
 '31.4837',
 '33.9399',
 '34.1743',
 '23.3593',
 '23.6980',
 '36.0737',
 '26.5950',
 '27.4698',
 '20.7256',
 '19.6484',
 '32.1960',
 '36.8365',
 '14.4521',
 '28.5387',
 '34.4278',
 '29.4331',
 '31.0927',
 '42.8821',
 '27.5423',
 '33.0850',
 '19.2590',
 '32.9283',
 '26.4021',
 '34.9285',
 '15.4758',
 '30.2986',
 '33.4193',
 '27.9738',
 '21.1425',
 '31.9505',
 '12.6493',
 '29.0139',
 '16.9186',
 '25.8989',
 '36.7189',
 '42.1500',
 '20.3107',
 '34.2080',
 '36.1241',
 '37.8136',
 '31.6772',
 '16.0703',
 '31.9539',
 '30.5908',
 '25.0537',
 '33.3256',
 '33.0380',
 '25.8436',
 '36.3833',
 '33.9550',
 '15.4825',
 '31.9440',
 '35.2809',
 '26.1353',
 '23.3791',
 '30.7490',
 '41.1771',
 '15.6432',
 '35.5258',
 '37.7703',
 '28.7774',
 '25.3444',
 '21

In [161]:
samplecity = pd.read_csv("samplecity.csv")

In [162]:
samplecity.head()

Unnamed: 0.1,Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,0,Canberra,35.2809°,149.1300°,"S,",E
1,1,Sydney,33.8688°,151.2093°,"S,",E
2,2,Perth,31.9505°,115.8605°,"S,",E
3,3,Darwin,12.4634°,130.8456°,"S,",E
4,4,Hobart,42.8821°,147.3272°,"S,",E


In [163]:
samplecity["Latitude"] = samplecity["Latitude"].apply(lambda x:float(x[:-1]))
samplecity["Longitude"] = samplecity["Longitude"].apply(lambda x:float(x[:-1]))

samplecityd = samplecity.iloc[:,[1,2,3]]
type(samplecityd.iloc[0, 1])

numpy.float64

In [164]:
samplecityd.iloc[0, 1]

35.2809

In [167]:
# 角度转弧度

from math import radians, sin, cos, acos

citylld.loc[:,"slat"] = citylld.iloc[:,1].apply(lambda x : radians(x))
citylld.loc[:,"slon"] = citylld.iloc[:,2].apply(lambda x : radians(x))
samplecityd.loc[:,"elat"] = samplecityd.iloc[:,1].apply(lambda x : radians(x))
samplecityd.loc[:,"elon"] = samplecityd.iloc[:,2].apply(lambda x : radians(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [168]:
citylld.head()

Unnamed: 0,City,Latitude,Longitude,climate,slat,slon
0,Adelaide,34.9285,138.6007,Warm temperate,0.609617,2.419039
1,Albany,35.0275,117.884,Mild temperate,0.611345,2.057464
2,Albury,36.0737,146.9135,"Hot dry summer, cool winter",0.629605,2.564124
3,Wodonga,36.1241,146.8818,"Hot dry summer, cool winter",0.630484,2.563571
4,AliceSprings,23.698,133.8807,"Hot dry summer, warm winter",0.413608,2.336659


In [169]:
# 代入公式

import sys
for i in range(samplecityd.shape[0]):
    slat = citylld.loc[:,"slat"]
    slon = citylld.loc[:,"slon"]
    elat = samplecityd.loc[i,"elat"]
    elon = samplecityd.loc[i,"elon"]
    dist = 6371.01 * np.arccos(np.sin(slat)*np.sin(elat) +
                               np.cos(slat)*np.cos(elat)*np.cos(slon.values - elon))
    city_index = np.argsort(dist)[0]
    #每次计算后,取距离最近的城市,然后将最近的城市和城市对应的气候都匹配到samplecityd中
    samplecityd.loc[i,"closest_city"] = citylld.loc[city_index,"City"]
    samplecityd.loc[i,"climate"] = citylld.loc[city_index,"climate"]



  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,City,Latitude,Longitude,elat,elon,closest_city,climate
0,Canberra,35.2809,149.13,0.615768,2.60281,Canberra,Cool temperate
1,Sydney,33.8688,151.2093,0.591122,2.6391,Sydney,Warm temperate
2,Perth,31.9505,115.8605,0.557641,2.022147,Perth,Warm temperate
3,Darwin,12.4634,130.8456,0.217527,2.283687,Darwin,"High humidity summer, warm winter"
4,Hobart,42.8821,147.3272,0.748434,2.571345,Hobart,Cool temperate


In [170]:

#查看最后的结果,需要检查城市匹配是否基本正确
samplecityd.head(300)


Unnamed: 0,City,Latitude,Longitude,elat,elon,closest_city,climate
0,Canberra,35.2809,149.13,0.615768,2.60281,Canberra,Cool temperate
1,Sydney,33.8688,151.2093,0.591122,2.6391,Sydney,Warm temperate
2,Perth,31.9505,115.8605,0.557641,2.022147,Perth,Warm temperate
3,Darwin,12.4634,130.8456,0.217527,2.283687,Darwin,"High humidity summer, warm winter"
4,Hobart,42.8821,147.3272,0.748434,2.571345,Hobart,Cool temperate
5,Brisbane,27.4698,153.0251,0.479438,2.670792,Brisbane,"Warm humid summer, mild winter"
6,Adelaide,34.9285,138.6007,0.609617,2.419039,Adelaide,Warm temperate
7,Bendigo,36.757,144.2794,0.641531,2.518151,Ballarat,Cool temperate
8,Townsville,19.259,146.8169,0.336133,2.562438,Townsville,"High humidity summer, warm winter"
9,AliceSprings,23.698,133.8807,0.413608,2.336659,AliceSprings,"Hot dry summer, warm winter"


In [175]:
#查看气候的分布
samplecityd["climate"].value_counts()

#确认无误后,取出样本城市所对应的气候,并保存
locafinal = samplecityd.iloc[:,[0,-1]]
locafinal.head()
locafinal.columns = ["Location","Climate"]



In [176]:
locafinal.head()

Unnamed: 0,Location,Climate
0,Canberra,Cool temperate
1,Sydney,Warm temperate
2,Perth,Warm temperate
3,Darwin,"High humidity summer, warm winter"
4,Hobart,Cool temperate


In [177]:
#在这里设定locafinal的索引为地点,是为了之后进行map的匹配
locafinal = locafinal.set_index(keys="Location")
locafinal.head()

Unnamed: 0_level_0,Climate
Location,Unnamed: 1_level_1
Canberra,Cool temperate
Sydney,Warm temperate
Perth,Warm temperate
Darwin,"High humidity summer, warm winter"
Hobart,Cool temperate


In [178]:
# 用气候替换掉原有城市

Xtrain.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [179]:
# 将 Location 中的内容替换，确保匹配进入的气候字符串不含逗号，气候两边不含空格
# 用 re 消除逗号

import re

Xtrain.loc[:, "Location"] = Xtrain.loc[:, "Location"].map(locafinal.iloc[:, 0]).apply(lambda x: re.sub(",", "", x.strip()))

In [180]:
Xtrain.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,High humidity summer warm winter,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Cool temperate,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Mild temperate,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Mild temperate,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Hot dry summer cool winter,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [181]:
Xtest.loc[:, "Location"] = Xtest.loc[:, "Location"].map(locafinal.iloc[:, 0]).apply(lambda x: re.sub(",", "", x.strip()))

In [182]:
Xtrain.rename(columns={"Location": "Climate"})
Xtest.rename(columns={"Location": "Climate"})

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,01,Cool temperate,22.0,27.8,25.2,,,SSW,57.0,S,...,37.0,91.0,86.0,1006.6,1008.1,,,26.2,23.1,Yes
1,03,Mild temperate,12.0,18.6,2.2,3.0,7.8,SW,52.0,SW,...,28.0,88.0,62.0,1020.2,1019.9,8.0,7.0,14.8,17.5,Yes
2,03,Cool temperate,9.1,13.3,,,,NE,41.0,,...,,,,,,,,,,
3,10,Warm temperate,13.1,20.3,0.0,,,SW,33.0,W,...,24.0,40.0,51.0,1021.3,1019.5,,,16.8,19.6,No
4,11,Mild temperate,12.2,20.0,0.4,,,E,33.0,SW,...,19.0,92.0,69.0,1015.6,1013.2,8.0,4.0,13.6,19.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,05,Hot dry summer cool winter,9.1,15.8,11.2,,,WNW,30.0,,...,13.0,100.0,64.0,1022.6,1021.8,1.0,7.0,11.0,14.0,Yes
1496,05,Hot dry summer warm winter,9.6,21.1,0.0,,,E,26.0,ESE,...,4.0,36.0,23.0,1023.2,1020.9,8.0,8.0,14.2,20.3,No
1497,11,Hot dry summer cool winter,5.5,25.9,0.0,,,S,43.0,SE,...,17.0,44.0,25.0,,,,,18.4,24.9,No
1498,08,Hot dry summer cool winter,2.6,15.4,0.0,1.6,5.2,NW,41.0,ENE,...,24.0,79.0,54.0,1015.9,1011.5,6.0,5.0,8.1,13.3,No


In [184]:
# 查看缺失值情况
Xtrain.isnull().mean()

Month            0.000000
Location         0.000000
MinTemp          0.004000
MaxTemp          0.003143
Rainfall         0.009429
Evaporation      0.433429
Sunshine         0.488571
WindGustDir      0.067714
WindGustSpeed    0.067714
WindDir9am       0.067429
WindDir3pm       0.024286
WindSpeed9am     0.009714
WindSpeed3pm     0.018000
Humidity9am      0.011714
Humidity3pm      0.026286
Pressure9am      0.098857
Pressure3pm      0.098857
Cloud9am         0.379714
Cloud3pm         0.401429
Temp9am          0.005429
Temp3pm          0.019714
RainToday        0.009429
dtype: float64

In [187]:
# 首先找出分类型特征有哪些

cate = Xtrain.columns[Xtrain.dtypes == "object"].tolist()
cate

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [188]:
#除了特征类型为"object"的特征们,还有虽然用数字表示,但是本质为分类型特征的云层遮蔽程度
cloud = ["Cloud9am","Cloud3pm"]
cate = cate + cloud
cate

['Location',
 'WindGustDir',
 'WindDir9am',
 'WindDir3pm',
 'RainToday',
 'Cloud9am',
 'Cloud3pm']

In [190]:
# 对于分类型特征,我们使用众数来进行填补
from sklearn.impute import SimpleImputer

In [191]:
si = SimpleImputer(missing_values=np.nan,strategy="most_frequent")
si.fit(Xtrain.loc[:,cate])

SimpleImputer(strategy='most_frequent')

In [192]:
Xtrain.loc[:,cate] = si.transform(Xtrain.loc[:,cate])
Xtest.loc[:,cate] = si.transform(Xtest.loc[:,cate])

In [193]:
Xtrain.loc[:,cate].isnull().mean()

Location       0.0
WindGustDir    0.0
WindDir9am     0.0
WindDir3pm     0.0
RainToday      0.0
Cloud9am       0.0
Cloud3pm       0.0
dtype: float64

In [194]:
# 处理分类型变量(编码)

from sklearn.preprocessing import  OrdinalEncoder

oe = OrdinalEncoder()

In [195]:
oe = oe.fit(Xtrain.loc[:, cate])

In [196]:
# 用训练集的编码结果来编码训练和测试特征矩阵

Xtrain.loc[:,cate] = oe.transform(Xtrain.loc[:,cate])
Xtest.loc[:,cate] = oe.transform(Xtest.loc[:,cate])

In [197]:
# 处理连续性缺失值

col = Xtrain.columns.tolist()

In [198]:
for i in cate:
    col.remove(i)
    
col

['Month',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Temp9am',
 'Temp3pm']

In [199]:
impmean = SimpleImputer(missing_values=np.nan, strategy="mean")

In [200]:
impmean.fit(Xtrain.loc[:, col])

SimpleImputer()

In [203]:
Xtrain.loc[:, col] = impmean.transform(Xtrain.loc[:, col])
Xtest.loc[:, col] = impmean.transform(Xtest.loc[:, col])

In [204]:
Xtrain.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8.0,1.0,17.5,36.0,0.0,8.8,7.508659,2.0,26.0,6.0,...,15.0,57.0,51.651995,1016.8,1012.2,0.0,7.0,27.5,21.719003,0.0
1,12.0,0.0,9.5,25.0,0.0,5.619163,7.508659,6.0,33.0,4.0,...,17.0,59.0,31.0,1020.4,1017.5,7.0,7.0,14.6,23.6,0.0
2,4.0,4.0,13.0,22.6,0.0,3.8,10.4,13.0,39.858413,4.0,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,0.0
3,11.0,4.0,13.9,29.8,0.0,5.8,5.1,8.0,37.0,3.0,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,0.0
4,4.0,2.0,6.0,23.5,0.0,2.8,8.6,5.0,24.0,0.0,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,0.0


In [205]:
col.remove("Month")
col

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Temp9am',
 'Temp3pm']

In [206]:
from sklearn.preprocessing import StandardScaler

In [207]:
ss = StandardScaler()
ss = ss.fit(Xtrain.loc[:, col])

In [208]:
Xtrain.loc[:,col] = ss.transform(Xtrain.loc[:,col])
Xtest.loc[:,col] = ss.transform(Xtest.loc[:,col])

In [211]:
Xtrain.to_csv("Xtrain.csv")
Xtrain.to_csv("Xtrain.csv")
Ytest.to_csv("Ytest.csv")
Ytest.to_csv("Ytest.csv")

In [212]:
!ls

Cityclimate.csv  geckodriver.log  weatherAUS5000.csv  Xtrain.csv
cityll.csv	 samplecity.csv   weather.ipynb       Ytest.csv


In [219]:
# 建模
from time import time
import datetime
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, recall_score
from sklearn.svm import SVC

In [216]:
Ytrain = Ytrain.iloc[:, 0].ravel()
Ytest = Ytest.iloc[:, 0].ravel()

In [220]:
times = time()

for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    clf = SVC(kernel=kernel
              ,gamma="auto"
              ,degree=1
              ,cache_size=5000
             ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    print("{} 's testing accuracy {}, recall is {}, auc is {}".format(kernel, score, recall, auc))
    print(datetime.datetime.fromtimestamp(time() - times).strftime("%M:%s:%f"))

linear 's testing accuracy 0.844, recall is 0.46938775510204084, auc is 0.8690289302534202
00:0:819686
poly 's testing accuracy 0.8406666666666667, recall is 0.4577259475218659, auc is 0.868157066506069
00:1:145438
rbf 's testing accuracy 0.8133333333333334, recall is 0.30612244897959184, auc is 0.814872584420853
00:1:885629
sigmoid 's testing accuracy 0.6553333333333333, recall is 0.15451895043731778, auc is 0.4373077049068794
00:2:365932


In [222]:
# 调参: 追求高召回率

times = time()

for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    clf = SVC(kernel=kernel
              ,gamma="auto"
              ,degree=1
              ,cache_size=5000
              ,class_weight="balanced"
             ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    print("{} 's testing accuracy {}, recall is {}, auc is {}".format(kernel, score, recall, auc))
    print(datetime.datetime.fromtimestamp(time() - times).strftime("%M:%s:%f"))

linear 's testing accuracy 0.7966666666666666, recall is 0.7755102040816326, auc is 0.8700620635956569
00:1:020320
poly 's testing accuracy 0.7933333333333333, recall is 0.7638483965014577, auc is 0.8714479741767062
00:1:471410
rbf 's testing accuracy 0.8033333333333333, recall is 0.6005830903790087, auc is 0.8197131921048454
00:2:359189
sigmoid 's testing accuracy 0.562, recall is 0.282798833819242, auc is 0.43711871710037276
00:3:448449


In [224]:
# 限定 linear 核以后，可以 class_weight 给 {1: 100}

times = time()

for i in [10, 100]:
    clf = SVC(kernel="linear"
              ,gamma="auto"
              ,degree=1
              ,cache_size=5000
              ,class_weight={1:i}
             ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    print("{} 's testing accuracy {}, recall is {}, auc is {}".format(kernel, score, recall, auc))
    print(datetime.datetime.fromtimestamp(time() - times).strftime("%M:%s:%f"))

linear 's testing accuracy 0.6366666666666667, recall is 0.9125364431486881, auc is 0.866360422425545
00:1:776171
linear 's testing accuracy 0.22866666666666666, recall is 1.0, auc is 0.7860582435221279
00:17:954186


In [225]:
# 调参 ： 追求高准确率

from sklearn.metrics import confusion_matrix as CM

In [226]:
valuec = pd.Series(Ytest).value_counts()
valuec

0    1157
1     343
dtype: int64

In [227]:
valuec[0] / valuec.sum()

0.7713333333333333

In [228]:
# 特异度

clf = SVC(kernel="linear"
          ,gamma="auto"
          ,cache_size=5000
         ).fit(Xtrain, Ytrain)
result = clf.predict(Xtest)

In [229]:
cm = CM(Ytest, result, labels=(1,0))

In [230]:
cm

array([[ 161,  182],
       [  52, 1105]])

In [231]:
specificity = cm[1, 1] / cm[1, :].sum()

In [233]:
#几乎所有的0都被判断正确了,还有不少1也被判断正确了

specificity

0.9550561797752809

In [234]:
times = time()

for i in np.linspace(0.01, 0.05, 10):
    clf = SVC(kernel="linear"
              ,gamma="auto"
              ,degree=1
              ,cache_size=5000
              ,class_weight={1:1+i}
             ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest, Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
    print("{} 's testing accuracy {}, recall is {}, auc is {}".format(kernel, score, recall, auc))
    print(datetime.datetime.fromtimestamp(time() - times).strftime("%M:%s:%f"))

linear 's testing accuracy 0.8446666666666667, recall is 0.4752186588921283, auc is 0.8691574419618447
00:0:774139
linear 's testing accuracy 0.8446666666666667, recall is 0.478134110787172, auc is 0.8691851601734657
00:1:567660
linear 's testing accuracy 0.8446666666666667, recall is 0.478134110787172, auc is 0.8691977593605661
00:2:337928
linear 's testing accuracy 0.8453333333333334, recall is 0.48104956268221577, auc is 0.8691750808237853
00:3:120404
linear 's testing accuracy 0.844, recall is 0.48104956268221577, auc is 0.869394306679333
00:3:930292
linear 's testing accuracy 0.844, recall is 0.48104956268221577, auc is 0.8695278580625978
00:4:702247
linear 's testing accuracy 0.844, recall is 0.48104956268221577, auc is 0.8696588896084425
00:5:513270
linear 's testing accuracy 0.8446666666666667, recall is 0.4839650145772595, auc is 0.8696286515594013
00:6:308527
linear 's testing accuracy 0.8446666666666667, recall is 0.4839650145772595, auc is 0.8697118061942644
00:7:095807
lin

In [235]:
# 换模型：线性模型 LR 好

from sklearn.linear_model import LogisticRegression as LR

logclf = LR(solver="liblinear").fit(Xtrain, Ytrain)

In [237]:
# 比 0.845 好
logclf.score(Xtest, Ytest)

0.8486666666666667

In [238]:
C_range = np.linspace(3,5,10)

for C in C_range:
    logclf = LR(solver="liblinear",C=C).fit(Xtrain, Ytrain)
    print(C,logclf.score(Xtest,Ytest))

3.0 0.8493333333333334
3.2222222222222223 0.8493333333333334
3.4444444444444446 0.8493333333333334
3.6666666666666665 0.8493333333333334
3.888888888888889 0.8493333333333334
4.111111111111111 0.8493333333333334
4.333333333333333 0.8493333333333334
4.555555555555555 0.8493333333333334
4.777777777777778 0.8493333333333334
5.0 0.8493333333333334


In [239]:
# 调参：均衡

# 调 C，画 ROC 算 AUC 选阈值