In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
weather = pd.read_csv("./weatherAUS5000.csv", index_col=0)

In [3]:
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
0,2015-03-24,Adelaide,12.3,19.3,0.0,5.0,,S,39.0,S,...,19.0,59.0,47.0,1022.2,1021.4,,,15.1,17.7,No
1,2011-07-12,Adelaide,7.9,11.4,0.0,1.0,0.5,N,20.0,NNE,...,7.0,70.0,59.0,1028.7,1025.7,,,8.4,11.3,No
2,2010-02-08,Adelaide,24.0,38.1,0.0,23.4,13.0,SE,39.0,NNE,...,19.0,36.0,24.0,1018.0,1016.0,,,32.4,37.4,No
3,2016-09-19,Adelaide,6.7,16.4,0.4,,,N,31.0,N,...,15.0,65.0,40.0,1014.4,1010.0,,,11.2,15.9,No
4,2014-03-05,Adelaide,16.7,24.8,0.0,6.6,11.7,S,37.0,S,...,24.0,61.0,48.0,1019.3,1018.9,,,20.8,23.7,No


In [4]:
X = weather.iloc[:, :-1]

In [5]:
Y = weather.iloc[:, -1]

In [6]:
X.shape

(5000, 21)

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5000 non-null   object 
 1   Location       5000 non-null   object 
 2   MinTemp        4979 non-null   float64
 3   MaxTemp        4987 non-null   float64
 4   Rainfall       4950 non-null   float64
 5   Evaporation    2841 non-null   float64
 6   Sunshine       2571 non-null   float64
 7   WindGustDir    4669 non-null   object 
 8   WindGustSpeed  4669 non-null   float64
 9   WindDir9am     4651 non-null   object 
 10  WindDir3pm     4887 non-null   object 
 11  WindSpeed9am   4949 non-null   float64
 12  WindSpeed3pm   4919 non-null   float64
 13  Humidity9am    4936 non-null   float64
 14  Humidity3pm    4880 non-null   float64
 15  Pressure9am    4506 non-null   float64
 16  Pressure3pm    4504 non-null   float64
 17  Cloud9am       3111 non-null   float64
 18  Cloud3pm

In [8]:
X.isnull().mean()

Date             0.0000
Location         0.0000
MinTemp          0.0042
MaxTemp          0.0026
Rainfall         0.0100
Evaporation      0.4318
Sunshine         0.4858
WindGustDir      0.0662
WindGustSpeed    0.0662
WindDir9am       0.0698
WindDir3pm       0.0226
WindSpeed9am     0.0102
WindSpeed3pm     0.0162
Humidity9am      0.0128
Humidity3pm      0.0240
Pressure9am      0.0988
Pressure3pm      0.0992
Cloud9am         0.3778
Cloud3pm         0.3976
Temp9am          0.0066
Temp3pm          0.0176
dtype: float64

In [9]:
# 标签分类
np.unique(Y)

array(['No', 'Yes'], dtype=object)

In [10]:
# 分集，描述性统计
# 先分集，再数据预处理
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=420)

# 恢复索引
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])

In [11]:
# 样本不均衡
Ytrain.value_counts()

No     2704
Yes     796
Name: RainTomorrow, dtype: int64

In [12]:
Ytest.value_counts()

No     1157
Yes     343
Name: RainTomorrow, dtype: int64

In [13]:
# 将标签编码
from sklearn.preprocessing import LabelEncoder

# 用训练集训练，在训练集和测试机上分别 transform
encoder = LabelEncoder().fit(Ytrain)
Ytrain = pd.DataFrame(encoder.transform(Ytrain))
Ytest = pd.DataFrame(encoder.transform(Ytest))

In [14]:
# 探索特征矩阵

Xtrain.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,3486.0,12.225645,6.396243,-6.5,-1.715,1.8,4.1,7.7,12.0,16.7,20.9,25.9,29.0
MaxTemp,3489.0,23.245543,7.201839,-3.7,8.888,12.84,14.5,18.0,22.5,28.4,33.0,40.4,46.4
Rainfall,3467.0,2.487049,7.949686,0.0,0.0,0.0,0.0,0.0,0.0,0.8,6.6,41.272,115.8
Evaporation,1983.0,5.619163,4.383098,0.0,0.4,0.8,1.4,2.6,4.8,7.4,10.2,20.6,56.0
Sunshine,1790.0,7.508659,3.805841,0.0,0.0,0.345,1.4,4.6,8.3,10.6,12.0,13.3,13.9
WindGustSpeed,3263.0,39.858413,13.219607,9.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,76.0,117.0
WindSpeed9am,3466.0,14.046163,8.670472,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,37.0,65.0
WindSpeed3pm,3437.0,18.55339,8.611818,0.0,2.0,6.0,7.0,13.0,19.0,24.0,30.0,43.0,65.0
Humidity9am,3459.0,69.069095,18.787698,2.0,18.0,35.0,45.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,3408.0,51.651995,20.697872,2.0,9.0,17.0,23.0,37.0,52.0,66.0,79.0,98.0,100.0


In [15]:
Xtest.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,1493.0,11.916812,6.375377,-8.5,-2.024,1.6,3.7,7.3,11.8,16.5,20.48,25.316,28.3
MaxTemp,1498.0,22.906809,6.986043,-0.8,9.134,13.0,14.5,17.8,22.4,27.8,32.6,38.303,45.1
Rainfall,1483.0,2.241807,7.988822,0.0,0.0,0.0,0.0,0.0,0.0,0.8,5.2,35.372,108.2
Evaporation,858.0,5.657809,4.105762,0.0,0.4,1.0,1.6,2.8,4.8,7.6,10.4,19.458,38.8
Sunshine,781.0,7.677465,3.862294,0.0,0.0,0.3,1.5,4.7,8.6,10.7,12.2,13.4,13.9
WindGustSpeed,1406.0,40.044097,14.027052,9.0,15.0,20.0,24.0,30.0,39.0,48.0,57.0,78.0,122.0
WindSpeed9am,1483.0,13.986514,9.124337,0.0,0.0,0.0,4.0,7.0,13.0,20.0,26.0,39.36,72.0
WindSpeed3pm,1482.0,18.601215,8.850446,0.0,2.0,6.0,7.0,13.0,19.0,24.0,31.0,43.0,56.0
Humidity9am,1477.0,68.688558,18.876448,4.0,20.0,36.0,44.0,57.0,69.0,82.0,95.0,100.0,100.0
Humidity3pm,1472.0,51.431386,20.459957,2.0,8.71,18.0,23.0,37.0,52.0,66.0,78.0,96.29,100.0


In [16]:
# 处理困难特征 -- 日期

Xtrainc = Xtrain.copy()

Xtrainc.sort_values(by="Location")

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
2796,2015-03-24,Adelaide,12.3,19.3,0.0,5.0,,S,39.0,S,...,13.0,19.0,59.0,47.0,1022.2,1021.4,,,15.1,17.7
2975,2012-08-17,Adelaide,7.8,13.2,17.6,0.8,,SW,61.0,SW,...,20.0,28.0,76.0,47.0,1012.5,1014.7,,,8.3,12.5
775,2013-03-16,Adelaide,17.4,23.8,,,9.7,SSE,46.0,S,...,9.0,19.0,63.0,57.0,1019.9,1020.5,,,19.1,20.7
861,2011-07-12,Adelaide,7.9,11.4,0.0,1.0,0.5,N,20.0,NNE,...,7.0,7.0,70.0,59.0,1028.7,1025.7,,,8.4,11.3
2906,2015-08-24,Adelaide,9.2,14.3,0.0,,,SE,48.0,SE,...,17.0,19.0,64.0,42.0,1024.7,1024.1,,,9.9,13.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2223,2009-05-08,Woomera,9.2,20.6,0.0,5.2,10.4,ESE,37.0,SE,...,19.0,19.0,64.0,34.0,1030.5,1026.9,0.0,1.0,13.7,20.1
1984,2014-05-26,Woomera,15.5,23.6,0.0,24.0,,NNW,43.0,NNE,...,9.0,26.0,49.0,37.0,1014.2,1010.3,7.0,7.0,18.0,21.5
1592,2012-01-10,Woomera,16.8,26.7,0.0,10.0,5.3,SW,46.0,S,...,20.0,22.0,52.0,33.0,1019.1,1016.8,4.0,6.0,18.3,24.9
2824,2015-11-03,Woomera,16.2,28.5,7.8,4.2,4.5,WSW,80.0,NE,...,26.0,50.0,76.0,53.0,1009.6,1006.8,6.0,7.0,20.5,26.2


In [17]:
Xtrain.iloc[:,0].value_counts()

2015-07-03    6
2014-05-16    6
2015-10-12    6
2010-11-03    5
2011-09-04    5
             ..
2013-04-08    1
2015-10-10    1
2010-02-01    1
2011-06-20    1
2013-12-30    1
Name: Date, Length: 2141, dtype: int64

In [18]:
Xtrain["Rainfall"].head(20)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.2
8      0.0
9      0.2
10     1.0
11     0.0
12     0.2
13     0.0
14     0.0
15     3.0
16     0.2
17     0.0
18    35.2
19     0.0
Name: Rainfall, dtype: float64

In [19]:
Xtrain.loc[Xtrain["Rainfall"] >= 1, "RainToday"] = "Yes"
Xtrain.loc[Xtrain["Rainfall"] < 1, "RainToday"] = "No"
Xtrain.loc[Xtrain["Rainfall"] == np.nan, "RainToday"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [20]:
Xtest.loc[Xtest["Rainfall"] >= 1, "RainToday"] = "Yes"
Xtest.loc[Xtest["Rainfall"] < 1, "RainToday"] = "No"
Xtest.loc[Xtest["Rainfall"] == np.nan, "RainToday"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [21]:
Xtrain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2015-08-24,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,2016-12-10,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,2010-04-18,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,2009-11-26,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,2014-04-25,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No


In [22]:
Xtest.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2016-01-23,NorahHead,22.0,27.8,25.2,,,SSW,57.0,S,...,37.0,91.0,86.0,1006.6,1008.1,,,26.2,23.1,Yes
1,2009-03-05,MountGambier,12.0,18.6,2.2,3.0,7.8,SW,52.0,SW,...,28.0,88.0,62.0,1020.2,1019.9,8.0,7.0,14.8,17.5,Yes
2,2010-03-05,MountGinini,9.1,13.3,,,,NE,41.0,,...,,,,,,,,,,
3,2013-10-26,Wollongong,13.1,20.3,0.0,,,SW,33.0,W,...,24.0,40.0,51.0,1021.3,1019.5,,,16.8,19.6,No
4,2016-11-28,Sale,12.2,20.0,0.4,,,E,33.0,SW,...,19.0,92.0,69.0,1015.6,1013.2,8.0,4.0,13.6,19.0,No


In [26]:
# 提取月份信息

int(Xtrain.loc[0, "Date"].split("-")[1])

12

In [28]:
Xtrain.loc[:, "Date"] = Xtrain.loc[:, "Date"].apply(lambda x:int(x.split("-")[1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [30]:
Xtrain = Xtrain.rename(columns={"Date": "Month"})

In [31]:
Xtrain

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8,Katherine,17.5,36.0,0.0,8.8,,ESE,26.0,NNW,...,15.0,57.0,,1016.8,1012.2,0.0,,27.5,,No
1,12,Tuggeranong,9.5,25.0,0.0,,,NNW,33.0,NE,...,17.0,59.0,31.0,1020.4,1017.5,,,14.6,23.6,No
2,4,Albany,13.0,22.6,0.0,3.8,10.4,,,NE,...,31.0,79.0,68.0,1020.3,1015.7,1.0,3.0,17.5,20.8,No
3,11,Sale,13.9,29.8,0.0,5.8,5.1,S,37.0,N,...,28.0,82.0,44.0,1012.5,1005.9,6.0,6.0,18.5,27.5,No
4,4,Mildura,6.0,23.5,0.0,2.8,8.6,NNE,24.0,E,...,15.0,58.0,35.0,1019.8,1014.1,2.0,4.0,12.4,22.4,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,9,NorahHead,16.4,23.9,0.0,,,NNE,50.0,N,...,30.0,77.0,80.0,1029.6,1024.4,,,20.1,20.3,No
3496,4,Wollongong,18.3,21.4,22.6,,,SSW,52.0,SSW,...,9.0,87.0,89.0,1018.0,1016.7,2.0,8.0,19.6,18.3,Yes
3497,11,Witchcliffe,12.4,32.2,0.0,,,N,61.0,N,...,30.0,29.0,25.0,1015.5,1014.7,,,27.4,32.1,No
3498,1,Watsonia,16.1,23.6,0.0,12.8,11.4,W,50.0,WSW,...,22.0,56.0,37.0,1007.7,1011.2,6.0,2.0,18.0,22.8,No


In [32]:
Xtest.loc[:, "Date"] = Xtest.loc[:, "Date"].apply( lambda x: x.split("-")[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [33]:
Xtest = Xtest.rename(columns={"Date": "Month"})

In [None]:
# 处理困难特征：地点