## 数据运营不得不知的的数据处理经验

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import statistics

from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

## 缺失值的处理

In [2]:

# 生成缺失数据
df=pd.DataFrame(np.random.randn(100,4),columns=['col1','col2','col3','col4'])
df.iloc[1:2,1]=np.nan
df.iloc[4,3]=np.nan
print(df)

        col1      col2      col3      col4
0  -1.304651  0.923036  0.386671  0.283213
1  -0.739947       NaN -0.238299 -0.283137
2   0.152713 -0.631102  0.215356 -0.244788
3   1.374712  1.135909  0.234408  0.462964
4  -0.157082 -0.996825  1.951375       NaN
..       ...       ...       ...       ...
95 -1.633138 -0.359680 -1.274154 -1.130256
96  0.451450  1.189453 -1.336748 -0.299288
97 -0.547945  1.310752 -0.998328 -0.615632
98  0.171874 -1.482310 -2.203544  1.249701
99 -0.120466 -0.317151 -2.162981 -0.170375

[100 rows x 4 columns]


In [3]:
# 查看那些只缺失
nan_all=df.isnull()
print(nan_all)

     col1   col2   col3   col4
0   False  False  False  False
1   False   True  False  False
2   False  False  False  False
3   False  False  False  False
4   False  False  False   True
..    ...    ...    ...    ...
95  False  False  False  False
96  False  False  False  False
97  False  False  False  False
98  False  False  False  False
99  False  False  False  False

[100 rows x 4 columns]


In [4]:
nan_col1=df.isnull().any()
nan_col2=df.isnull().all()

In [5]:
## 获得含有na的列
print(nan_col1)

col1    False
col2     True
col3    False
col4     True
dtype: bool


In [6]:
# 获得全部为na的列
print(nan_col2)

col1    False
col2    False
col3    False
col4    False
dtype: bool


In [7]:
df2=df.dropna()
print(df2)

        col1      col2      col3      col4
0  -1.304651  0.923036  0.386671  0.283213
2   0.152713 -0.631102  0.215356 -0.244788
3   1.374712  1.135909  0.234408  0.462964
5   0.390509  0.257922  0.263039  1.478146
6  -0.298413  1.257356 -0.341082 -1.199408
..       ...       ...       ...       ...
95 -1.633138 -0.359680 -1.274154 -1.130256
96  0.451450  1.189453 -1.336748 -0.299288
97 -0.547945  1.310752 -0.998328 -0.615632
98  0.171874 -1.482310 -2.203544  1.249701
99 -0.120466 -0.317151 -2.162981 -0.170375

[98 rows x 4 columns]


In [8]:
df2.shape

(98, 4)

In [9]:
df.shape

(100, 4)

### sklearn.preprocessing.Imputer¶

https://scikit-learn.org/0.15/modules/generated/sklearn.preprocessing.Imputer.html
    
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [10]:
## 将缺失值替换为特定值 
## 将值为Nan的缺失值用均值做替换
nan_model=SimpleImputer(missing_values=np.nan,strategy='mean')
nan_result=nan_model.fit(df)
print(nan_result)

SimpleImputer()


In [11]:
print(df)

        col1      col2      col3      col4
0  -1.304651  0.923036  0.386671  0.283213
1  -0.739947       NaN -0.238299 -0.283137
2   0.152713 -0.631102  0.215356 -0.244788
3   1.374712  1.135909  0.234408  0.462964
4  -0.157082 -0.996825  1.951375       NaN
..       ...       ...       ...       ...
95 -1.633138 -0.359680 -1.274154 -1.130256
96  0.451450  1.189453 -1.336748 -0.299288
97 -0.547945  1.310752 -0.998328 -0.615632
98  0.171874 -1.482310 -2.203544  1.249701
99 -0.120466 -0.317151 -2.162981 -0.170375

[100 rows x 4 columns]


In [12]:
statistics.mean([0.40508527, 0.04641579 , 1.12043472,0])

0.392983945

In [13]:
statistics.mean([-1.208784,0, -0.903849 , 0.020395])

-0.5230595

## 异常值的处理

In [14]:
df1= pd.DataFrame({'col1':[1,120,3,5,2,12,13], 'col2':[12,17,31,53,22,32,43]})
print(df1)

   col1  col2
0     1    12
1   120    17
2     3    31
3     5    53
4     2    22
5    12    32
6    13    43


In [15]:
## 通过z-score方法判断异常值
##获得数据框的列名
## 循环读取每一列
## 得到每列的值
# 计算每一列z-score的得分
## 判断z-score是否大于2.2


df1_zscore=df1.copy()
cols1=df1.columns
for col in cols1:
    df1_col=df1[col]
    z_score=(df1_col-df1_col.mean())/df1_col.std()
    df1_zscore[col]=z_score.abs()> 2.2
print(df1_zscore)

    col1   col2
0  False  False
1   True  False
2  False  False
3  False  False
4  False  False
5  False  False
6  False  False


本示例方法中，阀值的设定是确定异常与否的关键，通常当阀值大于2时，已经是相对异常的表现值。
上述过程中，主要需要考虑的关键点是的：如何判断异常值。对于有guiding业务规则的可直接套用业务规则，而对于没有固定业务规则的，可以采用常见
的数学模型进行判断，即基于概率分布的模型（例如正态分布的标准差范围），基于类聚的方法（例如kmeans),基于密度的方法（例如lof),机遇分类的方法
（例如knn),机遇统计的方法（例如分为数法）等。


## 重复性处理

In [16]:
data1=['a',3]
data2=['b',2]
data3=['a',3]
data4=['c',2]

df2=pd.DataFrame([data1,data2,data3,data4],columns=['col1','col2'])
print(df2)

  col1  col2
0    a     3
1    b     2
2    a     3
3    c     2


In [17]:
# 判断重复数据

isDuplicated= df2.duplicated()
print(isDuplicated)

0    False
1    False
2     True
3    False
dtype: bool


In [18]:
## 切除重复值

new_df1=df2.drop_duplicates()
new_df2=df2.drop_duplicates(['col1'])
new_df3=df2.drop_duplicates(['col2'])
new_df4=df2.drop_duplicates(['col1','col2'])

print (new_df1);
print (new_df2);
print (new_df3);
print (new_df4);

  col1  col2
0    a     3
1    b     2
3    c     2
  col1  col2
0    a     3
1    b     2
3    c     2
  col1  col2
0    a     3
1    b     2
  col1  col2
0    a     3
1    b     2
3    c     2


## 将分类数据和顺序数据转换为标志变量

In [19]:
df3=pd.DataFrame({'id':[3566841,6541227,3512441], 'sex':['Male','Female','Female'],'level':['high','low','middle']})
print(df3)

        id     sex   level
0  3566841    Male    high
1  6541227  Female     low
2  3512441  Female  middle


In [20]:
df_new=df3.copy()
for col_num, col_name in enumerate(df3):
    col_data=df3[col_name]
    col_dtype=col_data.dtype
    if col_dtype == 'object':
        df_new=df_new.drop(col_name,1)
        value_sets=col_data.unique()
        for value_unique in value_sets:
            col_name_new = col_name + '_'+value_unique
            col_tmp=df3.iloc[:, col_num]
            new_col=(col_tmp == value_unique)
            df_new[col_name_new]=new_col
print(df_new)

        id  sex_Male  sex_Female  level_high  level_low  level_middle
0  3566841      True       False        True      False         False
1  6541227     False        True       False       True         False
2  3512441     False        True       False      False          True


In [21]:
# 使用sklearn进行标志转换

from sklearn.preprocessing import OneHotEncoder

df10=pd.DataFrame({'id':[3566841,6541227,3512441], 'sex':[1,2,2],'level':[3,1,2]})

id_data=df10.values[:, :1]
transform_data=df10.values[:, 1:]
enc=OneHotEncoder(handle_unknown='ignore')
df10_new=enc.fit_transform(transform_data).toarray()
df10_all=pd.concat((pd.DataFrame(id_data),pd.DataFrame(df10_new)),axis=1)
print(df10_all)


         0    0    1    2    3    4
0  3566841  1.0  0.0  0.0  0.0  1.0
1  6541227  0.0  1.0  1.0  0.0  0.0
2  3512441  0.0  1.0  0.0  1.0  0.0


In [22]:
df10

Unnamed: 0,id,sex,level
0,3566841,1,3
1,6541227,2,1
2,3512441,2,2


In [23]:
transform_data

array([[1, 3],
       [2, 1],
       [2, 2]])

In [24]:
df10_new

array([[1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

##  大数据时代数据化运营降维

In [25]:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

# 读取数据文件
data = np.loadtxt('/Users/balance/Desktop/python data analysis concept reference/data1.txt')  # 读取文本数据文件
x = data[:, :-1]  # 获得输入的x
y = data[:, -1]  # 获得目标变量y
print (x[0], y[0])  # 打印输出x和y的第一条记录

# 使用sklearn的DecisionTreeClassifier判断变量重要性
model_tree = DecisionTreeClassifier(random_state=0)  # 建立分类决策树模型对象
model_tree.fit(x, y)  # 将数据集的维度和目标变量输入模型
feature_importance = model_tree.feature_importances_  # 获得所有变量的重要性得分
print (feature_importance)  # 打印输出

# 使用sklearn的PCA进行维度转换
model_pca = PCA()  # 建立PCA模型对象
model_pca.fit(x)  # 将数据集输入模型
model_pca.transform(x)  # 对数据集进行转换映射
components = model_pca.components_  # 获得转换后的所有主成分
components_var = model_pca.explained_variance_  # 获得各主成分的方差
components_var_ratio = model_pca.explained_variance_ratio_  # 获得各主成分的方差占比
print (components[:2])  # 打印输出前2个主成分
print (components_var[:2])  # 打印输出前2个主成分的方差
print (components_var_ratio)  # 打印输出所有主成分的方差占比

[ 1.88622997  1.31785876 -0.16480621  0.56536882 -1.11934542 -0.53218995
 -0.6843102   1.24149827  1.00579225  0.45485041] 0.0
[0.03331054 0.01513967 0.02199713 0.119727   0.47930312 0.04776297
 0.17111746 0.02585441 0.02012725 0.06566044]
[[ 7.18818316e-03  1.41619205e-02  1.00543847e-02  3.65097575e-01
   6.38944537e-01 -1.95750380e-02 -1.73413378e-01 -3.80829974e-02
  -2.87413113e-03 -6.52829504e-01]
 [ 1.01307710e-02 -1.95270201e-04 -2.33689543e-02 -6.12915216e-01
   5.08983971e-01 -2.23429533e-02  6.02958940e-01 -1.49061329e-02
  -1.81362216e-02 -3.41623971e-03]]
[4.22602937 2.21149972]
[3.38339364e-01 1.77054475e-01 8.92753857e-02 8.73655166e-02
 8.23542686e-02 8.03329836e-02 7.38094896e-02 7.14685179e-02
 3.31291533e-32 4.66241032e-33]


In [26]:
data.shape

(1000, 11)

In [27]:
x,y

(array([[ 1.88622997,  1.31785876, -0.16480621, ...,  1.24149827,
          1.00579225,  0.45485041],
        [ 0.45016257,  0.67080853, -1.16571355, ..., -0.94946505,
         -0.33194209, -2.94399437],
        [ 0.48158666,  0.33524676,  0.72210929, ...,  1.46919579,
         -1.68387822,  1.44933243],
        ...,
        [ 0.36101381, -1.10383044,  1.2656558 , ...,  0.96446047,
          2.61770545, -1.14168056],
        [ 0.88695058,  0.75897958, -1.16120866, ...,  1.1150845 ,
         -1.57791648, -1.52150761],
        [ 0.15444084, -0.27528562,  0.77045662, ...,  0.03157817,
         -0.54756984,  0.95966265]]),
 array([0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0.,
        0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
        1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0.,
        0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0.,
        0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0

In [28]:
data

array([[ 1.88622997,  1.31785876, -0.16480621, ...,  1.00579225,
         0.45485041,  0.        ],
       [ 0.45016257,  0.67080853, -1.16571355, ..., -0.33194209,
        -2.94399437,  1.        ],
       [ 0.48158666,  0.33524676,  0.72210929, ..., -1.68387822,
         1.44933243,  1.        ],
       ...,
       [ 0.36101381, -1.10383044,  1.2656558 , ...,  2.61770545,
        -1.14168056,  1.        ],
       [ 0.88695058,  0.75897958, -1.16120866, ..., -1.57791648,
        -1.52150761,  0.        ],
       [ 0.15444084, -0.27528562,  0.77045662, ..., -0.54756984,
         0.95966265,  0.        ]])

In [29]:
print (x[0], y[0])  # 打印输出x和y的第一条记录

[ 1.88622997  1.31785876 -0.16480621  0.56536882 -1.11934542 -0.53218995
 -0.6843102   1.24149827  1.00579225  0.45485041] 0.0


In [30]:
print (feature_importance)  # 打印输出变量重要性

[0.03331054 0.01513967 0.02199713 0.119727   0.47930312 0.04776297
 0.17111746 0.02585441 0.02012725 0.06566044]


In [31]:
print (components[:2])# 打印输出前2个主成分

[[ 7.18818316e-03  1.41619205e-02  1.00543847e-02  3.65097575e-01
   6.38944537e-01 -1.95750380e-02 -1.73413378e-01 -3.80829974e-02
  -2.87413113e-03 -6.52829504e-01]
 [ 1.01307710e-02 -1.95270201e-04 -2.33689543e-02 -6.12915216e-01
   5.08983971e-01 -2.23429533e-02  6.02958940e-01 -1.49061329e-02
  -1.81362216e-02 -3.41623971e-03]]


In [32]:
print (components_var[:2]) # 打印输出前2个主成分的方差

[4.22602937 2.21149972]


In [33]:
print (components_var_ratio)  # 打印输出所有主成分的方差占比

[3.38339364e-01 1.77054475e-01 8.92753857e-02 8.73655166e-02
 8.23542686e-02 8.03329836e-02 7.38094896e-02 7.14685179e-02
 3.31291533e-32 4.66241032e-33]


## 解决样本类别分布不均衡的问题

In [43]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 1.2 MB/s eta 0:00:01
Collecting scikit-learn>=0.24
  Downloading scikit_learn-1.0-cp37-cp37m-macosx_10_13_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 4.9 MB/s eta 0:00:01
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
Successfully installed imbalanced-learn-0.8.0 imblearn-0.0 scikit-learn-1.0


In [45]:
pip install delayed

Collecting delayed
  Downloading delayed-0.11.0b1-py2.py3-none-any.whl (19 kB)
Collecting redis
  Downloading redis-3.5.3-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 924 kB/s eta 0:00:01
[?25hCollecting hiredis
  Downloading hiredis-2.0.0-cp37-cp37m-macosx_10_9_x86_64.whl (24 kB)
Installing collected packages: redis, hiredis, delayed
Successfully installed delayed-0.11.0b1 hiredis-2.0.0 redis-3.5.3
Note: you may need to restart the kernel to use updated packages.


In [54]:


from imblearn.over_sampling import SMOTE  # 过抽样处理库SMOTE
from imblearn.under_sampling import RandomUnderSampler  # 欠抽样处理库RandomUnderSampler
from sklearn.svm import SVC  # SVM中的分类算法SVC
from imblearn.ensemble import EasyEnsembleClassifier  # 简单集成方法EasyEnsemble



# 导入数据文件
bjh = pd.read_table('/Users/balance/Desktop/python data analysis concept reference/data2.txt', sep=' ', names=['col1', 'col2', 'col3', 'col4', 'col5', 'label'])  # 读取数据文件
x_bjh = bjh.iloc[:, :-1]  # 切片，得到输入x
y_bjh = bjh.iloc[:, -1]  # 切片，得到标签y
groupby_data_orgianl = bjh.groupby('label').count()  # 对label做分类汇总
print (groupby_data_orgianl)  # 打印输出原始数据集样本分类分布

       col1  col2  col3  col4  col5
label                              
0.0     942   942   942   942   942
1.0      58    58    58    58    58


In [36]:
bjh

Unnamed: 0,col1,col2,col3,col4,col5,label
0,1.484710,-0.567953,0.957248,-1.028711,-1.143092,0.0
1,-1.021847,-0.209943,0.585039,-0.625991,-0.532367,0.0
2,-0.418176,0.878976,-0.333351,0.371767,1.223683,0.0
3,0.441664,1.405135,2.334376,-2.453234,0.594130,0.0
4,-1.855857,1.342565,1.457096,-1.522041,0.935034,0.0
...,...,...,...,...,...,...
995,0.713214,-0.724003,1.216396,-1.307252,-1.455331,0.0
996,-0.669137,-1.204326,1.871553,-2.013140,-2.348712,0.0
997,-0.697967,0.844773,-1.424127,1.530442,1.700386,1.0
998,0.471606,-0.665115,1.250725,-1.342570,-1.400265,0.0


In [39]:
# 使用SMOTE方法进行过抽样处理
model_smote = SMOTE()  # 建立SMOTE模型对象
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(x_bjh, y_bjh)  # 输入数据并作过抽样处理
x_smote_resampled = pd.DataFrame(x_smote_resampled, columns=['col1', 'col2', 'col3', 'col4', 'col5'])  # 将数据转换为数据框并命名列名
y_smote_resampled = pd.DataFrame(y_smote_resampled, columns=['label'])  # 将数据转换为数据框并命名列名
smote_resampled = pd.concat([x_smote_resampled, y_smote_resampled], axis=1)  # 按列合并数据框
groupby_data_smote = smote_resampled.groupby('label').count()  # 对label做分类汇总
print (groupby_data_smote)  # 打印输出经过SMOTE处理后的数据集样本分类分布

       col1  col2  col3  col4  col5
label                              
0.0     942   942   942   942   942
1.0     942   942   942   942   942


In [92]:
# 使用RandomUnderSampler方法进行欠抽样处理
model_RandomUnderSampler = RandomUnderSampler()  # 建立RandomUnderSampler模型对象
x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled = model_RandomUnderSampler.fit_resample(x_bjh, y_bjh)  # 输入数据并作欠抽样处理
x_RandomUnderSampler_resampled = pd.DataFrame(x_RandomUnderSampler_resampled,
                                              columns=['col1', 'col2', 'col3', 'col4', 'col5'])  # 将数据转换为数据框并命名列名
y_RandomUnderSampler_resampled = pd.DataFrame(y_RandomUnderSampler_resampled, columns=['label'])  # 将数据转换为数据框并命名列名
RandomUnderSampler_resampled = pd.concat([x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled],
                                         axis=1)  # 按列合并数据框
groupby_data_RandomUnderSampler = RandomUnderSampler_resampled.groupby('label').count()  # 对label做分类汇总
print (groupby_data_RandomUnderSampler)  # 打印输出经过RandomUnderSampler处理后的数据集样本分类分布

       col1  col2  col3  col4  col5
label                              
0.0      58    58    58    58    58
1.0      58    58    58    58    58


In [93]:
# 使用SVM的权重调节处理不均衡样本
model_svm = SVC(class_weight='balanced')  # 创建SVC模型对象并指定类别权重
model_svm.fit(x_bjh, y_bjh)  # 输入x和y并训练模型

SVC(class_weight='balanced')

In [94]:
x_bjh

Unnamed: 0,col1,col2,col3,col4,col5
0,1.484710,-0.567953,0.957248,-1.028711,-1.143092
1,-1.021847,-0.209943,0.585039,-0.625991,-0.532367
2,-0.418176,0.878976,-0.333351,0.371767,1.223683
3,0.441664,1.405135,2.334376,-2.453234,0.594130
4,-1.855857,1.342565,1.457096,-1.522041,0.935034
...,...,...,...,...,...
995,0.713214,-0.724003,1.216396,-1.307252,-1.455331
996,-0.669137,-1.204326,1.871553,-2.013140,-2.348712
997,-0.697967,0.844773,-1.424127,1.530442,1.700386
998,0.471606,-0.665115,1.250725,-1.342570,-1.400265


In [95]:
y_bjh

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
995    0.0
996    0.0
997    1.0
998    0.0
999    0.0
Name: label, Length: 1000, dtype: float64

## 解决运营数据的共线性问题

In [109]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [110]:
# 读取数据
data_gxx = np.loadtxt('/Users/balance/Desktop/python data analysis concept reference/data5.txt', delimiter='\t')  # 读取数据文件
x_gxx = data_gxx[:, :-1]  # 切分自变量
y_gxx = data_gxx[:, -1]  # 切分预测变量

In [111]:
# 使用岭回归算法进行回归分析
model_ridge = Ridge(alpha=1.0)  # 建立岭回归模型对象
model_ridge.fit(x_gxx, y_gxx)  # 输入x/y训练模型
print (model_ridge.coef_)  # 打印输出自变量的系数
print (model_ridge.intercept_)  # 打印输出截距

[ 8.50164360e+01 -1.18330186e-03  9.80792921e-04 -8.54201056e-04
  2.10489064e-05  2.20180449e-04 -3.00990875e-06 -9.30084240e-06
 -2.84498824e-08]
-7443.986528680895


In [116]:
x_gxx

array([[9.74500000e+01, 6.91560000e+02, 3.17747000e+03, ...,
        2.50230850e+05, 1.48545545e+06, 1.48517277e+06],
       [1.01760000e+02, 9.20360000e+02, 1.57680100e+04, ...,
        6.39772285e+06, 5.72076300e+05, 5.71978860e+05],
       [9.82100000e+01, 8.94760000e+02, 8.82403000e+03, ...,
        1.61567885e+06, 1.05798051e+06, 1.05838664e+06],
       ...,
       [1.00580000e+02, 5.22320000e+02, 7.88055000e+03, ...,
        5.72337040e+05, 2.25482960e+05, 4.10655250e+05],
       [9.95600000e+01, 5.03000000e+02, 4.02205000e+03, ...,
        7.15016400e+05, 2.08891280e+05, 3.95942710e+05],
       [1.00420000e+02, 7.14510000e+02, 7.79795000e+03, ...,
        2.77508514e+06, 5.97421660e+05, 2.26286140e+05]])

In [114]:
y_gxx

array([ 812.46, 1185.68,  894.41,  995.97,  908.55, 1077.18, 1198.11,
       1045.43,  999.64, 1120.31,  998.51,  975.7 , 1134.36,  963.95,
       1213.37, 1058.94, 1105.75, 1208.89, 1075.08,  903.12, 1021.3 ,
       1002.86, 1062.18, 1054.16, 1111.76, 1122.69, 1038.08, 1039.56,
       1077.71, 1110.26,  974.37, 1182.96, 1028.33, 1043.62, 1057.61,
       1047.95, 1076.97,  981.7 , 1181.97, 1057.11,  989.06, 1046.9 ,
        986.8 ,  905.75, 1233.18, 1003.55,  912.03, 1167.41, 1074.36,
       1018.34, 1179.38, 1195.93,  998.36,  954.54, 1031.69,  944.14,
       1214.54, 1139.28, 1102.34, 1013.25, 1089.18, 1079.66, 1100.71,
       1082.08, 1232.92, 1163.92, 1122.71,  980.16,  915.29, 1081.61,
        863.89, 1118.91,  975.68, 1081.99, 1087.6 , 1114.33, 1048.  ,
        983.05, 1072.7 , 1064.42, 1201.38,  997.15, 1075.52, 1171.63,
       1117.33,  894.53,  990.35,  991.83, 1093.31, 1115.44,  950.08,
       1030.41, 1039.31, 1094.33, 1092.28, 1078.03, 1112.72,  984.55,
        984.92, 1220

In [123]:
# 使用主成分回归进行回归分析
model_pca = PCA()  # 建立PCA模型对象
data_pca = model_pca.fit_transform(x_gxx)  # 将x进行主成分分析
ratio_cumsm = np.cumsum(model_pca.explained_variance_ratio_)  # 得到所有主成分方差占比的累积数据
print (ratio_cumsm)  # 打印输出所有主成分方差占比累积
rule_index = np.where(ratio_cumsm > 0.8)  # 获取方差占比超过0.8的所有索引值
min_index = rule_index[0][0]  # 获取最小索引值
data_pca_result = data_pca[:, :min_index + 1]  # 根据最小索引值提取主成分
model_liner = LinearRegression()  # 建立回归模型对象
model_liner.fit(data_pca_result, y_gxx)  # 输入主成分数据和预测变量y并训练模型
print (model_liner.coef_)  # 打印输出自变量的系数

[0.9028     0.98570494 0.99957412 0.99995908 0.99999562 0.99999939
 0.99999999 1.         1.        ]
[1.26262171e-05]


In [125]:
print (ratio_cumsm)  # 打印输出所有主成分方差占比累积

[0.9028     0.98570494 0.99957412 0.99995908 0.99999562 0.99999939
 0.99999999 1.         1.        ]


In [126]:
print (model_liner.coef_)  # 打印输出自变量的系数

[1.26262171e-05]


In [124]:
print (model_liner.intercept_)  # 打印输出截距

1058.52726


## 有关相关性分析的混沌

In [127]:


data_xgxhd = np.loadtxt('/Users/balance/Desktop/python data analysis concept reference/data5.txt', delimiter='\t')  # 读取数据文件
x_xgxhd = data_xgxhd[:, :-1]  # 切分自变量
correlation_matrix = np.corrcoef(x_xgxhd, rowvar=0)  # 相关性分析
print (correlation_matrix.round(2))  # 打印输出相关性结果

[[ 1.   -0.04  0.27 -0.05  0.21 -0.05  0.19 -0.03 -0.02]
 [-0.04  1.   -0.01  0.73 -0.01  0.62  0.    0.48  0.51]
 [ 0.27 -0.01  1.   -0.01  0.72 -0.    0.65  0.01  0.02]
 [-0.05  0.73 -0.01  1.    0.01  0.88  0.01  0.7   0.72]
 [ 0.21 -0.01  0.72  0.01  1.    0.02  0.91  0.03  0.03]
 [-0.05  0.62 -0.    0.88  0.02  1.    0.03  0.83  0.82]
 [ 0.19  0.    0.65  0.01  0.91  0.03  1.    0.03  0.03]
 [-0.03  0.48  0.01  0.7   0.03  0.83  0.03  1.    0.71]
 [-0.02  0.51  0.02  0.72  0.03  0.82  0.03  0.71  1.  ]]
