## pandas.fillna()

In [1]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, None, 4]})
df_filled = df.fillna(0)
print(df_filled)

     A
0  1.0
1  2.0
2  0.0
3  4.0


In [3]:
# 使用前一个非缺失值进行填充
df_filled = df.fillna(method='ffill')
print(df_filled)

     A
0  1.0
1  2.0
2  2.0
3  4.0


In [4]:
# 使用后一个非缺失值进行填充
df_filled = df.fillna(method='bfill')
print(df_filled)

     A
0  1.0
1  2.0
2  4.0
3  4.0


## pandas.drop()

In [7]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df)
new_df = df.drop([0])  # 删除索引为0的行
new_df

   A  B
0  1  4
1  2  5
2  3  6


Unnamed: 0,A,B
1,2,5
2,3,6


In [13]:
df = pd.DataFrame({'A': [1, 2, np.nan], 'B': [4, np.nan, 6]})
print(df)
new_df = df.dropna()  # 删除含有缺失值的行
new_df

     A    B
0  1.0  4.0
1  2.0  NaN
2  NaN  6.0


Unnamed: 0,A,B
0,1.0,4.0


In [15]:
df = pd.DataFrame({'A': [1, 2, 2, 1], 'B': [4, 5, 5, 6]})
print(df)
new_df = df.drop_duplicates()  # 删除重复的行
new_df

   A  B
0  1  4
1  2  5
2  2  5
3  1  6


Unnamed: 0,A,B
0,1,4
1,2,5
3,1,6


## pandas.interpolate()

In [17]:
import pandas as pd

# 创建一个包含缺失值的 Series
s = pd.Series([1, np.nan, 3, np.nan, 5])

# 使用 interpolate() 方法进行线性插值
new_s = s.interpolate()

new_s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

## pandas.quantile()

In [18]:
import pandas as pd

data = [10, 20, 30, 40, 50, 60, 70, 80, 90]
series = pd.Series(data)

# 计算中位数
median = series.quantile(0.5)
print("Median:", median)

# 计算第一四分位数和第三四分位数
quartiles = series.quantile([0.25, 0.75])
print("Quartiles:")
print(quartiles)

Median: 50.0
Quartiles:
0.25    30.0
0.75    70.0
dtype: float64


## 离散化

In [19]:
import pandas as pd

# 生成示例数据
data = [10, 15, 20, 25, 30, 35, 40, 45, 50]

# 将数据转换为DataFrame格式
df = pd.DataFrame(data, columns=['value'])

# 使用cut函数进行等宽离散化
bins = 3  # 划分为3个区间
df['bin'] = pd.cut(df['value'], bins)

print(df)

   value               bin
0     10    (9.96, 23.333]
1     15    (9.96, 23.333]
2     20    (9.96, 23.333]
3     25  (23.333, 36.667]
4     30  (23.333, 36.667]
5     35  (23.333, 36.667]
6     40    (36.667, 50.0]
7     45    (36.667, 50.0]
8     50    (36.667, 50.0]


In [20]:
import pandas as pd

# 生成示例数据
data = [10, 15, 20, 25, 30, 35, 40, 45, 50]

# 将数据转换为DataFrame格式
df = pd.DataFrame(data, columns=['value'])

# 使用qcut函数进行等频离散化
bins = 3  # 划分为3个区间
df['bin'] = pd.qcut(df['value'], bins)

print(df)

   value               bin
0     10   (9.999, 23.333]
1     15   (9.999, 23.333]
2     20   (9.999, 23.333]
3     25  (23.333, 36.667]
4     30  (23.333, 36.667]
5     35  (23.333, 36.667]
6     40    (36.667, 50.0]
7     45    (36.667, 50.0]
8     50    (36.667, 50.0]


In [21]:
from sklearn.cluster import KMeans
import pandas as pd

# 生成示例数据
data = [10, 15, 20, 25, 30, 35, 40, 45, 50]

# 将数据转换为DataFrame格式
df = pd.DataFrame(data, columns=['value'])

# 使用K-means聚类算法进行离散化
k = 3  # 划分为3个簇
kmeans = KMeans(n_clusters=k).fit(df)
df['bin'] = kmeans.labels_

print(df)



   value  bin
0     10    1
1     15    1
2     20    1
3     25    2
4     30    2
5     35    2
6     40    0
7     45    0
8     50    0


In [22]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# 生成示例数据
data = [10, 15, 20, 25, 30, 35, 40, 45, 50]

# 将数据转换为DataFrame格式
df = pd.DataFrame(data, columns=['value'])

# 使用决策树算法进行离散化
dt = DecisionTreeClassifier(max_leaf_nodes=3)
dt.fit(df, df['value'])
df['bin'] = dt.predict(df)

print(df)

   value  bin
0     10   10
1     15   15
2     20   20
3     25   20
4     30   20
5     35   20
6     40   20
7     45   20
8     50   20


## 归一化和标准化

In [23]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

# 示例数据
data = np.array([[1, 2, 3],
                 [4, 5, 6],
                 [7, 8, 9]])

# 最小-最大缩放
min_max_scaler = MinMaxScaler()
data_normalized = min_max_scaler.fit_transform(data)
print("Min-Max Scaling:")
print(data_normalized)

# Z-Score标准化
zscore_scaler = StandardScaler()
data_standardized = zscore_scaler.fit_transform(data)
print("Z-Score Standardization:")
print(data_standardized)

Min-Max Scaling:
[[0.  0.  0. ]
 [0.5 0.5 0.5]
 [1.  1.  1. ]]
Z-Score Standardization:
[[-1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487]]


## 数值化

In [26]:
from sklearn.preprocessing import LabelEncoder

# 创建一个包含类别变量的列表
data = ['上', '中', '下', '上', '中']

# 创建LabelEncoder对象
encoder = LabelEncoder()

# 对数据进行数值化
encoded_data = encoder.fit_transform(data)

# 打印数值化后的结果
print(encoded_data)

[0 2 1 0 2]


In [28]:
from sklearn.preprocessing import OrdinalEncoder

# 创建一个包含类别变量的二维数组
data = [['上'], ['中'], ['下'], ['上'], ['中']]

# 创建OrdinalEncoder对象，并指定映射关系
encoder = OrdinalEncoder(categories=[['下', '中', '上']])

# 对数据进行数值化
encoded_data = encoder.fit_transform(data)

# 打印数值化后的结果
print(encoded_data)

[[2.]
 [1.]
 [0.]
 [2.]
 [1.]]


In [24]:
import pandas as pd

# 创建一个包含分类变量的DataFrame
data = pd.DataFrame({'颜色': ['红', '绿', '蓝', '红', '绿']})

# 使用pandas的get_dummies函数进行数值化独热编码
encoded_data = pd.get_dummies(data)

# 打印编码后的DataFrame
print(encoded_data)

   颜色_红  颜色_绿  颜色_蓝
0     1     0     0
1     0     1     0
2     0     0     1
3     1     0     0
4     0     1     0


In [25]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# 创建一个包含分类变量的数组
data = np.array([['红'], ['绿'], ['蓝'], ['红'], ['绿']])

# 创建OneHotEncoder对象
encoder = OneHotEncoder()

# 对数据进行数值化独热编码
encoded_data = encoder.fit_transform(data).toarray()

# 打印编码后的数组
print(encoded_data)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]


## 正规化

In [32]:
import numpy as np

# 输入特征 X
X = np.array([[1, 2],
              [3, 4],
              [5, 6]])

# 目标变量 y
y = np.array([3, 5, 7])

In [33]:
from sklearn.linear_model import Lasso

# 创建Lasso回归模型，alpha为正规化强度参数
lasso = Lasso(alpha=0.1)

# 训练模型
lasso.fit(X, y)

# 获取模型的权重向量
weights = lasso.coef_
weights

array([0.9625, 0.    ])

In [34]:
from sklearn.linear_model import Ridge

# 创建Ridge回归模型，alpha为正规化强度参数
ridge = Ridge(alpha=0.1)

# 训练模型
ridge.fit(X, y)

# 获取模型的权重向量
weights = ridge.coef_
weights

array([0.49689441, 0.49689441])

# LDA

In [35]:
import numpy as np

# 生成特征数据 X
np.random.seed(0)
n_samples = 100
n_features = 3
X = np.random.randn(n_samples, n_features)

# 生成目标变量 y
y = np.random.randint(0, 2, n_samples)

In [38]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 创建LDA对象
lda = LinearDiscriminantAnalysis(n_components=1)

# 拟合LDA模型并进行降维
X_lda = lda.fit_transform(X, y)

# 查看降维后的数据
print("降维后的数据 X_lda:")
print(X_lda)

降维后的数据 X_lda:
[[ 0.30712767]
 [-1.87763148]
 [-0.34165333]
 [ 1.38174172]
 [ 0.25342184]
 [-0.26424203]
 [-2.37859223]
 [-0.88648083]
 [-0.70635822]
 [ 1.57898431]
 [-0.77648935]
 [ 1.22159466]
 [-0.82057978]
 [-1.00157649]
 [ 0.32075247]
 [ 1.18931637]
 [ 0.03666792]
 [-1.1088394 ]
 [ 0.22180063]
 [-0.27553083]
 [-0.29322706]
 [ 0.53925142]
 [-0.00773587]
 [ 0.22852864]
 [ 0.12619626]
 [-0.0314264 ]
 [-0.81756987]
 [-1.73799846]
 [ 0.53333929]
 [ 1.3308521 ]
 [ 0.47765716]
 [ 0.39697442]
 [ 0.18199072]
 [-1.41923935]
 [-0.45388424]
 [-1.38821821]
 [ 1.02907563]
 [ 1.66103186]
 [ 1.15504502]
 [ 1.09195184]
 [ 0.35653288]
 [-0.5211206 ]
 [ 0.90184561]
 [ 0.555468  ]
 [-0.14478596]
 [-0.36480747]
 [-1.37765123]
 [ 0.57784267]
 [-1.8236746 ]
 [-0.69187926]
 [-0.61903386]
 [-0.0961292 ]
 [-1.37141245]
 [ 2.25484174]
 [-1.4374277 ]
 [-1.63383949]
 [ 0.51204112]
 [-0.64947599]
 [-0.18522175]
 [ 0.52081809]
 [-0.21071899]
 [-0.42379178]
 [-0.0550401 ]
 [-0.17997499]
 [ 0.76214064]
 [ 0.994749