In [70]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import csv

In [None]:
column_names = ["battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", "m_dep", "mobile_wt", "n_cores", "pc","px_height","px_width","ram","sc_h","sc_w","talk_time","three_g","touch_screen","wifi"]

# KNNImputer
n_neighbors = 5

数据输入预处理

In [71]:
train_DataFrame = pd.read_csv(filepath_or_buffer="Data/MobilePrice/train.csv", 
                    names=["battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", "m_dep", "mobile_wt", "n_cores", "pc","px_height","px_width","ram","sc_h","sc_w","talk_time","three_g","touch_screen","wifi","price_range"], skiprows=1)
test_DataFrame = pd.read_csv(filepath_or_buffer="Data/MobilePrice/test.csv", 
                    names=["battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", "m_dep", "mobile_wt", "n_cores", "pc","px_height","px_width","ram","sc_h","sc_w","talk_time","three_g","touch_screen","wifi"], skiprows=1)
test_AnswerDataFrame = pd.read_csv(filepath_or_buffer="Data/MobilePrice/gender_submission.csv",
                    names=["id", "price_range"], skiprows=1)

print(train_DataFrame)
print(test_DataFrame)
print(test_AnswerDataFrame)

      battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0               842     0          2.2         0   1       0           7   
1              1021     1          0.5         1   0       1          53   
2               563     1          0.5         1   2       1          41   
3               615     1          2.5         0   0       0          10   
4              1821     1          1.2         0  13       1          44   
...             ...   ...          ...       ...  ..     ...         ...   
1595           1206     0          3.0         1  10       1          30   
1596            832     1          0.5         1   1       1           5   
1597            848     1          2.8         0   1       0           8   
1598           1851     0          2.9         0   0       0          53   
1599           1166     1          0.5         0   7       0          59   

      m_dep  mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  \
0       

In [72]:
def EncodeData(x: pd.DataFrame) -> list:
    """
    对特征向量进行编码
    将pandas的DataFrame结构转换为list
    @x: pandas.DataFrame
    @return: 特征向量
    """
    input_data = [[x["battery_power"][i], x["blue"][i], x["clock_speed"][i], x["dual_sim"][i], x["fc"][i], x["four_g"][i], x["int_memory"][i], x["m_dep"][i], x["mobile_wt"][i], x["n_cores"][i], x["pc"][i], x["px_height"][i], x["px_width"][i], x["ram"][i], x["sc_h"][i], x["sc_w"][i], x["talk_time"][i], x["three_g"][i], x["touch_screen"][i], x["wifi"][i]] for i in range(len(x))]
    return input_data
    

def EncodeTrainData(x: pd.DataFrame) -> tuple[list, list]:
    """
    对测试集进行编码处理
    将pandas的DataFrame结构转换为list
    @x: pandas.DataFrame
    @return: 特征向量，标签向量
    """
    input_data = EncodeData(x)
    label = [x["price_range"][i] for i in range(len(x))]

    return input_data, label

def EncodeTestData(x: pd.DataFrame) -> list:
    """
    对训练集进行编码处理
    将pandas的DataFrame结构转换为list
    @x: pandas.DataFrame
    @return: 特征向量
    """
    input_data = EncodeData(x)
    
    return input_data

def EncodeTestAnswer(x: pd.DataFrame) -> list:
    """
    对训练结果进行编码处理
    将pandas的DataFrame结构转换为list
    @x: pandas.DataFrame
    @return: 标签向量
    """
    label = [x["price_range"][i] for i in range(len(x))]
    
    return label


In [None]:
# battery_power", "blue", "clock_speed", "dual_sim", "fc", "four_g", "int_memory", "m_dep", "mobile_wt", "n_cores", "pc","px_height","px_width","ram","sc_h","sc_w","talk_time","three_g","touch_screen","wifi"
class MinMaxScalerStrategy:
    def __init__(self, data: pd.DataFrame, filepath: str):
        self.data = data
        self.min = {
            "battery_power": data["battery_power"].min(),
            "blue": data["blue"].min(),
            "clock_speed": data["clock_speed"].min(),
            "dual_sim": data["dual_sim"].min(),
            "fc": data["fc"].min(),
            "four_g": data["four_g"].min(),
            "int_memory": data["int_memory"].min(),
            "m_dep": data["m_dep"].min(),
            "mobile_wt": data["mobile_wt"].min(),
        }
        self.max = {
            "Pclass": data["Pclass"].max(),
            "Sex": data["Sex"].max(),
            "Age": data["Age"].max(),
            "SibSp": data["SibSp"].max(),
            "Parch": data["Parch"].max(),
            "Fare": data["Fare"].max(),
            "Embarked": data["Embarked"].max()
        }
        colunm_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
        def inner_transform(data: pd.DataFrame) -> pd.DataFrame:
            for name in colunm_names:
                for i in range(len(data[name])):
                    data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
            return data
        self.data = inner_transform(self.data)
        # 存储MinMax相关数据
        save_data = []
        save_data.append(colunm_names)
        max_array = []
        min_array = []
        for name in colunm_names:
            max_array.append(self.max[name])
            min_array.append(self.min[name])
        save_data.append(max_array)
        save_data.append(min_array)
        with open(filepath, "w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(save_data)


    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        colunm_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
        for name in colunm_names:
            for i in range(len(data[name])):
                data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
        return data

In [73]:
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import LabelEncoder
# 
# train_x, train_y = EncodeTrainData(train_DataFrame)
# test_x = EncodeTestData(test_DataFrame)
# test_y = EncodeTestAnswer(test_AnswerDataFrame)
# 
# # 初始化MinMaxScaler
# scaler = MinMaxScaler()
# # 对特征向量进行归一化处理
# train_x = scaler.fit_transform(train_x)  # 这里 train_x 必须是一个 DataFrame
# 
# # 对标签向量进行编码（例如，使用 LabelEncoder）
# le = LabelEncoder()
# train_y = le.fit_transform(train_y)
# 
# print(train_x)
# print(train_y)

[[0.22778891 0.         0.68       ... 0.         0.         1.        ]
 [0.34736139 1.         0.         ... 1.         1.         0.        ]
 [0.04141617 1.         0.         ... 1.         1.         0.        ]
 ...
 [0.23179693 1.         0.92       ... 1.         1.         1.        ]
 [0.90180361 0.         0.96       ... 1.         0.         0.        ]
 [0.44422178 1.         0.         ... 1.         0.         1.        ]]
[1 2 2 ... 0 2 3]


随机森林训练

In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_classifier = RandomForestClassifier(n_estimators=100)  
#随机森林中决策树的数量设置为100
rf_classifier.fit(train_x, train_y)

pred_y = rf_classifier.predict(test_x)

accuracy = accuracy_score(test_y, pred_y)
print(f'Model Accuracy: {accuracy:.4f}')

Model Accuracy: 0.2475


In [80]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(train_x, train_y)
best_rf_classifier = grid_search.best_estimator_

In [76]:
y_pred_best = best_rf_classifier.predict(test_x)
best_accuracy = accuracy_score(test_y, y_pred_best)
print(f'Best Model Accuracy: {best_accuracy:.4f}')

Best Model Accuracy: 0.2475


SVM训练

In [77]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 初始化SVM分类器
svm_classifier = SVC(kernel='linear')

# 训练模型
svm_classifier.fit(train_x, train_y)

# 进行预测
y_pred = svm_classifier.predict(test_x)

# 评估模型
accuracy = accuracy_score(test_y, y_pred)
print(f'Model Accuracy: {accuracy:.4f}')

# 参数调优（可选）
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly']
}

grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5)
grid_search.fit(train_x, train_y)
best_svm_classifier = grid_search.best_estimator_

# 使用最佳参数的模型进行预测
y_pred_best = best_svm_classifier.predict(test_x)
best_accuracy = accuracy_score(test_y, y_pred_best)
print(f'Best Model Accuracy: {best_accuracy:.4f}')

Model Accuracy: 0.2475
Best Model Accuracy: 0.2475


MLP训练

In [78]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# 初始化MLP分类器
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# 训练模型
mlp_classifier.fit(train_x, train_y)

# 进行预测
y_pred = mlp_classifier.predict(test_x)

# 评估模型
accuracy = accuracy_score(test_y, y_pred)
print(f'Model Accuracy: {accuracy:.4f}')

# 参数调优（可选）
from sklearn.model_selection import GridSearchCV

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000, 2000]
}

grid_search = GridSearchCV(estimator=mlp_classifier, param_grid=param_grid, cv=5)
grid_search.fit(train_x, train_y)
best_mlp_classifier = grid_search.best_estimator_

# 使用最佳参数的模型进行预测
y_pred_best = best_mlp_classifier.predict(test_x)
best_accuracy = accuracy_score(test_y, y_pred_best)
print(f'Best Model Accuracy: {best_accuracy:.4f}')

Model Accuracy: 0.2475




Best Model Accuracy: 0.25


