In [None]:
# k_means.py
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt


# Read the CSV dataset into df (adjust the path / read options for your environment).
df = pd.read_csv("/hadoop/expt/3/DelayedFlights.csv", index_col=0)
# Print the row index and column names to verify the file was read correctly.
# A bare expression is only displayed when it is the last line of a cell, so
# an explicit print is needed to produce any output here.
print(df.index)
print(df.columns)

# Data preparation. Take an explicit copy so the encoding loop below writes to
# an independent frame instead of a slice of df (avoids SettingWithCopyWarning).
data_X = df[['Month', 'DayofMonth', 'DayOfWeek',
             'UniqueCarrier', 'Origin', 'Dest', 'Distance']].copy()

# Encode categorical features: replace each string column with integer codes.
# The fitted encoder for every column is kept in label_encoders.
label_encoders = {}
categorical_columns = ['UniqueCarrier', 'Origin', 'Dest']
for col in categorical_columns:
    le = LabelEncoder()
    data_X[col] = le.fit_transform(data_X[col])
    label_encoders[col] = le

# Standardize the features before clustering.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_X)

# Cluster with K-means.
k = 2  # assume two clusters (Cancelled vs. not Cancelled)
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(data_scaled)

# Attach the cluster assignment to the original frame.
df['Cluster'] = clusters

# Show how the Cancelled flag is distributed inside each cluster.
print("聚类结果统计：")
print(df.groupby('Cluster')['Cancelled'].value_counts())

# Silhouette coefficient (clustering quality). The score needs pairwise
# distances, so its cost grows quadratically with the number of rows; on a
# large table it is estimated from a fixed-seed random sample instead.
# Small inputs (at most silhouette_sample_size rows) are still scored exactly.
silhouette_sample_size = 10000
n_rows = data_scaled.shape[0]
silhouette_avg = silhouette_score(
    data_scaled, clusters,
    sample_size=silhouette_sample_size if n_rows > silhouette_sample_size else None,
    random_state=42)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Visualize the clusters and their centers. Only the first two standardized
# columns (Month and DayofMonth, per the column order of data_X) are plotted.
plt.figure(figsize=(8, 6))
plt.scatter(data_scaled[:, 0], data_scaled[:, 1],
            c=clusters, cmap='viridis', s=10)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[
            :, 1], c='red', marker='x', s=200)
plt.title("K-means Clustering Results")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

In [None]:

# random_forest.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Read the CSV dataset into df (adjust the path / read options for your environment).
df = pd.read_csv("/hadoop/expt/3/DelayedFlights.csv", index_col=0)
# Print the row index and column names to verify the file was read correctly.
# A bare expression is only displayed when it is the last line of a cell, so
# an explicit print is needed to produce any output here.
print(df.index)
print(df.columns)

# Data preparation. Take an explicit copy so the encoding loop below writes to
# an independent frame instead of a slice of df (avoids SettingWithCopyWarning).
data_X = df[['Month', 'DayofMonth', 'DayOfWeek',
             'UniqueCarrier', 'Origin', 'Dest', 'Distance']].copy()
data_Y = df['Cancelled']

# Encode categorical features: replace each string column with integer codes.
# The fitted encoder for every column is kept in label_encoders.
label_encoders = {}
categorical_columns = ['UniqueCarrier', 'Origin', 'Dest']
for col in categorical_columns:
    le = LabelEncoder()
    data_X[col] = le.fit_transform(data_X[col])
    label_encoders[col] = le

# Train/test split. Stratify on the label so both splits keep the same class
# ratio; stratification needs at least two samples per class, so fall back to
# a plain random split when that does not hold.
stratify_labels = data_Y if data_Y.value_counts().min() >= 2 else None
X_train, X_test, Y_train, Y_test = train_test_split(
    data_X, data_Y, test_size=0.2, random_state=42, stratify=stratify_labels)

# Build the random-forest classifier. n_jobs=-1 trains the trees on all
# available cores; the fixed random_state keeps the result reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model.
model.fit(X_train, Y_train)

# Predict on the held-out set.
Y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Test Accuracy: {accuracy:.2f}")
print("Classification Report:")
# zero_division=0 reports 0 for a class that is never predicted, without the
# UndefinedMetricWarning the default setting emits for that case.
print(classification_report(Y_test, Y_pred, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

In [None]:
# lstm.py


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


# Read the CSV dataset into df (adjust the path / read options for your environment).
df = pd.read_csv("/hadoop/expt/3/DelayedFlights.csv", index_col=0)
# Print the row index and column names to verify the file was read correctly.
# A bare expression is only displayed when it is the last line of a cell, so
# an explicit print is needed to produce any output here.
print(df.index)
print(df.columns)

# Data preparation. Take an explicit copy so the encoding loop below writes to
# an independent frame instead of a slice of df (avoids SettingWithCopyWarning).
data_X = df[['Month', 'DayofMonth', 'DayOfWeek',
             'UniqueCarrier', 'Origin', 'Dest', 'Distance']].copy()
data_Y = df['Cancelled']

# Encode categorical features: replace each string column with integer codes.
# MinMaxScaler only accepts numeric input, so the string columns must be
# encoded before scaling. The fitted encoders are kept in label_encoders.
label_encoders = {}
categorical_columns = ['UniqueCarrier', 'Origin', 'Dest']
for col in categorical_columns:
    le = LabelEncoder()
    data_X[col] = le.fit_transform(data_X[col])
    label_encoders[col] = le

# Normalize the features with min-max scaling.
scaler = MinMaxScaler()
data_X_scaled = scaler.fit_transform(data_X)

# Reshape to the 3-D (samples, time_steps, features) layout the LSTM layer takes.
time_steps = 1  # each sample is treated as a sequence of length 1
X = np.reshape(
    data_X_scaled, (data_X_scaled.shape[0], time_steps, data_X_scaled.shape[1]))

# Train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, data_Y, test_size=0.2, random_state=42)

# Build the LSTM model.
model = Sequential()
model.add(LSTM(50, return_sequences=False,
          input_shape=(time_steps, X.shape[2])))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model. The test split is passed as validation data, so the
# per-epoch validation metrics are computed on the same rows that are
# evaluated below.
model.fit(X_train, Y_train, epochs=10, batch_size=32,
          validation_data=(X_test, Y_test))

# Evaluate the model on the held-out split.
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")