In [23]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [24]:
df_train = pd.read_csv(r"./train.csv")
df_no_na = df_train.dropna()
df = df_no_na[df_no_na['Expected'] <= 10000]
# List of column names to be deleted
columns_to_delete = ['Ref_5x5_10th', 'Ref_5x5_50th', 'Ref_5x5_90th', 'RefComposite', 'RefComposite_5x5_10th', 'RefComposite_5x5_50th', 'RefComposite_5x5_90th']

# Delete the columns from the dataframe
df = df.drop(columns=columns_to_delete)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 目的変数を設定します。
y = df['Expected']
X = df.drop('Expected', axis=1)

# 主成分分析の結果を使用して訓練データとテストデータを分割します。
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# モデルのインスタンスを作成し、訓練データにフィットさせます。
model = LinearRegression()
model.fit(X_train, y_train)

# 訓練データとテストデータに対する予測を行います。
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# MSEを計算します。
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# MAEを計算します。
train_mae = mean_absolute_error(y_train, train_predictions)
test_mae = mean_absolute_error(y_test, test_predictions)

# R^2 scoreを計算します。
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# 結果を表示します。
print(f'Training MSE: {train_mse}')
print(f'Test MSE: {test_mse}')
print(f'Training MAE: {train_mae}')
print(f'Test MAE: {test_mae}')
print(f'Training R^2 Score: {train_r2}')
print(f'Test R^2 Score: {test_r2}')

Training MSE: 16320.417029053106
Test MSE: 16573.895267310178
Training MAE: 17.473967454858336
Test MAE: 17.45842498675406
Training R^2 Score: 0.0007322345912909034
Test R^2 Score: 0.000868145901332662


In [26]:
X_train.shape

(2215248, 16)

In [27]:
from sklearn.feature_selection import RFE

# RFEのインスタンスを作成
selector = RFE(model, n_features_to_select=16, step=1)

# 特徴量の重要度を学習
selector = selector.fit(X_train, y_train)


In [28]:
# RFEによって選ばれた特徴量を使って新しいデータセットを作成
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# モデルを再度訓練
model.fit(X_train_selected, y_train)

# 新しい特徴量セットで予測
train_predictions_selected = model.predict(X_train_selected)
test_predictions_selected = model.predict(X_test_selected)

# 新しい特徴量セットでの評価指標を計算
train_mse_selected = mean_squared_error(y_train, train_predictions_selected)
test_mse_selected = mean_squared_error(y_test, test_predictions_selected)
train_mae_selected = mean_absolute_error(y_train, train_predictions_selected)
test_mae_selected = mean_absolute_error(y_test, test_predictions_selected)
train_r2_selected = r2_score(y_train, train_predictions_selected)
test_r2_selected = r2_score(y_test, test_predictions_selected)

# 新しい評価指標を表示
print(f'Training MSE with selected features: {train_mse_selected}')
print(f'Test MSE with selected features: {test_mse_selected}')
print(f'Training MAE with selected features: {train_mae_selected}')
print(f'Test MAE with selected features: {test_mae_selected}')
print(f'Training R^2 Score with selected features: {train_r2_selected}')
print(f'Test R^2 Score with selected features: {test_r2_selected}')

Training MSE with selected features: 16320.417029053106
Test MSE with selected features: 16573.895267310178
Training MAE with selected features: 17.473967454858336
Test MAE with selected features: 17.45842498675406
Training R^2 Score with selected features: 0.0007322345912909034
Test R^2 Score with selected features: 0.000868145901332662


In [29]:
selected_features = X.columns[selector.support_]

# モデルの係数を取得
coefficients = model.coef_

# 選択された特徴量とそれに対応する係数を表示
for feature, coef in zip(selected_features, coefficients):
    print(f'Feature: {feature}, Coefficient: {coef}')

Feature: Id, Coefficient: 1.2922095326476143e-07
Feature: minutes_past, Coefficient: 0.017834875628732583
Feature: radardist_km, Coefficient: 0.9055108628367069
Feature: Ref, Coefficient: 0.0741735846741823
Feature: RhoHV, Coefficient: -2.9170833103612868
Feature: RhoHV_5x5_10th, Coefficient: 7.158013526092422
Feature: RhoHV_5x5_50th, Coefficient: -54.85303866259386
Feature: RhoHV_5x5_90th, Coefficient: 53.712266107704174
Feature: Zdr, Coefficient: 0.04133663108341599
Feature: Zdr_5x5_10th, Coefficient: 0.12713132345821418
Feature: Zdr_5x5_50th, Coefficient: 1.0138143258567778
Feature: Zdr_5x5_90th, Coefficient: -0.003941269830468233
Feature: Kdp, Coefficient: -0.018597500685223624
Feature: Kdp_5x5_10th, Coefficient: 0.1316399791171795
Feature: Kdp_5x5_50th, Coefficient: -0.04610166229067562
Feature: Kdp_5x5_90th, Coefficient: -0.09902616241578935
