In [27]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [28]:
df_train = pd.read_csv(r"./train.csv")
df_no_na = df_train.dropna()
df = df_no_na[df_no_na['Expected'] <= 10000]
# List of column names to be deleted
columns_to_delete = ['Id', 'Ref_5x5_10th', 'Ref_5x5_50th', 'Ref_5x5_90th', 'RefComposite', 'RefComposite_5x5_10th', 'RefComposite_5x5_50th', 'RefComposite_5x5_90th', 'RhoHV_5x5_90th']

# Delete the columns from the dataframe
df = df.drop(columns=columns_to_delete)

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 目的変数を設定します。
y = df['Expected']
X = df.drop('Expected', axis=1)

# 主成分分析の結果を使用して訓練データとテストデータを分割します。
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# モデルのインスタンスを作成し、訓練データにフィットさせます。
model = LinearRegression()
model.fit(X_train, y_train)

# 訓練データとテストデータに対する予測を行います。
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# MSEを計算します。
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

# MAEを計算します。
train_mae = mean_absolute_error(y_train, train_predictions)
test_mae = mean_absolute_error(y_test, test_predictions)

# R^2 scoreを計算します。
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# 結果を表示します。
print(f'Training MSE: {train_mse}')
print(f'Test MSE: {test_mse}')
print(f'Training MAE: {train_mae}')
print(f'Test MAE: {test_mae}')
print(f'Training R^2 Score: {train_r2}')
print(f'Test R^2 Score: {test_r2}')

Training MSE: 16320.418944172518
Test MSE: 16573.899792184835
Training MAE: 17.474237674520538
Test MAE: 17.45864279353661
Training R^2 Score: 0.0007321173322103469
Test R^2 Score: 0.0008678731262042882


In [30]:
X_train.shape

(2215248, 15)

In [31]:
from sklearn.feature_selection import RFE

# RFEのインスタンスを作成
selector = RFE(model, n_features_to_select=16, step=1)

# 特徴量の重要度を学習
selector = selector.fit(X_train, y_train)


In [32]:
# RFEによって選ばれた特徴量を使って新しいデータセットを作成
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# モデルを再度訓練
model.fit(X_train_selected, y_train)

# 新しい特徴量セットで予測
train_predictions_selected = model.predict(X_train_selected)
test_predictions_selected = model.predict(X_test_selected)

# 新しい特徴量セットでの評価指標を計算
train_mse_selected = mean_squared_error(y_train, train_predictions_selected)
test_mse_selected = mean_squared_error(y_test, test_predictions_selected)
train_mae_selected = mean_absolute_error(y_train, train_predictions_selected)
test_mae_selected = mean_absolute_error(y_test, test_predictions_selected)
train_r2_selected = r2_score(y_train, train_predictions_selected)
test_r2_selected = r2_score(y_test, test_predictions_selected)

# 新しい評価指標を表示
print(f'Training MSE with selected features: {train_mse_selected}')
print(f'Test MSE with selected features: {test_mse_selected}')
print(f'Training MAE with selected features: {train_mae_selected}')
print(f'Test MAE with selected features: {test_mae_selected}')
print(f'Training R^2 Score with selected features: {train_r2_selected}')
print(f'Test R^2 Score with selected features: {test_r2_selected}')

Training MSE with selected features: 16320.418944172518
Test MSE with selected features: 16573.899792184835
Training MAE with selected features: 17.474237674520538
Test MAE with selected features: 17.45864279353661
Training R^2 Score with selected features: 0.0007321173322103469
Test R^2 Score with selected features: 0.0008678731262042882


In [33]:
selected_features = X.columns[selector.support_]

# モデルの係数を取得
coefficients = model.coef_

# 選択された特徴量とそれに対応する係数を表示
for feature, coef in zip(selected_features, coefficients):
    print(f'Feature: {feature}, Coefficient: {coef}')

Feature: minutes_past, Coefficient: 0.01782140264236856
Feature: radardist_km, Coefficient: 0.905407730881849
Feature: Ref, Coefficient: 0.07414852990432486
Feature: RhoHV, Coefficient: -2.917024214380893
Feature: RhoHV_5x5_10th, Coefficient: 7.162336339981556
Feature: RhoHV_5x5_50th, Coefficient: -54.86137040822909
Feature: RhoHV_5x5_90th, Coefficient: 53.732816432130576
Feature: Zdr, Coefficient: 0.041378139064590286
Feature: Zdr_5x5_10th, Coefficient: 0.1264684584189271
Feature: Zdr_5x5_50th, Coefficient: 1.0127626445427804
Feature: Zdr_5x5_90th, Coefficient: -0.003358432496123699
Feature: Kdp, Coefficient: -0.018598673903610856
Feature: Kdp_5x5_10th, Coefficient: 0.13158327030543776
Feature: Kdp_5x5_50th, Coefficient: -0.04616333113146155
Feature: Kdp_5x5_90th, Coefficient: -0.09896850184761433


In [34]:
df.corr()

Unnamed: 0,minutes_past,radardist_km,Ref,RhoHV,RhoHV_5x5_10th,RhoHV_5x5_50th,RhoHV_5x5_90th,Zdr,Zdr_5x5_10th,Zdr_5x5_50th,Zdr_5x5_90th,Kdp,Kdp_5x5_10th,Kdp_5x5_50th,Kdp_5x5_90th,Expected
minutes_past,1.0,-0.003004,0.006341,-0.002244,-0.001666,-0.003773,-0.004466,0.001558,0.000498,0.003021,0.003234,-0.001472,-0.000809,0.000447,0.001199,0.002657
radardist_km,-0.003004,1.0,-0.010118,0.078661,0.032685,0.117436,0.425581,-0.174553,-0.411562,-0.356342,-0.006422,-0.002495,-0.22734,-0.010566,0.231098,0.026009
Ref,0.006341,-0.010118,1.0,-0.076982,-0.055419,-0.166122,-0.209999,0.088927,0.116609,0.170736,0.106174,0.006773,0.011111,0.017545,0.019825,0.004815
RhoHV,-0.002244,0.078661,-0.076982,1.0,0.517925,0.545843,0.330661,-0.048613,0.134373,-0.125606,-0.278093,-0.003305,0.195792,-0.004197,-0.239056,0.000804
RhoHV_5x5_10th,-0.001666,0.032685,-0.055419,0.517925,1.0,0.67792,0.247093,-0.059533,0.386276,-0.130338,-0.503752,0.002348,0.453323,0.014513,-0.484278,0.001579
RhoHV_5x5_50th,-0.003773,0.117436,-0.166122,0.545843,0.67792,1.0,0.649869,-0.110738,0.114063,-0.221763,-0.3917,-0.004232,0.241462,-0.013945,-0.316323,0.001535
RhoHV_5x5_90th,-0.004466,0.425581,-0.209999,0.330661,0.247093,0.649869,1.0,-0.142003,-0.291233,-0.281736,-0.065192,-0.007038,-0.162593,-0.036899,0.083574,0.011728
Zdr,0.001558,-0.174553,0.088927,-0.048613,-0.059533,-0.110738,-0.142003,1.0,0.349669,0.512133,0.349557,0.007576,-0.011379,0.015531,0.029501,-0.001972
Zdr_5x5_10th,0.000498,-0.411562,0.116609,0.134373,0.386276,0.114063,-0.291233,0.349669,1.0,0.604495,-0.004872,0.009556,0.424009,0.038965,-0.406572,-0.007458
Zdr_5x5_50th,0.003021,-0.356342,0.170736,-0.125606,-0.130338,-0.221763,-0.281736,0.512133,0.604495,1.0,0.599295,0.012165,-0.00863,0.029915,0.045799,-0.004695
