# 総合添削問題

総合添削問題ではコンクリートのデータを用いて複雑な回帰分析に挑戦していただきます。

#### 問題

- コンクリートの圧縮強度について回帰分析を行いたいデータが渡されます。
- 回帰分析を行い、最もよい決定係数を算出したモデルとその値を出力してください。
- また、concrete_train_X, concrete_test_Xに関しては処理を加えても構いません。

In [224]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import model_selection
# 必要なモジュールを追記してください。
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

import time 

def plot_data(concrete_data):
    """Draw one scatter plot per explanatory column against the target.

    Every feature column listed below is plotted on the x axis with the
    compressive-strength column on the y axis, using green markers.

    Args:
        concrete_data: Object exposing ``plot.scatter(x=..., y=..., c=...)``
            and containing the columns named below (presumably the concrete
            DataFrame loaded by this script).
    """
    strength_column = 'Concrete compressive strength(MPa, megapascals) '
    # Column names are reproduced verbatim, including their irregular spacing.
    feature_columns = (
        'Cement (component 1)(kg in a m^3 mixture)',
        'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
        'Fly Ash (component 3)(kg in a m^3 mixture)',
        'Water  (component 4)(kg in a m^3 mixture)',
        'Superplasticizer (component 5)(kg in a m^3 mixture)',
        'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
        'Fine Aggregate (component 7)(kg in a m^3 mixture)',
        'Age (day)',
    )

    for feature in feature_columns:
        concrete_data.plot.scatter(x=feature, y=strength_column, c='Green')

def save_predict_data(model_name, test_data, answer_correct, answer_predict):
    """Write the test features with true and predicted strength to an Excel file.

    Args:
        model_name: Label inserted into the output file name.
        test_data: Test-set feature table; it is copied, not modified.
        answer_correct: True target values; its index is reused to align
            the predictions with the rows of ``test_data``.
        answer_predict: Predicted target values, in the same order as
            ``answer_correct``.

    Returns:
        The path of the written workbook, ``Concrete_Data_<model_name>.xlsx``,
        relative to the current working directory.
    """
    data = test_data.copy()
    data['Strength(MPa) '] = answer_correct
    # Re-index the raw predictions so they line up with the shuffled test rows.
    data['Strength(MPa)  - Predict'] = pd.Series(answer_predict, index=answer_correct.index)

    predict_data_path = 'Concrete_Data_%s.xlsx' % (model_name,)
    # ExcelWriter.save() no longer exists in pandas 2.x; the context manager
    # saves and closes the workbook, also when to_excel raises.
    with pd.ExcelWriter(predict_data_path) as writer:
        data.to_excel(writer, sheet_name='Sheet1')
    return predict_data_path

def make_analyze_response(model_name, concrete_test_X, concrete_test_y, result, score):
    """Persist the predictions, report the score and build a result record.

    Args:
        model_name: Label of the evaluated model.
        concrete_test_X: Test-set features handed to ``save_predict_data``.
        concrete_test_y: True test-set targets.
        result: Predicted targets for ``concrete_test_X``.
        score: Score obtained by the model on the test set.

    Returns:
        A dict with the keys ``"model"``, ``"score"`` and
        ``"predict_data_path"``.
    """
    saved_path = save_predict_data(model_name, concrete_test_X, concrete_test_y, result)
    summary = {
        "model": model_name,
        "score": score,
        "predict_data_path": saved_path,
    }
    print("[%s] score: %s, predict: %s" % (summary["model"], summary["score"], summary["predict_data_path"]))
    return summary

def analyze_Lasso(concrete_train_X, concrete_test_X, concrete_train_y, concrete_test_y):
    """Fit a default ``Lasso`` model and evaluate it on the test split.

    Args:
        concrete_train_X: Training features.
        concrete_test_X: Test features.
        concrete_train_y: Training targets.
        concrete_test_y: Test targets.

    Returns:
        The result record produced by ``make_analyze_response`` for the
        label ``"Lasso"``.
    """
    regressor = Lasso()
    regressor.fit(concrete_train_X, concrete_train_y)

    predicted = regressor.predict(concrete_test_X)
    test_score = regressor.score(concrete_test_X, concrete_test_y)

    return make_analyze_response(
        "Lasso", concrete_test_X, concrete_test_y, predicted, test_score)

def analyze_Ridge(concrete_train_X, concrete_test_X, concrete_train_y, concrete_test_y):
    """Fit a default ``Ridge`` model and evaluate it on the test split.

    Args:
        concrete_train_X: Training features.
        concrete_test_X: Test features.
        concrete_train_y: Training targets.
        concrete_test_y: Test targets.

    Returns:
        The result record produced by ``make_analyze_response`` for the
        label ``"Ridge"``.
    """
    regressor = Ridge()
    regressor.fit(concrete_train_X, concrete_train_y)

    predicted = regressor.predict(concrete_test_X)
    test_score = regressor.score(concrete_test_X, concrete_test_y)

    return make_analyze_response(
        "Ridge", concrete_test_X, concrete_test_y, predicted, test_score)

def analyze_LinearRegression(concrete_train_X, concrete_test_X, concrete_train_y, concrete_test_y):
    """Fit an ordinary least squares model and evaluate it on the test split.

    Args:
        concrete_train_X: Training features.
        concrete_test_X: Test features.
        concrete_train_y: Training targets.
        concrete_test_y: Test targets.

    Returns:
        The result record produced by ``make_analyze_response`` for the
        label ``"LinearRegression"``.
    """
    # The ``normalize`` argument was removed in scikit-learn 1.2 and raises a
    # TypeError there. Unregularised least squares gives the same predictions
    # regardless of feature scaling, so it is simply dropped.
    model = LinearRegression()

    model.fit(concrete_train_X, concrete_train_y)

    predict = model.predict(concrete_test_X)
    score = model.score(concrete_test_X, concrete_test_y)

    return make_analyze_response("LinearRegression", concrete_test_X, concrete_test_y, predict, score)

def analyze_ElasticNet(l1_ratio, concrete_train_X, concrete_test_X, concrete_train_y, concrete_test_y):
    """Fit an ``ElasticNet`` model with the given L1 ratio and evaluate it.

    Args:
        l1_ratio: Value passed to ``ElasticNet(l1_ratio=...)``; it is also
            embedded in the model label and therefore in the output file name.
        concrete_train_X: Training features.
        concrete_test_X: Test features.
        concrete_train_y: Training targets.
        concrete_test_y: Test targets.

    Returns:
        The result record produced by ``make_analyze_response`` for the
        label ``"ElasticNet L1_RATIO_<l1_ratio>"``.
    """
    label = "ElasticNet L1_RATIO_%s" % (l1_ratio)

    regressor = ElasticNet(l1_ratio=l1_ratio)
    regressor.fit(concrete_train_X, concrete_train_y)

    predicted = regressor.predict(concrete_test_X)
    test_score = regressor.score(concrete_test_X, concrete_test_y)

    return make_analyze_response(label, concrete_test_X, concrete_test_y, predicted, test_score)


def cross_validate_LinearRegression(X, y):
    """Print the 10-fold cross-validation scores of ordinary least squares.

    Prints the array of per-fold scores followed by their mean.

    Args:
        X: Feature table for the whole data set.
        y: Target values for the whole data set.
    """
    # The ``normalize`` argument was removed in scikit-learn 1.2 and raises a
    # TypeError there. Unregularised least squares gives the same predictions
    # regardless of feature scaling, so it is simply dropped.
    test_model = LinearRegression()
    scores = model_selection.cross_val_score(test_model, X, y, cv=10)
    print(scores)
    print("平均スコア :", scores.mean())

def cross_validate_Lasso(X, y):
    """Print the 10-fold cross-validation scores of a default ``Lasso`` model.

    Prints the array of per-fold scores followed by their mean.

    Args:
        X: Feature table for the whole data set.
        y: Target values for the whole data set.
    """
    fold_scores = model_selection.cross_val_score(Lasso(), X, y, cv=10)
    print(fold_scores)
    mean_score = fold_scores.mean()
    print("平均スコア :", mean_score)


# Load the data set
concrete_data = pd.read_excel("https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls")

# Data visualisation
# plot_data(concrete_data)

X = concrete_data.drop(['Concrete compressive strength(MPa, megapascals) '], axis=1)
y = concrete_data['Concrete compressive strength(MPa, megapascals) ']
# NOTE(review): the split is seeded with the current time, so every run
# produces a different train/test partition and different scores.
concrete_train_X, concrete_test_X, concrete_train_y, concrete_test_y = train_test_split(
    X, y, random_state=int(time.time()))

# Cross-validation
cross_validate_LinearRegression(X, y)
cross_validate_Lasso(X, y)

# Write your code below.

split_data = (concrete_train_X, concrete_test_X, concrete_train_y, concrete_test_y)

## Analysis - multiple linear regression and its regularised variants
results = [
    analyze_LinearRegression(*split_data),
    analyze_Lasso(*split_data),
    analyze_Ridge(*split_data),
]
# ElasticNet: Ridge + Lasso mixed at several L1 ratios
results.extend(
    analyze_ElasticNet(ratio, *split_data) for ratio in (0.1, 0.3, 0.5, 0.7, 0.9))

# Print the best result (highest test score)
df = pd.DataFrame.from_records(results, columns=["score", "model", "predict_data_path"])
best_index = df['score'].idxmax()
best = df.loc[best_index]
print("一番いい結果: %s" % (best,))

[-1.23720016  0.5274709   0.35247578  0.32762641  0.3125442   0.22064086
  0.48962716  0.65231292  0.57869199  0.55656575]
平均スコア : 0.27807558197878
[-1.19294362  0.505827    0.33808585  0.32456037  0.31500926  0.20544051
  0.48126547  0.65251845  0.58485267  0.55869605]
平均スコア : 0.2773312008493888
[LinearRegression] score: 0.6536634398308467, predict: Concrete_Data_LinearRegression.xlsx
[Lasso] score: 0.653842055545407, predict: Concrete_Data_Lasso.xlsx
[Ridge] score: 0.6536637307569726, predict: Concrete_Data_Ridge.xlsx
[ElasticNet L1_RATIO_0.1] score: 0.6538543368787346, predict: Concrete_Data_ElasticNet L1_RATIO_0.1.xlsx
[ElasticNet L1_RATIO_0.3] score: 0.6538674731928495, predict: Concrete_Data_ElasticNet L1_RATIO_0.3.xlsx
[ElasticNet L1_RATIO_0.5] score: 0.6538722484736974, predict: Concrete_Data_ElasticNet L1_RATIO_0.5.xlsx
[ElasticNet L1_RATIO_0.7] score: 0.6538679324162372, predict: Concrete_Data_ElasticNet L1_RATIO_0.7.xlsx
[ElasticNet L1_RATIO_0.9] score: 0.6538533746484565, predict: Concrete_Data_ElasticNet L1_RATIO_0.9.xlsx

#### ヒント

- 必要なモジュールは追記してください。
- データの処理に関しては行わなくても問題ありません。

##  解答例

添削課題は以下のアドレスからご提出いただきますようお願いします。<br>

https://goo.gl/forms/fW7CAspZMwHuWuqk2<br><br>
以下のアドレスからアンケートにご協力頂きたく存じます。<br>
ご回答のほど、よろしくお願いいたします。

https://goo.gl/forms/WHjJQYeodIndRvyz2