<a href="https://colab.research.google.com/github/DBM1001/DSC2024/blob/main/DSC2024_ipynb_11_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read later live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages this notebook uses into the current runtime.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
!pip install japanize_matplotlib
!pip install optuna
!pip install scikit-learn
!pip install nltk
!pip install spacy
# Download the spaCy model en_core_web_sm.
!python -m spacy download en_core_web_sm

Collecting japanize_matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/4.1 MB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m2.3/4.1 MB[0m [31m34.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: japanize_matplotlib
  Building wheel for japanize_matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize_matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120257 sha256=91f54da76188550ca813fa12fbfa5d65c2ea03d8b5fcb3bfd98b9fcefab675b3
  Stored in directory: /root/.cache/pi

In [None]:
import pandas as pd               # データを表のように扱うライブラリ
import numpy as np                # 数値計算を速くするライブラリ
import seaborn as sns             # きれいなグラフを簡単に作るライブラリ
import matplotlib.pyplot as plt   # グラフを作る基本的なライブラリ
%matplotlib inline
import japanize_matplotlib        # グラフに日本語を表示するライブラリ
japanize_matplotlib.japanize()

from sklearn.model_selection import train_test_split  # データを訓練用と検証用に分ける
from sklearn.metrics import mean_squared_error # 評価の計算を行うライブラリ

import lightgbm as lgb # 予測モデルに関するライブラリ

import warnings
warnings.simplefilter('ignore')  # 不要な警告を表示しない

In [None]:
# Dataset used to train the prediction model (the first CSV column becomes the index)
train = pd.read_csv('/content/drive/MyDrive/data/train.csv', index_col=0)

# Dataset the model runs inference (prediction) on
test = pd.read_csv('/content/drive/MyDrive/data/test.csv', index_col=0)

In [None]:
#①文字数
def add_review_length_features(train_df, test_df):
    """Attach character-count columns for both review texts to each frame.

    ``Positive_Review_Length`` and ``Negative_Review_Length`` are written onto
    ``train_df`` and ``test_df`` in place, each holding ``Series.str.len()``
    of the corresponding review column.

    Args:
        train_df: frame with ``Positive_Review`` and ``Negative_Review`` columns.
        test_df: frame with the same two columns.

    Returns:
        The same two frame objects as a ``(train_df, test_df)`` tuple.
    """
    length_columns = {
        'Positive_Review': 'Positive_Review_Length',
        'Negative_Review': 'Negative_Review_Length',
    }
    for review_column, length_column in length_columns.items():
        for frame in (train_df, test_df):
            frame[length_column] = frame[review_column].str.len()

    return train_df, test_df

# Add the length features to both frames
train, test = add_review_length_features(train, test)

# Check the result
print(train[['Positive_Review_Length', 'Negative_Review_Length']].head())
print(test[['Positive_Review_Length', 'Negative_Review_Length']].head())

   Positive_Review_Length  Negative_Review_Length
0                     105                       8
1                     311                       4
2                      17                      27
3                      57                      19
4                      43                      23
        Positive_Review_Length  Negative_Review_Length
283366                      10                      19
283367                      63                      88
283368                      61                       9
283369                     110                      37
283370                     104                      53


In [None]:
#①-2 Flip Negative_Review_Length (reverse its inverse correlation): longer
# negative reviews receive smaller values.
# The reference maximum comes from the training data only and is reused for the
# test data, so a given review length maps to the same feature value in both
# frames. Using each frame's own maximum would shift the test feature relative
# to the values the model was trained on.
negative_length_max = train['Negative_Review_Length'].max()
train['Negative_Review_Length_reversed'] = negative_length_max - train['Negative_Review_Length']
test['Negative_Review_Length_reversed'] = negative_length_max - test['Negative_Review_Length']

# The raw length is no longer needed; rebind instead of dropping in place.
train = train.drop(columns=['Negative_Review_Length'])
test = test.drop(columns=['Negative_Review_Length'])

In [None]:
#② Positive_ReviewのWord2Vec特徴量
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

def preprocess_text(text):
    """Lower-case ``text`` and split it on whitespace into a list of tokens."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Pre-process the Positive_Review column.
# The filled Series is assigned back instead of calling fillna(..., inplace=True)
# on train['Positive_Review']: that in-place call acts on an intermediate object,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write, so NaN reviews would then crash preprocess_text.
train['Positive_Review'] = train['Positive_Review'].fillna("")
test['Positive_Review'] = test['Positive_Review'].fillna("")

# Tokenize Positive_Review
train_reviews = train['Positive_Review'].apply(preprocess_text)
test_reviews = test['Positive_Review'].apply(preprocess_text)

# Train the Word2Vec model
all_reviews = train_reviews.tolist() + test_reviews.tolist()  # combine the training and test reviews
model = Word2Vec(sentences=all_reviews, vector_size=600, window=5, min_count=2, workers=4, epochs=10)

# レビューごとの特徴量を作成（単語ベクトルの平均）
def get_review_vector(review, model):
    """Average the embeddings of the in-vocabulary tokens of one review.

    Args:
        review: iterable of tokens.
        model: object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        Element-wise mean of the looked-up vectors, or a zero vector of
        length ``model.vector_size`` when no token is found in ``model.wv``.
    """
    embeddings = model.wv
    known = []
    for token in review:
        if token in embeddings:
            known.append(embeddings[token])

    if not known:
        # No token has an embedding: fall back to an all-zero vector
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Per-review features for the training data
train_vectors = np.array([get_review_vector(review, model) for review in train_reviews])

# Per-review features for the test data
test_vectors = np.array([get_review_vector(review, model) for review in test_reviews])

# Convert the results to DataFrames.
# The column names follow model.vector_size rather than a hard-coded 600, so
# changing the embedding size cannot cause a column-count mismatch here.
word2vec_columns = [f'word2vec_{i}' for i in range(model.vector_size)]
train_doc2vec = pd.DataFrame(train_vectors, columns=word2vec_columns)
test_doc2vec = pd.DataFrame(test_vectors, columns=word2vec_columns)

# Join onto the original frames. Both sides get a fresh 0..n-1 index so the rows
# line up by position; the index read from the CSV is discarded here.
train = pd.concat([train.reset_index(drop=True), train_doc2vec], axis=1)
test = pd.concat([test.reset_index(drop=True), test_doc2vec], axis=1)

print("Word2Vec特徴量を追加しました。")
print(f"trainデータの形状: {train.shape}")
print(f"testデータの形状: {test.shape}")


Word2Vec特徴量を追加しました。
trainデータの形状: (283366, 616)
testデータの形状: (231845, 615)


In [None]:
#②−２ Negative_ReviewのWord2Vec特徴量
def preprocess_text(text):
    """Return the whitespace-separated tokens of ``text`` after lower-casing it."""
    words = text.lower().split()
    return words

# Pre-process the Negative_Review column.
# The filled Series is assigned back instead of calling fillna(..., inplace=True)
# on train['Negative_Review']: that in-place call acts on an intermediate object,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write, so NaN reviews would then crash preprocess_text.
train['Negative_Review'] = train['Negative_Review'].fillna("")
test['Negative_Review'] = test['Negative_Review'].fillna("")

# Tokenize Negative_Review
train_reviews = train['Negative_Review'].apply(preprocess_text)
test_reviews = test['Negative_Review'].apply(preprocess_text)

# Train the Word2Vec model
all_reviews = train_reviews.tolist() + test_reviews.tolist()  # combine the training and test reviews
model = Word2Vec(sentences=all_reviews, vector_size=600, window=5, min_count=2, workers=4, epochs=10)

# レビューごとの特徴量を作成（単語ベクトルの平均）
def get_review_vector(review, model):
    """Return the mean embedding of the tokens of ``review`` known to ``model``.

    Tokens absent from ``model.wv`` are skipped. When none of the tokens is
    present, a zero vector with ``model.vector_size`` entries is returned.
    """
    found = []
    for token in review:
        if token in model.wv:
            found.append(model.wv[token])

    # An empty list means there is nothing to average
    return np.mean(found, axis=0) if found else np.zeros(model.vector_size)

# Per-review features for the training data
train_vectors = np.array([get_review_vector(review, model) for review in train_reviews])

# Per-review features for the test data
test_vectors = np.array([get_review_vector(review, model) for review in test_reviews])

# Convert the results to DataFrames.
# The column names follow model.vector_size rather than a hard-coded 600, so
# changing the embedding size cannot cause a column-count mismatch here.
# The '+1' in each name is a literal suffix (not arithmetic); it keeps these
# names distinct from the 'word2vec_{i}' columns built from Positive_Review.
word2vec_columns = [f'word2vec_{i}+1' for i in range(model.vector_size)]
train_doc2vec = pd.DataFrame(train_vectors, columns=word2vec_columns)
test_doc2vec = pd.DataFrame(test_vectors, columns=word2vec_columns)

# Join onto the original frames; both sides use a fresh 0..n-1 index so the rows
# line up by position.
train = pd.concat([train.reset_index(drop=True), train_doc2vec], axis=1)
test = pd.concat([test.reset_index(drop=True), test_doc2vec], axis=1)

print("Word2Vec特徴量を追加しました。")
print(f"trainデータの形状: {train.shape}")
print(f"testデータの形状: {test.shape}")


Word2Vec特徴量を追加しました。
trainデータの形状: (283366, 1216)
testデータの形状: (231845, 1215)


In [None]:
#③頻出単語
from sklearn.feature_extraction.text import CountVectorizer

# Positive_Review と Negative_Review 列から頻出単語を抽出する関数
def extract_frequent_words(df, review_col, top_n=1000):
    """Return the most frequent non-stop-word tokens of a review column.

    Args:
        df: frame containing ``review_col``.
        review_col: name of the text column; missing values are counted as ''.
        top_n: vocabulary size given to ``CountVectorizer(max_features=...)``
            and the maximum number of words returned.

    Returns:
        List of words ordered by total count, most frequent first.
    """
    # Count words with CountVectorizer
    vectorizer = CountVectorizer(stop_words='english', max_features=top_n)
    word_counts = vectorizer.fit_transform(df[review_col].fillna(''))

    # Sum each vocabulary column directly on the sparse matrix. Converting the
    # whole document-term matrix to a dense array first would allocate
    # rows x top_n cells only to compute these totals.
    totals = pd.Series(
        np.asarray(word_counts.sum(axis=0)).ravel(),
        index=vectorizer.get_feature_names_out(),
    )

    # Take the top_n words by total count
    top_words = totals.sort_values(ascending=False).head(top_n)
    return top_words.index.tolist()

# Extract the frequent words of Positive_Review and Negative_Review from the training data
top_positive_words = extract_frequent_words(train, 'Positive_Review', top_n=1000)
top_negative_words = extract_frequent_words(train, 'Negative_Review', top_n=1000)


# 上記の単語に基づいて特徴量を追加する関数
def add_frequent_word_features(df, positive_words, negative_words):
    """Add per-word occurrence features for the given frequent words.

    For each word in ``positive_words`` a ``Positive_contains_<word>`` column
    holds how many times the word occurs in the lower-cased
    ``Positive_Review``. For each word in ``negative_words`` a
    ``Negative_contains_<word>`` column holds ``1 / (1 + count)`` computed on
    the lower-cased ``Negative_Review``.

    Words are matched as whole words and treated literally, so 'bed' does not
    count 'bedroom' and regex metacharacters in a word cannot change the match.

    Args:
        df: frame with ``Positive_Review`` and ``Negative_Review`` text columns.
        positive_words: iterable of words to count in ``Positive_Review``.
        negative_words: iterable of words to count in ``Negative_Review``.

    Returns:
        ``df`` itself, with the new columns added in place.
    """
    import re  # local import: only this helper needs it

    # Lower-case each text column once instead of once per word
    positive_text = df['Positive_Review'].str.lower()
    negative_text = df['Negative_Review'].str.lower()

    # Occurrence count of each frequent positive word
    for word in positive_words:
        pattern = rf'\b{re.escape(word)}\b'
        df[f'Positive_contains_{word}'] = positive_text.str.count(pattern)

    # Reciprocal of (1 + occurrence count) of each frequent negative word
    for word in negative_words:
        pattern = rf'\b{re.escape(word)}\b'
        df[f'Negative_contains_{word}'] = 1 / (1 + negative_text.str.count(pattern))

    return df

# Add the frequent-word features to both the train and test data
train = add_frequent_word_features(train, top_positive_words, top_negative_words)
test = add_frequent_word_features(test, top_positive_words, top_negative_words)


In [None]:
#④感情分析
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the nltk lexicon data (only needed the first time)
nltk.download('vader_lexicon')

# Instantiate the VADER sentiment analysis tool
sia = SentimentIntensityAnalyzer()

# 感情スコアの追加処理を行う関数
def add_sentiment_scores(df):
    """Add compound sentiment scores for both review columns.

    Each review is passed through ``str()`` and scored with the module-level
    ``sia`` analyzer; the ``'compound'`` entry of the result is stored in
    ``Positive_Review_Sentiment`` / ``Negative_Review_Sentiment``.

    Returns:
        ``df`` itself, with the two columns added in place.
    """
    def _compound(text):
        return sia.polarity_scores(str(text))['compound']

    score_columns = (
        ('Positive_Review', 'Positive_Review_Sentiment'),
        ('Negative_Review', 'Negative_Review_Sentiment'),
    )
    for source_column, score_column in score_columns:
        df[score_column] = df[source_column].apply(_compound)

    return df

# Add the sentiment scores to both frames
train = add_sentiment_scores(train)
test = add_sentiment_scores(test)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
#⑤N-gram
from sklearn.feature_extraction.text import CountVectorizer

def add_ngram_features(train, test, ngram_range=(2, 3), top_n=200):
    """Add count features for the most frequent Positive_Review n-grams.

    The n-gram vocabulary is learned from ``train`` only. For each of the
    ``top_n`` phrases with the highest total count in ``train``, a column
    ``Positive_Review_<phrase>_count`` with the per-review count is added to
    both frames in place.

    Args:
        train: training frame with a ``Positive_Review`` column.
        test: test frame with a ``Positive_Review`` column.
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
        top_n: number of phrases turned into features.

    Returns:
        The same two frames as a ``(train, test)`` tuple.
    """
    # CountVectorizer that counts the n-grams
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')

    for col in ['Positive_Review']:
        # Missing reviews are counted as empty text; NaN is not a valid document.
        train_text = train[col].fillna('')
        test_text = test[col].fillna('')

        # Learn the vocabulary on the training data and count both frames.
        # CSC layout makes the per-phrase column extraction below cheap.
        train_ngram = vectorizer.fit_transform(train_text).tocsc()
        test_ngram = vectorizer.transform(test_text).tocsc()

        # Rank phrases by their total count in the training data
        feature_names = vectorizer.get_feature_names_out()
        freqs = np.asarray(train_ngram.sum(axis=0)).ravel()
        top_phrases_idx = freqs.argsort()[::-1][:top_n]

        # Add the count of each top phrase as a feature. feature_names is
        # ordered by vocabulary index, so idx addresses the matrix column directly.
        for idx in top_phrases_idx:
            phrase = feature_names[idx]
            train[f'{col}_{phrase}_count'] = train_ngram[:, idx].toarray().ravel()
            test[f'{col}_{phrase}_count'] = test_ngram[:, idx].toarray().ravel()

    return train, test

# Add the n-gram features to the train and test data
train, test = add_ngram_features(train, test, ngram_range=(2, 3), top_n=200)



In [None]:
#⑤-2

def add_inverse_ngram_features(train, test, ngram_range=(2, 3), top_n=200):
    """Add reciprocal-count features for frequent Negative_Review n-grams.

    The n-gram vocabulary is learned from ``train`` only. For each of the
    ``top_n`` phrases with the highest total count in ``train``, a column
    ``Negative_Review_<phrase>_inverse_count`` holding ``1 / (1 + count)`` is
    added to both frames in place. ``Negative_Review_ngram_total_inverse``
    holds ``1 / (1 + total)`` where total is the review's count summed over
    the whole vocabulary.

    Args:
        train: training frame with a ``Negative_Review`` column.
        test: test frame with a ``Negative_Review`` column.
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
        top_n: number of phrases turned into features.

    Returns:
        The same two frames as a ``(train, test)`` tuple.
    """
    # CountVectorizer that counts the n-grams
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')

    for col in ['Negative_Review']:
        # Missing reviews are counted as empty text; NaN is not a valid document.
        train_text = train[col].fillna('')
        test_text = test[col].fillna('')

        # Learn the vocabulary on the training data and count both frames.
        # CSC layout makes the per-phrase column extraction below cheap.
        train_ngram = vectorizer.fit_transform(train_text).tocsc()
        test_ngram = vectorizer.transform(test_text).tocsc()

        # Rank phrases by their total count in the training data
        feature_names = vectorizer.get_feature_names_out()
        freqs = np.asarray(train_ngram.sum(axis=0)).ravel()
        top_phrases_idx = freqs.argsort()[::-1][:top_n]

        # Reciprocal of (1 + count) for each top phrase
        for idx in top_phrases_idx:
            phrase = feature_names[idx]
            feature = f'{col}_{phrase}_inverse_count'
            train[feature] = 1 / (1 + train_ngram[:, idx].toarray().ravel())
            test[feature] = 1 / (1 + test_ngram[:, idx].toarray().ravel())

        # Reciprocal of (1 + total n-gram count of the review)
        total_feature = f'{col}_ngram_total_inverse'
        train[total_feature] = 1 / (1 + np.asarray(train_ngram.sum(axis=1)).ravel())
        test[total_feature] = 1 / (1 + np.asarray(test_ngram.sum(axis=1)).ravel())

    return train, test

# Run: add the reciprocal n-gram features to the train and test data
train, test = add_inverse_ngram_features(train, test, ngram_range=(2, 3), top_n=200)



In [None]:
#⑥TF-IDFスコア　Positive_Review
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Fill missing Positive_Review values with the empty string
train['Positive_Review'] = train['Positive_Review'].fillna('')
test['Positive_Review'] = test['Positive_Review'].fillna('')

# TF-IDF configuration
tfidf = TfidfVectorizer(max_features=200, stop_words='english')  # use the top 200 terms
tfidf_matrix_train = tfidf.fit_transform(train['Positive_Review'])
tfidf_matrix_test = tfidf.transform(test['Positive_Review'])

# Sum of the TF-IDF scores of each review as a new feature.
# The row sums are taken on the sparse matrices, avoiding a dense rows x 200
# copy of each one. They are assigned as plain arrays, so values are placed by
# row position and do not depend on the frames' index labels.
train['Positive_TFIDF_Sum'] = np.asarray(tfidf_matrix_train.sum(axis=1)).ravel()
test['Positive_TFIDF_Sum'] = np.asarray(tfidf_matrix_test.sum(axis=1)).ravel()

# Debug: inspect the new feature
print(train[['Positive_TFIDF_Sum']].head())
print(test[['Positive_TFIDF_Sum']].head())


   Positive_TFIDF_Sum
0            1.940665
1            3.205895
2            1.000000
3            2.299748
4            2.168821
   Positive_TFIDF_Sum
0            1.000000
1            1.896997
2            1.861547
3            2.511879
4            3.170238


In [None]:
#⑥−２　TF-IDFスコア　Negative_Review
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Fill missing Negative_Review values with the empty string
train['Negative_Review'] = train['Negative_Review'].fillna('')
test['Negative_Review'] = test['Negative_Review'].fillna('')

# TF-IDF configuration
tfidf = TfidfVectorizer(max_features=200, stop_words='english')  # use the top 200 terms
tfidf_matrix_train = tfidf.fit_transform(train['Negative_Review'])
tfidf_matrix_test = tfidf.transform(test['Negative_Review'])

# Sum of the TF-IDF scores of each review.
# The row sums are taken on the sparse matrices, avoiding a dense rows x 200
# copy of each one. They are assigned as plain arrays, so values are placed by
# row position and do not depend on the frames' index labels.
train['Negative_TFIDF_Sum'] = np.asarray(tfidf_matrix_train.sum(axis=1)).ravel()
test['Negative_TFIDF_Sum'] = np.asarray(tfidf_matrix_test.sum(axis=1)).ravel()

# Add a tiny value (epsilon) before taking the reciprocal to avoid dividing by 0.
# Reviews whose sum is 0 therefore end up at about 1e9.
epsilon = 1e-9  # tiny value
train['Negative_TFIDF_Sum'] = 1 / (train['Negative_TFIDF_Sum'] + epsilon)
test['Negative_TFIDF_Sum'] = 1 / (test['Negative_TFIDF_Sum'] + epsilon)

# Debug: inspect the reciprocal feature
print(train[['Negative_TFIDF_Sum']].head())
print(test[['Negative_TFIDF_Sum']].head())



   Negative_TFIDF_Sum
0        1.000000e+09
1        1.000000e+09
2        1.000000e+00
3        1.000000e+00
4        7.087195e-01
   Negative_TFIDF_Sum
0        1.000000e+00
1        4.156796e-01
2        1.000000e+09
3        7.071068e-01
4        5.256665e-01


In [None]:
# Substrings of Negative_Review that get a 0/1 indicator column each
keywords = ['Nothing', 'Negative','No','perfect']

# contains_<keyword> is 1 when the case-sensitive substring occurs in the review, else 0
for keyword in keywords:
    indicator_column = f'contains_{keyword}'
    for frame in (train, test):
        frame[indicator_column] = frame['Negative_Review'].apply(
            lambda text, needle=keyword: 1 if needle in text else 0
        )


In [None]:
# Positive_Review: 0 when the text is missing or contains one of the
# "nothing positive" markers below, 1 otherwise.
# Note that the column is named 'disappointed' although 1 marks the reviews
# that do NOT match any marker.
_NO_PRAISE_MARKERS = ('No Positive', 'Nothing', 'nothing', 'Not much')


def _positive_review_flag(text):
    """Return 0 for a missing review or one containing a marker, else 1."""
    if pd.isna(text):
        return 0
    if any(marker in text for marker in _NO_PRAISE_MARKERS):
        return 0
    return 1


train['disappointed'] = train['Positive_Review'].apply(_positive_review_flag)

# Apply the same processing to the inference dataset
test['disappointed'] = test['Positive_Review'].apply(_positive_review_flag)


In [None]:
#住所から緯度経度の欠損値を埋める
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder


# Pre-processing: convert Hotel_Address to integer codes
def encode_address(test, encoder=None):
    """Add an integer-coded ``Hotel_Address_encoded`` column.

    Args:
        test: frame with a ``Hotel_Address`` column. Either the train or the
            test frame may be passed; the parameter name is kept from the
            definition that was previously in effect.
        encoder: optional, already fitted ``LabelEncoder``. When given it is
            only applied, so several frames share one address -> code
            mapping. When omitted a new encoder is fitted on this frame alone.

    Returns:
        The same frame, with the column added in place.
    """
    if encoder is None:
        test['Hotel_Address_encoded'] = LabelEncoder().fit_transform(test['Hotel_Address'])
    else:
        test['Hotel_Address_encoded'] = encoder.transform(test['Hotel_Address'])
    return test


def _impute_from_address_code(frame, target):
    """Fill missing ``target`` values in place from a regression on the address code."""
    missing = frame[target].isna()
    known = frame.loc[~missing]

    # Nothing to fill, or no known rows to learn from
    if not missing.any() or known.empty:
        return

    # NOTE(review): the address codes are arbitrary labels, not geography, so
    # this linear fit is presumably only a rough placeholder - confirm whether
    # a lookup-based fill would be preferable.
    regressor = LinearRegression()
    regressor.fit(known[['Hotel_Address_encoded']], known[target])
    frame.loc[missing, target] = regressor.predict(frame.loc[missing, ['Hotel_Address_encoded']])


# Fill missing latitude and longitude values
def impute_lat_lng(test, encoder=None):
    """Encode the address and fill missing ``lat`` / ``lng`` values.

    For each of ``lat`` and ``lng`` a ``LinearRegression`` is fitted on the
    rows of this frame where the value is known, using
    ``Hotel_Address_encoded`` as the only predictor, and its predictions
    replace the missing values.

    Args:
        test: frame with ``Hotel_Address``, ``lat`` and ``lng`` columns
            (train or test; parameter name kept for compatibility).
        encoder: optional fitted ``LabelEncoder`` forwarded to
            ``encode_address``.

    Returns:
        The same frame, modified in place.
    """
    test = encode_address(test, encoder)
    for target in ('lat', 'lng'):
        _impute_from_address_code(test, target)
    return test


# Run the imputation on both frames.
# A single encoder fitted on the addresses of both frames gives every address
# the same Hotel_Address_encoded value in train and in test.
address_encoder = LabelEncoder().fit(
    pd.concat([train['Hotel_Address'], test['Hotel_Address']], ignore_index=True)
)
train = impute_lat_lng(train, address_encoder)
test = impute_lat_lng(test, address_encoder)

# Check the imputation result
print(train[['Hotel_Address', 'lat', 'lng']].head())
print(test[['Hotel_Address', 'lat', 'lng']].head())



                                       Hotel_Address        lat       lng
0  18 Albert Embankment Lambeth London SE1 7TJ Un...  51.491374 -0.121419
1  12 Boulevard Haussmann 9th arr 75009 Paris France  48.872459  2.337800
2   Lakeside Way Brent London HA9 0BU United Kingdom  51.557696 -0.283526
3  Herengracht 519 525 Amsterdam City Center 1017...  52.365387  4.893198
4    5 rue du 8 Mai 1945 10th arr 75010 Paris France  48.875898  2.359050
                                       Hotel_Address        lat        lng
0  34 Norfolk Place Paddington Westminster Boroug...  51.516288  -0.172108
1  372 Strand Westminster Borough London WC2R 0JJ...  51.511099  -0.120867
2      Schlossallee 8 14 Penzing 1140 Vienna Austria  48.191339  16.316587
3  52 56 Inverness Terrace Westminster Borough Lo...  51.512397  -0.186124
4   Via Morigi 2 Milan City Center 20123 Milan Italy  45.462297   9.181470


In [None]:
def extract_days_since_review(df):
    """Store the number found in ``days_since_review`` as an integer column.

    The first run of digits of each ``days_since_review`` string is written
    to ``days_since_review_numeric`` as ``int64``.

    Returns:
        ``df`` itself, with the column added in place.
    """
    day_digits = df['days_since_review'].str.extract(r'(\d+)', expand=False)
    df['days_since_review_numeric'] = day_digits.astype('int64')
    return df

# Add the feature to the training and test data
train = extract_days_since_review(train)
test = extract_days_since_review(test)


In [None]:
# Additional_Number_of_Scoringの逆数を追加する関数
def add_inverse_scoring_feature(df):
    """Add ``Inverse_Additional_Scoring`` = 1 / ``Additional_Number_of_Scoring``.

    Values that are not greater than 0 map to 0 instead of being inverted.

    Returns:
        ``df`` itself, with the column added in place.
    """
    def _reciprocal_or_zero(count):
        if count > 0:
            return 1 / count
        return 0  # zero (or any non-positive value) stays 0

    df['Inverse_Additional_Scoring'] = df['Additional_Number_of_Scoring'].apply(_reciprocal_or_zero)
    return df

# Apply the function to the training and test data
train = add_inverse_scoring_feature(train)
test = add_inverse_scoring_feature(test)

# Check the result
print("Training data with new feature:")
print(train[['Additional_Number_of_Scoring', 'Inverse_Additional_Scoring']].head())
print("\nTesting data with new feature:")
print(test[['Additional_Number_of_Scoring', 'Inverse_Additional_Scoring']].head())

Training data with new feature:
   Additional_Number_of_Scoring  Inverse_Additional_Scoring
0                          1195                    0.000837
1                            78                    0.012821
2                          1427                    0.000701
3                           141                    0.007092
4                           710                    0.001408

Testing data with new feature:
   Additional_Number_of_Scoring  Inverse_Additional_Scoring
0                           634                    0.001577
1                          2288                    0.000437
2                           251                    0.003984
3                           545                    0.001835
4                            69                    0.014493


In [None]:
# Total_Number_of_Reviewsの逆数を追加する関数
def add_review_score_feature(df):
    """Add ``Inverse_Review_Score`` = 1 / (``Total_Number_of_Reviews`` + 1).

    The fewer reviews a row has, the higher the value; the +1 avoids a
    division by zero when the review count is 0.

    Returns:
        ``df`` itself, with the column added in place.
    """
    def _shifted_reciprocal(review_total):
        return 1 / (review_total + 1)

    df['Inverse_Review_Score'] = df['Total_Number_of_Reviews'].apply(_shifted_reciprocal)
    return df

# Add the feature to the training and test data
train = add_review_score_feature(train)
test = add_review_score_feature(test)

# Check the result
print("Train Data:")
print(train[['Total_Number_of_Reviews', 'Inverse_Review_Score']].head())

print("\nTest Data:")
print(test[['Total_Number_of_Reviews', 'Inverse_Review_Score']].head())

Train Data:
   Total_Number_of_Reviews  Inverse_Review_Score
0                     4684              0.000213
1                      515              0.001938
2                     4305              0.000232
3                      803              0.001244
4                     6511              0.000154

Test Data:
   Total_Number_of_Reviews  Inverse_Review_Score
0                     4065              0.000246
1                     9568              0.000105
2                     3667              0.000273
3                     2907              0.000344
4                      974              0.001026


In [None]:
from datetime import datetime

# 曜日、月、四半期を判定する関数
def extract_date_features(df):
    """Add one-hot weekday, month and quarter columns derived from ``Review_Date``.

    ``Review_Date`` is parsed with the format ``%m/%d/%Y``. The columns
    ``Monday`` .. ``Sunday``, ``Month_1`` .. ``Month_12`` and ``Quarter_1`` ..
    ``Quarter_4`` are written onto ``df`` as 0/1 integers.

    The weekday is taken from the numeric day-of-week, so the result does not
    depend on the process locale, and every date is parsed only once.

    Args:
        df: frame with a ``Review_Date`` string column.

    Returns:
        A frame with the one-hot columns and without any ``Weekday``,
        ``Month`` or ``Quarter`` column.
    """
    # Parse all dates in one vectorized pass
    review_dates = pd.to_datetime(df['Review_Date'], format='%m/%d/%Y')
    weekday_index = review_dates.dt.dayofweek  # Monday=0 ... Sunday=6
    month_number = review_dates.dt.month
    quarter_number = review_dates.dt.quarter

    # Manual one-hot encoding (weekday); list position matches dayofweek
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    for position, day in enumerate(weekdays):
        df[day] = (weekday_index == position).astype(int)

    # Manual one-hot encoding (month)
    for month in range(1, 13):
        df[f'Month_{month}'] = (month_number == month).astype(int)

    # Manual one-hot encoding (quarter)
    for quarter in range(1, 5):
        df[f'Quarter_{quarter}'] = (quarter_number == quarter).astype(int)

    # The helper values are never stored on df; the drop only removes columns
    # of these names that the input may already have carried.
    df = df.drop(columns=['Weekday', 'Month', 'Quarter'], errors='ignore')

    return df

# Add the features to the training and test data
train = extract_date_features(train)
test = extract_date_features(test)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to label-encode
categorical_columns = ['Hotel_Address','Hotel_Name','Tags']

# Label-encode each categorical column.
# One encoder per column is fitted on the values of BOTH frames and then applied
# to each, so a given category receives the same integer code in train and in
# test. Fitting separately on each frame would assign unrelated codes, and the
# model would read the test codes with the meaning learned from the train codes.
for col in categorical_columns:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])


In [None]:
# Drop the two raw count columns; Inverse_Additional_Scoring and Inverse_Review_Score were derived from them earlier.
train = train.drop(['Additional_Number_of_Scoring','Total_Number_of_Reviews'], axis=1)
test = test.drop(['Additional_Number_of_Scoring','Total_Number_of_Reviews'], axis=1)

In [None]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Negative_Review,Positive_Review,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,...,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,Quarter_1,Quarter_2,Quarter_3,Quarter_4
0,210,1/28/2017,8.3,1140,Nothing,The whole experience was fantastic From the c...,2,10.0,28842,187 day,...,0,0,0,0,0,0,1,0,0,0
1,98,3/23/2017,7.8,1021,N A,Everything comfortable and cosy rooms which w...,1,10.0,17578,133 day,...,0,0,0,0,0,0,1,0,0,0
2,1048,5/21/2017,8.8,520,Everything good that I had,Didn t have them,1,8.8,38462,74 days,...,0,0,0,0,0,0,0,1,0,0
3,990,2/4/2017,9.2,109,Slippers are small,Really great staff Very good open bar drinks ...,29,9.2,5072,180 day,...,0,0,0,0,0,0,1,0,0,0
4,590,1/21/2016,7.9,544,Bad quality everything,Perfect location near by the metro station,5,7.1,20977,560 day,...,0,0,0,0,0,0,1,0,0,0


In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor  # メタモデルに使用
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Random seed shared by NumPy, the K-fold split and every model
random_seed = 42
np.random.seed(random_seed)

# Keep only the numeric columns of the training set.
# 'number' selects every numeric dtype, so features stored as e.g. float32 are
# not silently discarded the way an explicit int64/float64/int32 list would.
train = train.select_dtypes(include='number')

# Separate the target from the training features
X = train.drop('Reviewer_Score', axis=1)
y = train['Reviewer_Score']

# Give the test set exactly the training feature columns, in the same order.
# Indexing with X.columns raises a KeyError right away if a feature is missing
# from test, instead of continuing with a misaligned feature matrix.
test = test[X.columns]

# K-fold split configuration
kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

# LightGBM hyperparameters
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 7,
    'min_data_in_leaf': 50,
    'verbose': -1,
    'random_state': random_seed
}

# XGBoost hyperparameters
xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': random_seed
}

# Test-set predictions used as stacking features: one column per base model (LGBM, XGB)
stacked_test_preds = np.zeros((len(test), 2))

# Out-of-fold predictions on the training set, used to fit the meta model
stacked_train_preds = np.zeros((len(X), 2))

# Train and evaluate on each fold
for fold, (train_index, valid_index) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")

    # Split into training and validation data
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # Train LightGBM
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    lgb_model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

    # Train XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=10
    )

    # best_iteration is a 0-based round index while the end of iteration_range
    # is exclusive, so +1 is required for the best round itself to be used.
    xgb_best_range = (0, xgb_model.best_iteration + 1)

    # Predict on the validation data (builds the stacking features)
    stacked_train_preds[valid_index, 0] = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
    stacked_train_preds[valid_index, 1] = xgb_model.predict(xgb.DMatrix(X_valid), iteration_range=xgb_best_range)

    # Accumulate the test predictions (averaged after the loop)
    lgb_test_pred = lgb_model.predict(test, num_iteration=lgb_model.best_iteration)
    xgb_test_pred = xgb_model.predict(xgb.DMatrix(test), iteration_range=xgb_best_range)
    stacked_test_preds[:, 0] += lgb_test_pred
    stacked_test_preds[:, 1] += xgb_test_pred

# Average the test stacking features over the folds
stacked_test_preds /= kf.n_splits

# Final prediction with the stacking (meta) model
meta_model = GradientBoostingRegressor(random_state=random_seed)
meta_model.fit(stacked_train_preds, y)
final_preds = meta_model.predict(stacked_test_preds)

# Final result
print(f"Final predictions shape: {final_preds.shape}")


Fold 1
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1.01698
[0]	train-rmse:1.60035	eval-rmse:1.59754
[10]	train-rmse:1.31763	eval-rmse:1.32817
[20]	train-rmse:1.16957	eval-rmse:1.19267
[30]	train-rmse:1.08718	eval-rmse:1.12227
[40]	train-rmse:1.03786	eval-rmse:1.08460
[50]	train-rmse:1.00594	eval-rmse:1.06276
[60]	train-rmse:0.98344	eval-rmse:1.04909
[70]	train-rmse:0.96653	eval-rmse:1.03985
[80]	train-rmse:0.95322	eval-rmse:1.03310
[90]	train-rmse:0.94183	eval-rmse:1.02772
[100]	train-rmse:0.93275	eval-rmse:1.02391
[110]	train-rmse:0.92363	eval-rmse:1.02057
[120]	train-rmse:0.91533	eval-rmse:1.01759
[130]	train-rmse:0.90800	eval-rmse:1.01518
[140]	train-rmse:0.90175	eval-rmse:1.01310
[150]	train-rmse:0.89440	eval-rmse:1.01111
[160]	train-rmse:0.88796	eval-rmse:1.00960
[170]	train-rmse:0.88229	eval-rmse:1.00819
[180]	train-rmse:0.87570	eval-rmse:1.00678
[190]	train-rmse:0.87086	eval-rmse:1.00576
[200

In [None]:
# Show the final stacked predictions for the test set
print(final_preds)

[6.33294247 8.32794681 9.48459962 ... 8.43927426 8.74399036 9.33915073]


In [None]:
# Create the submission file.
# The sample file is read without a header row, so its columns are labelled by position.
submit = pd.read_csv('/content/drive/MyDrive/data/sample_submission.csv',header=None)
# Column 1 receives the final predictions
submit[1] = final_preds
# Written without a header row and without the index
submit.to_csv('/content/drive/MyDrive/data/submission_tutorial.csv', header=None,index=False)


In [None]:
# Preview the first rows of the submission frame
submit.head()


Unnamed: 0,0,1
0,283366,6.332942
1,283367,8.327947
2,283368,9.4846
3,283369,9.112178
4,283370,8.439274
