In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

In [None]:
target_cols = ['Positive', 'Negative']
y = train_df[target_cols].copy()
train_df = train_df.drop(target_cols, axis=1)

train_df['_is_train'] = True
test_df['_is_train'] = False

full_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [None]:
drop_cols = ['id', 'Header image', 'Screenshots']
full_df.drop(columns=drop_cols, inplace=True)

In [None]:
full_df['review_length_words'] = full_df['Reviews'].fillna('').apply(lambda x: len(x.split()))

In [None]:
full_df['has_notes'] = full_df['Notes'].notna().astype(int)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

genres_split = full_df['Genres'].fillna('').apply(lambda x: [g.strip() for g in x.split(',')])

mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(mlb.fit_transform(genres_split), columns=mlb.classes_, index=full_df.index)

full_df = pd.concat([full_df, genres_encoded], axis=1)

In [None]:
full_df['description_length_words'] = full_df['About the game'].fillna('').apply(lambda x: len(x.split()))

In [None]:
languages_split = full_df['Supported languages'].fillna('').apply(lambda x: [g.strip() for g in x.split(',')])

mlb = MultiLabelBinarizer()
languages_encoded = pd.DataFrame(mlb.fit_transform(languages_split), columns=mlb.classes_, index=full_df.index)

full_df = pd.concat([full_df, languages_encoded], axis=1)

In [None]:
audio_split = full_df['Full audio languages'].fillna('').apply(lambda x: [g.strip() for g in x.split(',')])

mlb = MultiLabelBinarizer()
audio_encoded = pd.DataFrame(mlb.fit_transform(audio_split), columns=mlb.classes_, index=full_df.index)

full_df = pd.concat([full_df, audio_encoded], axis=1)

In [None]:
for col in ['Mac', 'Linux', 'Windows']:
    if col in full_df.columns:
        full_df[col] = full_df[col].astype(int)

In [None]:
developer_counts = full_df['Developers'].value_counts()

full_df['developer_popularity'] = full_df['Developers'].map(developer_counts-1)

full_df[['Developers', 'developer_popularity']]

Unnamed: 0,Developers,developer_popularity
0,HeR Interactive,24.0
1,gleogames,0.0
2,"peakvox,O-TWO inc.",3.0
3,Neon Orbis,0.0
4,James Olks,0.0
...,...,...
70185,Kobold Workforce Studios,0.0
70186,Volcanicc,0.0
70187,Flannel Bear Games,1.0
70188,Dreamlight Games Studios SL,1.0


In [None]:
categories_split = full_df['Categories'].fillna('').apply(lambda x: [g.strip() for g in x.split(',')])

mlb = MultiLabelBinarizer()
categories_encoded = pd.DataFrame(mlb.fit_transform(categories_split), columns=mlb.classes_, index=full_df.index)

full_df = pd.concat([full_df, categories_encoded], axis=1)

In [None]:
from datetime import datetime
full_df['Release date'] = pd.to_datetime(full_df['Release date'], errors='coerce')

now = pd.Timestamp(datetime.now())
full_df['release_age_months'] = ((now - full_df['Release date']) / pd.Timedelta(days=30)).astype('float')

In [None]:
drop_cols_final = ['Release date',
                   'About the game',
                   'Supported languages',
                   'Full audio languages',
                   'Reviews',
                   'Notes',
                   'Developers',
                   'Publishers',
                   'Categories',
                   'Genres',
                   'Tags'
                  ]
full_df.drop(columns=drop_cols_final, inplace=True)

In [None]:
full_df = full_df.fillna(0)

In [None]:
full_df["log_price"] = np.log1p(full_df["Price"])
full_df["log_developer_popularity"] = np.log1p(full_df["developer_popularity"])
full_df["log_review_length_words"] = np.log1p(full_df["review_length_words"])
full_df["log_release_age_months"] = np.log1p(full_df["release_age_months"])

full_df["is_free"] = (full_df["Price"] == 0).astype(int)

full_df.drop(columns=[
    "Price",
    "developer_popularity",
    "review_length_words",
    "release_age_months"
], inplace=True)

In [None]:
train_processed = full_df[full_df['_is_train'] == True].copy()
test_processed = full_df[full_df['_is_train'] == False].copy()
train_processed = train_processed.drop('_is_train', axis=1)
test_processed = test_processed.drop('_is_train', axis=1)
assert set(train_processed.columns) == set(test_processed.columns), "Разные признаки!"

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
X = train_processed
target_cols = ['Positive', 'Negative']
y_log = np.log1p(y)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = {col: np.zeros(len(X)) for col in target_cols}

mae_scores = {}
params = {
    'max_leaf_nodes': 31,
    'max_iter': 300,
    'learning_rate': 0.1,
    'l2_regularization': 0.5,
    'random_state': 42
}


for col in target_cols:
    for train_idx, val_idx in kf.split(X):
        model = HistGradientBoostingRegressor(**params)
        model.fit(X.iloc[train_idx], y_log[col].iloc[train_idx])
        preds_log = model.predict(X.iloc[val_idx])
        preds = np.expm1(preds_log)
        oof_preds[col][val_idx] = preds
    mae_scores[col] = mean_absolute_error(y[col], oof_preds[col])

total_mae = mean_absolute_error(
    np.concatenate([y['Positive'], y['Negative']]),
    np.concatenate([oof_preds['Positive'], oof_preds['Negative']])
)

print("MAE по каждому таргету:")
for col in target_cols:
    print(f"{col}: {mae_scores[col]:.4f}")

print(f"\n Сводная MAE: {total_mae:.4f}")

MAE по каждому таргету:
Positive: 483.9202
Negative: 88.5127

 Сводная MAE: 286.2165


In [28]:


import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
params = {
    'max_leaf_nodes': 31,
    'max_iter': 300,
    'learning_rate': 0.1,
    'l2_regularization': 0.5,
    'random_state': 42
}
negative_model = HistGradientBoostingRegressor(**params)
negative_model.fit(train_processed, y_log['Negative'])
negative_preds_log = negative_model.predict(test_processed)
negative_preds = np.expm1(negative_preds_log).round().astype(int)

positive_model = HistGradientBoostingRegressor(**params)
positive_model.fit(train_processed, y_log['Positive'])
positive_preds_log = positive_model.predict(test_processed)
positive_preds = np.expm1(positive_preds_log).round().astype(int)


KeyError: 'id'

In [33]:
submission = pd.DataFrame({
    'id': test_df['id'].astype(int),
    'Positive': positive_preds,
    'Negative': negative_preds
})

submission.to_csv("submission2.csv", index=False)
print("готов")

готов
