In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

In [53]:
url = "https://raw.githubusercontent.com/AnnaNebuko/ABD-PRJ-25-2team-HAPPINESS/main/data_cleaned.csv"
df = pd.read_csv(url)
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,idno,agea,rlgdgr,polintr,sclmeet,happy,cntry,nwspol,pplfair,health,...,actrolga,cptppola,trstprl,trstlgl,trstplc,trstplt,vote,stfeco,stfdem,stfedu
0,50030,21.0,0.0,2.0,7.0,9.0,AT,90.0,0.0,2.0,...,2.0,2.0,6.0,6.0,4.0,1.0,1.0,2.0,7.0,10.0
1,50057,53.0,8.0,2.0,4.0,9.0,AT,30.0,9.0,1.0,...,4.0,3.0,7.0,5.0,8.0,4.0,1.0,6.0,6.0,5.0
2,50106,78.0,6.0,3.0,6.0,7.0,AT,15.0,6.0,3.0,...,2.0,3.0,5.0,6.0,9.0,3.0,2.0,4.0,6.0,5.0
3,50145,64.0,1.0,2.0,5.0,9.0,AT,60.0,3.0,2.0,...,1.0,3.0,6.0,8.0,8.0,5.0,1.0,6.0,8.0,9.0
4,50158,59.0,3.0,2.0,6.0,8.0,AT,120.0,8.0,1.0,...,3.0,3.0,3.0,5.0,7.0,5.0,1.0,4.0,3.0,3.0


In [54]:
df['happy'] = df['happy'].astype(int)

X = df.drop('happy', axis=1)
y = df['happy']

In [55]:
# закодируем категориальные признаки
X = pd.get_dummies(X, columns=['cntry', 'gndr'], drop_first=True)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
from scipy import stats

In [19]:
X_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

In [81]:
from catboost import CatBoostRegressor

model_cat = CatBoostRegressor(random_state=42, verbose=0)

model_cat.fit(X_train_scaled, y_train)

y_pred_train_cat = model_cat.predict(X_train_scaled)
y_pred_test_cat = model_cat.predict(X_test_scaled)

R2_train_cat = r2_score(y_train, y_pred_train_cat)
R2_test_cat = r2_score(y_test, y_pred_test_cat)

MSE_train_cat = mean_squared_error(y_train, y_pred_train_cat)
MSE_test_cat = mean_squared_error(y_test, y_pred_test_cat)

print(f"R2 на трейне:{R2_train_cat:.3f}")
print(f"R2 на тесте:{R2_test_cat:.3f}")

print(f"accuracy на трейне:{accuracy_train_cat:.3f}")
print(f"accuracy на тесте:{accuracy_test_cat:.3f}")

print(f"MSE на трейне:{MSE_train_cat:.3f}")
print(f"MSE на тесте:{MSE_test_cat:.3f}")

print(f"Важность признаков:{model_cat.feature_importances_}")

R2 на трейне:0.742
R2 на тесте:0.351
accuracy на трейне:0.809
accuracy на тесте:0.343
MSE на трейне:0.657
MSE на тесте:1.654
Важность признаков:[3.589721   2.35235112 1.10754506 2.34644626 2.55387527 3.66350995
 4.78271433 3.78844689 1.22191562 0.25364891 0.94347087 2.41998416
 1.64714232 2.24395733 1.15150864 2.08930654 1.36683868 7.74089261
 5.42924742 1.43844083 1.49680816 1.63002171 1.25161747 3.01023566
 2.95226369 1.62982321 1.74113932 0.47698502 0.38087081 3.20079028
 0.23028245 0.94359458 1.31040803 2.00308925 2.2382095  3.65660412
 1.70020665 0.84066863 5.32408666 2.88719464 2.68875206 0.14825414
 0.10359137 0.78582287 0.34278096 0.5398956  0.30380151 0.70608094
 0.47095927 1.81073193 1.06346571]


In [83]:
cat_coef_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_cat.feature_importances_
}).sort_values('Coefficient', ascending=False)

print(cat_coef_df.head(15))

    Feature  Coefficient
17   fltdpr     7.740893
18   fltlnl     5.429247
38   stfeco     5.324087
6    health     4.782714
7   atchctr     3.788447
5   pplfair     3.663510
35  trstplc     3.656604
0      agea     3.589721
29  hincfel     3.200790
23   height     3.010236
24  weighta     2.952264
39   stfdem     2.887195
40   stfedu     2.688752
4    nwspol     2.553875
11    hhmmb     2.419984


In [57]:
X_train = X_train.drop(columns=['enjlf','idno'])
X_test = X_test.drop(columns=['enjlf','idno'])

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [65]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

In [None]:
from catboost import CatBoostClassifier

model_cat = CatBoostClassifier(
    loss_function='MultiClass',  # или 'Logloss' для бинарной классификации
    eval_metric='Accuracy',
    class_weights=class_weights,
    classes_count=11,
    random_seed=42
)
model_cat.fit(X_train_scaled, y_train)

y_pred_train_cat = model_cat.predict(X_train_scaled)
y_pred_test_cat = model_cat.predict(X_test_scaled)

R2_train_cat = r2_score(y_train, y_pred_train_cat)
R2_test_cat = r2_score(y_test, y_pred_test_cat)

accuracy_train_cat = accuracy_score(y_train, y_pred_train_cat)
accuracy_test_cat = accuracy_score(y_test, y_pred_test_cat)

MSE_train_cat = mean_squared_error(y_train, y_pred_train_cat)
MSE_test_cat = mean_squared_error(y_test, y_pred_test_cat)

print(f"R2 на трейне:{R2_train_cat:.3f}")
print(f"R2 на тесте:{R2_test_cat:.3f}")

print(f"accuracy на трейне:{accuracy_train_cat:.3f}")
print(f"accuracy на тесте:{accuracy_test_cat:.3f}")

print(f"MSE на трейне:{MSE_train_cat:.3f}")
print(f"MSE на тесте:{MSE_test_cat:.3f}")

Learning rate set to 0.088891
0:	learn: 0.2669565	total: 13.6ms	remaining: 13.6s
1:	learn: 0.3366534	total: 26.5ms	remaining: 13.2s
2:	learn: 0.3786868	total: 39.5ms	remaining: 13.1s
3:	learn: 0.4208060	total: 52.6ms	remaining: 13.1s
4:	learn: 0.4301827	total: 66ms	remaining: 13.1s
5:	learn: 0.4455612	total: 79.3ms	remaining: 13.1s
6:	learn: 0.4706285	total: 92.3ms	remaining: 13.1s
7:	learn: 0.4858417	total: 106ms	remaining: 13.1s
8:	learn: 0.5031021	total: 119ms	remaining: 13.1s
9:	learn: 0.5046619	total: 131ms	remaining: 13s
10:	learn: 0.5153114	total: 144ms	remaining: 12.9s
11:	learn: 0.5245856	total: 158ms	remaining: 13s
12:	learn: 0.5305330	total: 171ms	remaining: 13s
13:	learn: 0.5344980	total: 184ms	remaining: 12.9s
14:	learn: 0.5454689	total: 197ms	remaining: 12.9s
15:	learn: 0.5557635	total: 211ms	remaining: 13s
16:	learn: 0.5658472	total: 224ms	remaining: 13s
17:	learn: 0.5605236	total: 238ms	remaining: 13s
18:	learn: 0.5675962	total: 251ms	remaining: 13s
19:	learn: 0.5731790

In [80]:
cat_coef_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_cat.feature_importances_
}).sort_values('Coefficient', ascending=False)

print(cat_coef_df.head(15))


    Feature  Coefficient
17   fltdpr     6.412698
38   stfeco     4.182357
18   fltlnl     3.943339
0      agea     3.866850
6    health     3.636436
35  trstplc     3.603374
7   atchctr     3.553073
5   pplfair     3.218443
23   height     3.039964
24  weighta     2.853745
20   dosprt     2.744876
40   stfedu     2.692600
29  hincfel     2.567694
1    rlgdgr     2.564918
11    hhmmb     2.452820
