In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Скачивание и анализ данных

In [2]:
df = pd.read_csv("./train_and_test2.csv")
df = df.dropna().sample(frac=1).reset_index(drop=True)
good_columns = [column for column in df.columns if "zero" not in column]
df = df[good_columns]
df = df.drop(columns=["Passengerid"])
df

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked,2urvived
0,20.0,4.0125,0,0,0,3,0.0,0
1,39.0,7.9250,0,0,0,3,2.0,1
2,28.0,52.0000,1,1,0,1,2.0,1
3,17.0,57.0000,1,1,0,1,2.0,1
4,22.0,7.2292,0,0,0,3,0.0,0
...,...,...,...,...,...,...,...,...
1302,48.0,39.6000,1,1,0,1,0.0,1
1303,24.0,16.1000,0,1,0,3,2.0,0
1304,28.0,52.0000,0,0,0,1,2.0,0
1305,28.0,13.0000,0,0,0,2,2.0,1


In [3]:
df.describe()

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked,2urvived
count,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0
mean,29.471821,33.209595,0.355011,0.499617,0.385616,2.296863,1.492731,0.260138
std,12.881592,51.748768,0.4787,1.042273,0.866092,0.836942,0.814626,0.438877
min,0.17,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,22.0,7.8958,0.0,0.0,0.0,2.0,1.0,0.0
50%,28.0,14.4542,0.0,0.0,0.0,3.0,2.0,0.0
75%,35.0,31.275,1.0,1.0,0.0,3.0,2.0,1.0
max,80.0,512.3292,1.0,8.0,9.0,3.0,2.0,1.0


In [4]:
num_cols = [
    "Age",
    "Fare",
]
cat_cols = ["Embarked", "Pclass", "sibsp", "Parch", "Sex"]

df_num = df[num_cols]
df_cat = df[cat_cols]

# Обработка численных колонок

In [5]:
scaler = MinMaxScaler()
scaler.fit(df_num)
df_num.iloc[:, :] = scaler.transform(df_num)
df_num

Unnamed: 0,Age,Fare
0,0.248403,0.007832
1,0.486409,0.015469
2,0.348616,0.101497
3,0.210823,0.111257
4,0.273456,0.014110
...,...,...
1302,0.599148,0.077294
1303,0.298509,0.031425
1304,0.348616,0.101497
1305,0.348616,0.025374


# Обработка категориальных колонок

In [6]:
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(df_cat)
df_cat = pd.DataFrame(
    data=enc.transform(df_cat).toarray(), columns=enc.get_feature_names_out(cat_cols)
).astype(int)
df_cat = df_cat.astype("int64")
df_cat

Unnamed: 0,Embarked_0.0,Embarked_1.0,Embarked_2.0,Pclass_1,Pclass_2,Pclass_3,sibsp_0,sibsp_1,sibsp_2,sibsp_3,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Sex_0,Sex_1
0,1,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,1,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,1,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302,1,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
1303,0,0,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1304,0,0,1,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1305,0,0,1,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


# Собираем данные вместе

In [7]:
df_final = pd.concat([df_num, df_cat, df.iloc[:, -1]], axis=1)

X = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Строим модель

In [8]:
model = LogisticRegression()
results = cross_validate(model, X, y, cv=5, scoring="accuracy")
results["test_score"].mean()

0.7804158989207686

In [9]:
tree_params = {
    "criterion": ["gini", "log_loss", "entropy"],
    "max_depth": [15, 20, 25],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [2, 3, 4],
}

In [72]:
grs = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    cv=5,
    param_grid=tree_params,
    n_jobs=-1,
    scoring="accuracy",
)
grs.fit(X, y)

In [73]:
grs.best_params_

{'criterion': 'gini',
 'max_depth': 20,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [74]:
grs.best_score_

0.7589745839548419

In [14]:
boost_params = {
    "loss": ["log_loss", "exponential"],
    "learning_rate": [0.1, 0.2],
    "n_estimators": [70, 80, 90],
    "max_depth": [2, 3],
    "min_samples_leaf": [3, 4, 5],
    "min_samples_split": [2, 3],
}

In [15]:
grs = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    cv=5,
    param_grid=boost_params,
    n_jobs=-1,
    scoring="accuracy",
)
grs.fit(X, y)

In [17]:
grs.best_params_

{'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 2,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 80}

In [18]:
grs.best_score_

0.787294902167237

In [21]:
grs.cv_results_["mean_test_score"]

array([0.78347226, 0.78117341, 0.78270013, 0.78347226, 0.78117341,
       0.78270013, 0.78500483, 0.7872949 , 0.78577111, 0.78500483,
       0.7872949 , 0.78577111, 0.78271475, 0.78577696, 0.78347226,
       0.78271475, 0.78577696, 0.78347226, 0.77735076, 0.77428271,
       0.76892457, 0.77735076, 0.77428271, 0.76892457, 0.7804159 ,
       0.77199555, 0.76893627, 0.7804159 , 0.77199555, 0.76893627,
       0.77046299, 0.76970255, 0.76816999, 0.77046299, 0.76970255,
       0.76816999, 0.78653447, 0.78423854, 0.7781229 , 0.78653447,
       0.78423854, 0.7781229 , 0.78653154, 0.77887748, 0.77887748,
       0.78653154, 0.77887748, 0.77887748, 0.78654032, 0.78193677,
       0.78041005, 0.78654032, 0.78193677, 0.78041005, 0.77429148,
       0.77276476, 0.77582405, 0.77429148, 0.77276476, 0.77582405,
       0.77811412, 0.77505484, 0.77888918, 0.77811412, 0.77505484,
       0.77888918, 0.7735252 , 0.77275599, 0.77275599, 0.7735252 ,
       0.77275599, 0.77275599, 0.7720131 , 0.7697084 , 0.76970

In [None]:
# enc = OneHotEncoder(handle_unknown='ignore')
# enc.fit(df_cat)
# df_cat = pd.DataFrame( data = enc.transform(df_cat).toarray(), columns=enc.get_feature_names(cat_cols)).astype(int)
# df_cat = df_cat.astype('int64')

In [3]:
pip install mexc-sdk

Collecting mexc-sdk
  Downloading mexc_sdk-0.0.1-py3-none-any.whl (1.2 kB)
Installing collected packages: mexc-sdk
Successfully installed mexc-sdk-0.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import mexc_sdk

ModuleNotFoundError: No module named 'mexc_sdk'