In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
data_train = pd.read_csv("/kaggle/input/titanic/train.csv")
data_test = pd.read_csv("/kaggle/input/titanic/test.csv")
data_gender_sub = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [3]:
data_gender_sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [4]:
data_train['Cabin'].fillna(0, inplace=True)
data_train.fillna(0, inplace=True)
data_train = data_train.drop(columns = ["Name","Ticket","Cabin","PassengerId"])
data_train["Sex"] = data_train["Sex"].replace('male',0)
data_train["Sex"] = data_train["Sex"].replace('female',1)
data_train["Embarked"] = data_train["Embarked"].replace('S',1)
data_train["Embarked"] = data_train["Embarked"].replace('C',2)
data_train["Embarked"] = data_train["Embarked"].replace('Q',3)
data_train["Age"] = data_train["Age"].replace(0,data_train["Age"].median())

In [5]:
data_test.fillna({"Cabin": 0, "Age": data_test["Age"].median()}, inplace=True)
data_test.drop(columns=["Name", "Ticket", "Cabin", "PassengerId"], inplace=True)
data_test["Sex"] = data_test["Sex"].map({"male": 0, "female": 1})
data_test["Embarked"] = data_test["Embarked"].map({"S": 1, "C": 2, "Q": 3})
data_test["Age"] = (data_test["Age"] - data_test["Age"].mean()) / data_test["Age"].std()
data_test["Fare"] = (data_test["Fare"] - data_test["Fare"].mean()) / data_test["Fare"].std()



In [6]:
data_train["Age"] = (data_train["Age"] - data_train["Age"].mean())/data_train["Age"].std()
data_train["Fare"] = (data_train["Fare"] - data_train["Fare"].mean())/data_train["Fare"].std()
target = data_train["Survived"]
data_train = data_train.drop(columns = ["Survived"])
target = target.values.reshape(-1, 1)

In [7]:
target.shape
data_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,-0.497514,1,0,-0.502163,1
1,1,1,0.714647,1,0,0.786404,2
2,3,1,-0.194474,0,0,-0.488580,1
3,1,1,0.487367,1,0,0.420494,1
4,3,0,0.487367,0,0,-0.486064,1
...,...,...,...,...,...,...,...
886,2,0,-0.118714,0,0,-0.386454,1
887,1,1,-0.724794,0,0,-0.044356,1
888,3,1,-0.345994,1,2,-0.176164,1
889,1,0,-0.194474,0,0,-0.044356,2


In [8]:
import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iter=1000, tolerance=1e-4):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.weights = None
        self.loss_history = []

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def compute_loss(self, X, y, predict):
        return -1 / X.shape[0] * np.sum(y * np.log(predict) + (1 - y) * np.log(1 - predict))

    def fit(self, X, y, num_epoch):
        bias = np.ones((X.shape[0], 1))
        X = np.concatenate((bias, X), axis=1)
        num_features = X.shape[1]
        self.weights = np.zeros((X.shape[1], 1), dtype=np.float64)

        for epoch in range(num_epoch):
            z = X @ self.weights
            P = self.sigmoid(z)
            error = P - y
            N = X.shape[0]
            gradient = (1 / N) * X.T @ error
            self.weights = self.weights - self.learning_rate * gradient
            loss = self.compute_loss(X, y, P)
            self.loss_history.append(loss)
            

    def predict(self, X):
        bias = np.ones((X.shape[0], 1))
        X = np.concatenate((bias, X), axis=1) @ self.weights
        P = self.sigmoid(X)
        pred = (P > 0.45).astype(int)
        return pred
    def confusion_matrix_elements(self, y_true, y_pred):
        TP = np.sum((y_true == 1) & (y_pred == 1))
        FP = np.sum((y_true == 0) & (y_pred == 1))
        FN = np.sum((y_true == 1) & (y_pred == 0))
        TN = np.sum((y_true == 0) & (y_pred == 0))
        return TP, FP, FN, TN
    def accuracy_score(self,TP, FP, FN, TN):
        return (TP + TN) / (TP + FP + FN + TN)
    def recall_score(self,TP,FN):
        return TP / (TP + FN)
    def precision_score(self,TP,FP):
        return TP / (TP + FP)
    def plot_loss(self):
        """Строит график изменения функции потерь по итерациям."""


In [9]:
data_gender_sub = data_gender_sub.drop(columns = ["PassengerId"])

In [10]:
lr = 0.001
epochs = 200
model = LogisticRegression(learning_rate=lr, max_iter=epochs)
model.fit(data_train, target, num_epoch=epochs)
        
y_pred = model.predict(data_test)        
TP, FP, FN, TN = model.confusion_matrix_elements(data_gender_sub["Survived"], y_pred.flatten())
accuracy = model.accuracy_score(TP, FP, FN, TN)
recall = model.recall_score(TP, FN)
precision = model.precision_score(TP, FP)

In [11]:
print(TP, FP, FN, TN,accuracy,recall,precision)

80 114 72 152 0.5550239234449761 0.5263157894736842 0.41237113402061853


In [12]:

target.shape
data_train.shape


(891, 7)

In [13]:
imputer = SimpleImputer(strategy='mean')
data_train_imputed = imputer.fit_transform(data_train)
data_test_imputed = imputer.transform(data_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(data_train_imputed,target.ravel())
y_pred = model.predict(data_test_imputed)


analys = confusion_matrix(data_gender_sub["Survived"], y_pred.flatten())

In [14]:
print(analys)

[[237  29]
 [ 43 109]]
