**Hi! Here is a notebook for your first Titanic submission (a beginner's guide)**

In [None]:
# Libraries import 
import pandas as pd
from sklearn.model_selection import train_test_split  
# This imports the train_test_split function from the sklearn.model_selection module (part of the Scikit-learn library).
# Why it's needed: The function is used to split your dataset into training and testing (or validation) sets. This is crucial because you need separate data to train your model and then evaluate its performance on unseen data to avoid overfitting.
from sklearn.ensemble import RandomForestClassifier
# This imports the RandomForestClassifier from the sklearn.ensemble module. A random forest is a machine learning algorithm that works by building multiple decision trees and combining their results to make a final prediction.
# Why it's needed: Random forests are used because they are robust, handle both classification and regression tasks, and are effective at reducing overfitting compared to individual decision trees.

from sklearn.metrics import accuracy_score
# This imports the accuracy_score function from the sklearn.metrics module. It measures the accuracy of a machine learning model by comparing the predicted values with the actual values.
# Why it's needed: accuracy_score is used to evaluate the performance of your model. It tells you the percentage of correct predictions made by the model, which is especially useful for classification tasks like predicting survival on the Titanic dataset.


In [None]:
# Load the training and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)



In [None]:
# Data overview: preview the first 10 rows of the training set.
# Leaving the frame as the cell's last expression lets the notebook render it
# as a rich table instead of the plain-text dump that print() produces.
train_data.head(10)



In [None]:
# Remove the columns that are not needed (e.g. Name, Ticket, Cabin) from both frames
train_data, test_data = (
    frame.drop(columns=['Name', 'Ticket', 'Cabin'])
    for frame in (train_data, test_data)
)



In [None]:
# Fill missing values.
# The fill values are computed from the TRAINING set only and then applied to
# both frames, so train and test rows are imputed identically and no statistic
# is estimated from the test set.
age_median = train_data['Age'].median()
fare_median = train_data['Fare'].median()
embarked_mode = train_data['Embarked'].mode()[0]  # most frequent port

for frame in (train_data, test_data):
    frame['Age'] = frame['Age'].fillna(age_median)
    frame['Fare'] = frame['Fare'].fillna(fare_median)
    frame['Embarked'] = frame['Embarked'].fillna(embarked_mode)

In [None]:
# Convert the categorical columns into numeric indicator (dummy) columns.
# The category list is taken from the training set and shared with the test
# set, so both frames always get the same dummy columns in the same order
# (and the same baseline dropped by drop_first), even if a category happens
# to be absent from one of them. A test value unseen in training becomes NaN
# and is encoded as all zeros.
for column in ['Sex', 'Embarked']:
    categories = sorted(train_data[column].dropna().unique())
    train_data[column] = pd.Categorical(train_data[column], categories=categories)
    test_data[column] = pd.Categorical(test_data[column], categories=categories)

train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'], drop_first=True)



In [None]:
# Target variable: the 'Survived' column
y = train_data['Survived']
# Features: every other column of the training frame
# NOTE(review): only 'Survived' is removed here, so PassengerId stays in X — confirm that is intended
X = train_data.drop(columns=['Survived'])

# Split into training and validation sets: 20% held out, fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Train on every feature except PassengerId: it is the row identifier that is
# written to the submission file, not a property of the passenger, so the
# trees should not be allowed to split on it.
feature_columns = [column for column in X_train.columns if column != 'PassengerId']

# Initialise and train the model
model = RandomForestClassifier(n_estimators=80, max_depth=80, min_samples_split=10, min_samples_leaf=5, random_state=42)
model.fit(X_train[feature_columns], y_train)

# Predict on the validation set
y_pred = model.predict(X_val[feature_columns])

# Evaluate the model's accuracy on the held-out validation rows
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

# Predict on the test set; selecting the columns by name guarantees the same
# features, in the same order, as the ones the model was fitted on
test_predictions = model.predict(test_data[feature_columns])



In [None]:
# Build the file to upload to Kaggle: passenger ids next to the predicted labels
output = test_data[['PassengerId']].assign(Survived=test_predictions)
output.to_csv('submission.csv', index=False)
# The message below reads "File submission.csv has been created."
print("Файл submission.csv создан.")

In [None]:
# Also write the submission table (without the index column) under /kaggle/working
output.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)