In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [None]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
df_test['Survived'] = [0] * len(df_test)

In [None]:
df = pd.concat([df_train, df_test], ignore_index=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,0,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   int64  
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


In [None]:
df = df.drop('Name', axis=1)

In [None]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.261268,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.439494,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


# PREPROCESSING:

In [None]:
df['Sex'] = df['Sex'].map({'male' : 1, "female" : 0})

In [None]:
df = pd.get_dummies(df, columns=['Sex'])

In [None]:
df.rename(columns={'Sex_0':'Women', 'Sex_1':'Men'}, inplace=True)

In [None]:
df['Women'] = df['Women'].astype(int)
df['Men'] = df['Men'].astype(int)

In [None]:
df = df[df['Embarked'].notna()]

In [None]:
df = pd.get_dummies(df, columns=['Embarked'])

In [None]:
df['Embarked_C'] = df['Embarked_C'].astype(int)
df['Embarked_Q'] = df['Embarked_Q'].astype(int)
df['Embarked_S'] = df['Embarked_S'].astype(int)

In [None]:
df = df.drop('Ticket', axis = 1)

In [None]:
df['Age'] = df['Age'].fillna(df['Age'].mean()).astype(int)

In [None]:
df.corr(numeric_only=True)["Survived"].sort_values(ascending = False)

Unnamed: 0,Survived
Survived,1.0
Women,0.401985
Fare,0.171809
Embarked_C,0.098057
Parch,0.056183
Embarked_Q,-0.011928
SibSp,-0.013174
Age,-0.062383
Embarked_S,-0.078978
Pclass,-0.241672


In [None]:
df['Deck'] = df['Cabin'].str[0]

In [None]:
df['Cabin_Number'] = df['Cabin'].str[1:]

In [None]:
df['Deck'].fillna('U', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Deck'].fillna('U', inplace=True)


In [None]:
imputer = KNNImputer(n_neighbors=5)

In [None]:
df['Cabin_Number'] = df['Cabin_Number'].str.extract('(\d+)')

In [None]:
df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')

In [None]:
df['Cabin_Number'] = imputer.fit_transform(df[['Cabin_Number']])

In [None]:
df['Cabin_Number'] = df['Cabin_Number'].astype(int)

In [None]:
df.drop('Cabin', axis=1, inplace=True)

In [None]:
df = pd.get_dummies(df, columns = ['Deck'])

In [None]:
df['Deck_A'] = df['Deck_B'].astype(int)
df['Deck_B'] = df['Deck_B'].astype(int)
df['Deck_C'] = df['Deck_C'].astype(int)
df['Deck_D'] = df['Deck_D'].astype(int)
df['Deck_E'] = df['Deck_E'].astype(int)
df['Deck_F'] = df['Deck_F'].astype(int)
df['Deck_G'] = df['Deck_G'].astype(int)
df['Deck_T'] = df['Deck_T'].astype(int)
df['Deck_U'] = df['Deck_U'].astype(int)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1307 entries, 0 to 1308
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   1307 non-null   int64  
 1   Survived      1307 non-null   int64  
 2   Pclass        1307 non-null   int64  
 3   Age           1307 non-null   int64  
 4   SibSp         1307 non-null   int64  
 5   Parch         1307 non-null   int64  
 6   Fare          1306 non-null   float64
 7   Women         1307 non-null   int64  
 8   Men           1307 non-null   int64  
 9   Embarked_C    1307 non-null   int64  
 10  Embarked_Q    1307 non-null   int64  
 11  Embarked_S    1307 non-null   int64  
 12  Cabin_Number  1307 non-null   int64  
 13  Deck_A        1307 non-null   int64  
 14  Deck_B        1307 non-null   int64  
 15  Deck_C        1307 non-null   int64  
 16  Deck_D        1307 non-null   int64  
 17  Deck_E        1307 non-null   int64  
 18  Deck_F        1307 non-null   int

In [None]:
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

In [None]:
train_df = df[:len(df_train) - 2]
test_df = df[len(df_train) - 2:]
test_df = test_df.drop('Survived', axis=1)

In [None]:
train_df = train_df.drop('Deck_U', axis = 1)
test_df = test_df.drop('Deck_U', axis = 1)

# PREDICTIONS!

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
y_train = np.reshape(y_train, (-1, 1))

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
model_LogR = LogisticRegression()
model_LogR.fit(X_train, y_train)

In [None]:
y_pred_LogR = model_LogR.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_LogR)
print(f'Logistic Regression Accuracy: {accuracy * 100:.2f}%')

Logistic Regression Accuracy: 80.34%


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {rf_accuracy * 100:.2f}%')

  return fit_method(estimator, *args, **kwargs)


Random Forest Accuracy: 78.09%


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy * 100:.2f}%')

  y = column_or_1d(y, warn=True)


Gradient Boosting Accuracy: 81.46%


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model = KNeighborsClassifier()

# Train the model
knn_model.fit(X_train, y_train)

# Make predictions
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(f'K-Nearest Neighbors Accuracy: {knn_accuracy * 100:.2f}%')

K-Nearest Neighbors Accuracy: 60.11%


  return self._fit(X, y)


In [None]:
import xgboost as xgb

In [None]:
xgb_model = xgb.XGBClassifier(random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f'XGBoost Accuracy: {xgb_accuracy * 100:.2f}%')

XGBoost Accuracy: 78.65%


In [None]:
import lightgbm as lgb

In [None]:
lgb_model = lgb.LGBMClassifier(random_state=42)

# Train the model
lgb_model.fit(X_train, y_train)

# Make predictions
y_pred_lgb = lgb_model.predict(X_test)

In [None]:
# Evaluate the model
lgb_accuracy = accuracy_score(y_test, y_pred_lgb)
print(f'LightGBM Accuracy: {lgb_accuracy * 100:.2f}%')

LightGBM Accuracy: 79.21%


In [None]:
pred = gb_model.predict(test_df)
output = pd.DataFrame({'PassengerId': test_df['PassengerId'],
                       'Survived': pred})
output.to_csv('submission.csv', index=False)

Unnamed: 0,PassengerId,Survived
893,894,0
894,895,0
895,896,1
896,897,0
897,898,1
...,...,...
1304,1305,0
1305,1306,1
1306,1307,0
1307,1308,0


### **USING NEURAL NETWORKS**


In [None]:
import tensorflow as tf

In [None]:
numerical_columns = ['Age', 'Fare', 'SibSp', 'Parch']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [None]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42)

In [None]:
# Check the sizes of the splits
print(f"Training Set: {X_train.shape[0]} samples")
print(f"Validation Set: {X_val.shape[0]} samples")
print(f"Test Set: {X_test.shape[0]} samples")

Training Set: 1058 samples
Validation Set: 118 samples
Test Set: 131 samples


In [None]:
input_size = 20
output_size = 1
hidden_size = 128

model = tf.keras.Sequential([
    tf.keras.Input(shape = (input_size,)),
    tf.keras.layers.Dense(hidden_size, activation= 'relu'),
    tf.keras.layers.Dense(hidden_size, activation= 'relu'),
    tf.keras.layers.Dense(hidden_size, activation= 'relu'),
    tf.keras.layers.Dense(output_size, activation= 'sigmoid'),
])

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 0.001 * 0.95 ** epoch)
model.compile(optimizer=optimizer, loss = 'binary_crossentropy', metrics= ['accuracy'])

In [None]:
epochs = 20
model.fit(X_train,
          y_train,
          epochs=epochs,
          validation_data=(X_val, y_val),
          callbacks=[early_stopping, lr_scheduler],
          verbose = 2)

Epoch 1/20
34/34 - 4s - 111ms/step - accuracy: 0.6919 - loss: 0.6554 - val_accuracy: 0.7627 - val_loss: 0.5319 - learning_rate: 0.0010
Epoch 2/20
34/34 - 0s - 11ms/step - accuracy: 0.7505 - loss: 0.5578 - val_accuracy: 0.7712 - val_loss: 0.4904 - learning_rate: 9.5000e-04
Epoch 3/20
34/34 - 0s - 7ms/step - accuracy: 0.7448 - loss: 0.5429 - val_accuracy: 0.7627 - val_loss: 0.5780 - learning_rate: 9.0250e-04
Epoch 4/20
34/34 - 0s - 6ms/step - accuracy: 0.7476 - loss: 0.5206 - val_accuracy: 0.7966 - val_loss: 0.4449 - learning_rate: 8.5737e-04
Epoch 5/20
34/34 - 0s - 9ms/step - accuracy: 0.7552 - loss: 0.5004 - val_accuracy: 0.8729 - val_loss: 0.4242 - learning_rate: 8.1451e-04
Epoch 6/20
34/34 - 0s - 9ms/step - accuracy: 0.7524 - loss: 0.5049 - val_accuracy: 0.8559 - val_loss: 0.4248 - learning_rate: 7.7378e-04
Epoch 7/20
34/34 - 0s - 11ms/step - accuracy: 0.7741 - loss: 0.4721 - val_accuracy: 0.8051 - val_loss: 0.4733 - learning_rate: 7.3509e-04
Epoch 8/20
34/34 - 1s - 15ms/step - accur

<keras.src.callbacks.history.History at 0x793d9da887c0>

In [None]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7460 - loss: 0.4808 
Test Loss: 0.4936150014400482
Test Accuracy: 0.7404580116271973
