In [79]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# train = pd.read_csv("https://raw.githubusercontent.com/AMOGHA1140/kaggle_contents/main/titanic/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/AMOGHA1140/kaggle_contents/main/titanic/test.csv")


In [92]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Age          889 non-null    float64
 4   SibSp        889 non-null    float64
 5   Parch        889 non-null    int64  
 6   Fare         889 non-null    float64
 7   Sex_female   889 non-null    bool   
 8   Sex_male     889 non-null    bool   
 9   Embarked_C   889 non-null    bool   
 10  Embarked_Q   889 non-null    bool   
 11  Embarked_S   889 non-null    bool   
dtypes: bool(5), float64(3), int64(4)
memory usage: 59.9 KB


In [93]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,446.0,0.382452,2.311586,2.717486e-16,-1.44925,0.382452,-2.882334e-16
std,256.998173,0.48626,0.8347,1.000563,1.283687,0.806761,1.000563
min,1.0,0.0,1.0,-2.226536,-2.302585,0.0,-3.059355
25%,224.0,0.0,2.0,-0.5636741,-2.302585,0.0,-0.7996695
50%,446.0,0.0,3.0,-0.1013399,-2.302585,0.0,-0.2286411
75%,668.0,1.0,3.0,0.4380499,0.09531,0.0,0.5238932
max,891.0,1.0,3.0,3.905556,2.091864,6.0,3.393172


In [94]:
train = pd.read_csv("https://raw.githubusercontent.com/AMOGHA1140/kaggle_contents/main/titanic/train.csv")

def preprocess(data):

  #drop ticket and name as they're useless
  data = data.drop(['Ticket', 'Name', 'Cabin'], axis=1)

  #remove the only 2 existing null values from Embarked
  data = data.dropna(subset=['Embarked'])

  #fill empty values of age and fare
  data['Age'] = data['Age'].fillna(data['Age'].median())
  data['Fare'].fillna(data['Fare'].median(), inplace=True)


  #one hot encode Sex and Embarked
  encoded_data = pd.get_dummies(data[['Sex', 'Embarked']])

  data = pd.concat([data, encoded_data], axis=1)
  data = data.drop(['Sex', 'Embarked'], axis=1)


  #normalize into gaussian distribution
  # data['Fare'] = np.log(data['Fare'] + 1)
  # data['SibSp'] = np.log(data['SibSp'] + 0.1)

  #scale the data
  scaler = StandardScaler()
  data['Age'] = scaler.fit_transform(data[['Age']])
  data['Fare'] = scaler.fit_transform(data[['Fare']])

  return data


train = preprocess(train)

# test = preprocess(test)
# print(train.columns)
train.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].median(), inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,-0.563674,1,0,-0.50024,False,True,False,False,True
1,2,1,1,0.669217,1,0,0.788947,True,False,True,False,False
2,3,1,3,-0.255451,0,0,-0.48665,True,False,False,False,True
3,4,1,1,0.43805,1,0,0.422861,True,False,False,False,True
4,5,0,3,0.43805,0,0,-0.484133,False,True,False,False,True


In [95]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(['Survived', 'PassengerId'], axis=1), train['Survived'], test_size=0.2, random_state=42)

In [96]:
model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
accuracy_score(y_test, predictions)



0.7752808988764045

In [76]:
test = pd.read_csv("https://raw.githubusercontent.com/AMOGHA1140/kaggle_contents/main/titanic/test.csv")

test = preprocess(test)
submissions = model.predict(test.drop('PassengerId', axis=1))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].median(), inplace=True)


In [77]:
df = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': submissions})
df.to_csv('submission2.csv', index=False)

In [97]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

In [104]:
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2),
    Dropout(0.5),

    Dense(32, activation='relu', kernel_regularizer=l2),
    Dropout(0.5),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=BinaryCrossentropy(),
    metrics=['accuracy']
)

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)












Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.5523 - loss: 1.4114 - val_accuracy: 0.6643 - val_loss: 1.3176
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6027 - loss: 1.3472 - val_accuracy: 0.6643 - val_loss: 1.3032
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6080 - loss: 1.3473 - val_accuracy: 0.6643 - val_loss: 1.2889
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6153 - loss: 1.3402 - val_accuracy: 0.6643 - val_loss: 1.2753
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5909 - loss: 1.3351 - val_accuracy: 0.6643 - val_loss: 1.2625
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5956 - loss: 1.3159 - val_accuracy: 0.6643 - val_loss: 1.2497
Epoch 7/100
[1m18/18[0m [32m━━

<keras.src.callbacks.history.History at 0x7dabce38add0>

In [105]:
submissions = model.predict(X_test)
submissions = np.where(submissions > 0.5, 1, 0)

accuracy_score(y_test, submissions)
#0.7808988764044944



[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


0.8202247191011236

In [107]:
test = pd.read_csv("https://raw.githubusercontent.com/AMOGHA1140/kaggle_contents/main/titanic/test.csv")
test = preprocess(test)
submission = model.predict(test.drop('PassengerId', axis=1))
submission = np.where(submission > 0.5, 1, 0)
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': submission.flatten()})
submission.to_csv('submission3.csv', index=False)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].median(), inplace=True)


In [78]:
# def preprocess(data):

#   #remove useless data
#   try:
#     data.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)
#   except:
#     None

#   #one-hot-encode Sex and Embarked
#   # enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
#   encoded_data = pd.get_dummies(data[['Sex', 'Embarked']])
#   data = pd.concat([data, encoded_data], axis=1)
#   data.drop(['Sex', 'Embarked'], axis=1, inplace=True)
#   # data['Embarked'] = enc.fit_transform(data[['Embarked']])

#   #scale
#   data['Age'].fillna(data['Age'].median(), inplace=True)
#   data['Fare'].fillna(data['Fare'].median(), inplace=True)

#   scaler = StandardScaler()
#   data['Age'] = scaler.fit_transform(data[['Age']])
#   data['Fare'] = scaler.fit_transform(data[['Fare']])

#   return data



# train = preprocess(train)
# # train.describe()
# train.head()
# # train.columns'

KeyError: "None of [Index(['Sex', 'Embarked'], dtype='object')] are in the [columns]"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(['Survived', 'PassengerId'], axis=1), train['Survived'], test_size=0.2, random_state=42)

In [None]:
model1 = LogisticRegression()
model1.fit(X_train, y_train)
predictions = model1.predict(X_test)

print(len(predictions))

179


In [None]:
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"F1 Score: {f1_score(y_test, predictions)}")

Accuracy: 0.8100558659217877
F1 Score: 0.7638888888888888


In [None]:
test = preprocess(test)
test.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].mean(), inplace=True)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,0.334993,0,0,-0.498407,False,True,False,True,False
1,893,3,1.32553,1,0,-0.513274,True,False,False,False,True
2,894,2,2.514175,0,0,-0.465088,False,True,False,True,False
3,895,3,-0.25933,0,0,-0.483466,False,True,False,False,True
4,896,3,-0.655545,1,1,-0.418471,True,False,False,False,True


In [None]:
submission = model1.predict(test.drop('PassengerId', axis=1))

submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': submission})
submission.to_csv('submission.csv', index=False)
print(submission)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
