In [None]:
import pandas as pd

# Load both Titanic splits from the /content directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Load gender_submission.csv and display it (pandas truncates the middle rows).
gender_submission = pd.read_csv(filepath_or_buffer='/content/gender_submission.csv')
gender_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:

# Replace missing training ages with the mean of the observed ages.
# NOTE(review): cells further down that slice on Age will see these imputed
# values as if they were observed ages — confirm that is intended.
age_mean = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(value=age_mean)

In [None]:
# Survival rate by sex on the training split.
# The row mask and the column are selected in a single .loc step (no chained
# indexing), and the rate is the vectorized Series.mean() instead of a
# Python-level sum()/len() over the Series. Survived holds 0/1 flags in the
# rows previewed earlier, so its mean is the fraction of survivors.
female = train_df.loc[train_df.Sex == 'female', "Survived"]
male = train_df.loc[train_df.Sex == 'male', "Survived"]

print(f'female : {female.mean()}')
print(f'male : {male.mean()}')

female : 0.7420382165605095
male : 0.18890814558058924


In [None]:
# Survival rate by passenger class (Pclass 1 / 2 / 3) on the training split.
# Single-step .loc[mask, column] selection replaces chained indexing, and the
# vectorized Series.mean() replaces the Python-level sum()/len().
first_class = train_df.loc[train_df.Pclass == 1, "Survived"]
second_class = train_df.loc[train_df.Pclass == 2, "Survived"]
third_class = train_df.loc[train_df.Pclass == 3, "Survived"]

print(f'first class = {first_class.mean()}')
print(f'second class = {second_class.mean()}')
print(f'third class = {third_class.mean()}')

first class = 0.6296296296296297
second class = 0.47282608695652173
third class = 0.24236252545824846


In [None]:
# Survival rate for passengers with Age < 50 versus Age >= 50 (training split).
# Single-step .loc[mask, column] selection replaces chained indexing, and the
# vectorized Series.mean() replaces the Python-level sum()/len().
# NOTE(review): Age was mean-imputed in an earlier cell, so rows whose age was
# originally missing are counted here under the imputed value — confirm that
# is acceptable for this comparison.
age_under50 = train_df.loc[train_df.Age < 50, "Survived"]
age_over50 = train_df.loc[train_df.Age >= 50, "Survived"]  # includes exactly 50

print(f'under 50 = {age_under50.mean()}')
print(f'over 50 = {age_over50.mean()}')

under 50 = 0.38555691554467564
over 50 = 0.36486486486486486


In [None]:
# Survival rate among training passengers with SibSp >= 1.
# Single-step .loc[mask, column] selection replaces chained indexing, and the
# vectorized Series.mean() replaces the Python-level sum()/len().
sibsp = train_df.loc[train_df.SibSp >= 1, 'Survived']

print(f'relation of siblings on board {sibsp.mean()}')

relation of siblings on board 0.4664310954063604


In [None]:
# Survival rate among training passengers with Parch >= 1.
# Single-step .loc[mask, column] selection replaces chained indexing, and the
# vectorized Series.mean() replaces the Python-level sum()/len().
parent = train_df.loc[train_df.Parch >= 1, 'Survived']

print(f'relation of parents on board {parent.mean()}')

relation of parents on board 0.5117370892018779


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest trained on passenger class and sex only.
y = train_df["Survived"]

features = ["Pclass", "Sex"]
X = pd.get_dummies(train_df[features])
# The test split is one-hot encoded on its own, so force it onto the training
# columns (same set, same order). A dummy column that never occurs in the test
# split is created and filled with zeros.
X_test = pd.get_dummies(test_df[features]).reindex(columns=X.columns, fill_value=0)

# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
# Write to the working directory via a relative path. The previous
# '//submission.csv' pointed at the filesystem root, which needs root
# permissions and is inconsistent with the other submission file this
# notebook writes.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [None]:
# Count missing values per candidate feature column in each split.
candidate_cols = ['SibSp', 'Parch', 'Fare', 'Embarked']
train_missing = train_df[candidate_cols].isnull().sum()
test_missing = test_df[candidate_cols].isnull().sum()

print("Missing values in train_df:\n", train_missing)
print("\nMissing values in test_df:\n", test_missing)

Missing values in train_df:
 SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

Missing values in test_df:
 SibSp       0
Parch       0
Fare        1
Embarked    0
dtype: int64


In [None]:
# Fill missing fares in both splits with the mean fare of the training split,
# so the test split is imputed with a statistic computed on training data only.
fare_mean = train_df['Fare'].mean()
train_df['Fare'] = train_df['Fare'].fillna(value=fare_mean)
test_df['Fare'] = test_df['Fare'].fillna(value=fare_mean)

# Confirm nothing is left unfilled.
train_fare_missing = train_df['Fare'].isna().sum()
test_fare_missing = test_df['Fare'].isna().sum()
print("Missing 'Fare' values in train_df after imputation:", train_fare_missing)
print("Missing 'Fare' values in test_df after imputation:", test_fare_missing)

Missing 'Fare' values in train_df after imputation: 0
Missing 'Fare' values in test_df after imputation: 0


In [None]:
# Fill missing Embarked values in both splits with the most frequent value of
# the training split (the first entry returned by mode()).
embarked_mode = train_df['Embarked'].mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(value=embarked_mode)
test_df['Embarked'] = test_df['Embarked'].fillna(value=embarked_mode)

# Confirm nothing is left unfilled.
train_embarked_missing = train_df['Embarked'].isna().sum()
test_embarked_missing = test_df['Embarked'].isna().sum()
print("Missing 'Embarked' values in train_df after imputation:", train_embarked_missing)
print("Missing 'Embarked' values in test_df after imputation:", test_embarked_missing)

Missing 'Embarked' values in train_df after imputation: 0
Missing 'Embarked' values in test_df after imputation: 0


In [None]:
# Expanded feature set: one-hot encode the categorical columns (Sex, Embarked)
# and keep the remaining selected columns as they are.
features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
X_train_encoded = pd.get_dummies(train_df[features], columns=['Sex', 'Embarked'])
X_test_encoded = pd.get_dummies(test_df[features], columns=['Sex', 'Embarked'])

# The two splits are encoded independently, so a category present in only one
# of them would give the frames different columns. Align the test frame to the
# training columns (same set, same order); dummy columns missing from the test
# split are added as all zeros, and columns unseen in training are dropped.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print("Shape of X_train_encoded:", X_train_encoded.shape)
print("Shape of X_test_encoded:", X_test_encoded.shape)
print("Columns of X_train_encoded:\n", X_train_encoded.columns)
print("Columns of X_test_encoded:\n", X_test_encoded.columns)

Shape of X_train_encoded: (891, 8)
Shape of X_test_encoded: (418, 8)
Columns of X_train_encoded:
 Index(['Pclass', 'SibSp', 'Parch', 'Sex_female', 'Sex_male', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')
Columns of X_test_encoded:
 Index(['Pclass', 'SibSp', 'Parch', 'Sex_female', 'Sex_male', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest with a fixed random_state, to be scored on the expanded features.
model_rf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=1)

# Score the estimator with 5-fold cross-validation on the training data.
cv_scores = cross_val_score(estimator=model_rf, X=X_train_encoded, y=y, cv=5)

# Summarise the per-fold scores.
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_mean:.4f}")
print(f"Standard deviation of cross-validation scores: {cv_std:.4f}")

Cross-validation scores: [0.80446927 0.79775281 0.81460674 0.79213483 0.8258427 ]
Mean cross-validation score: 0.8070
Standard deviation of cross-validation scores: 0.0120


In [None]:
# Fit on the full encoded training set, then predict the encoded test split.
predictions_rf = model_rf.fit(X_train_encoded, y).predict(X_test_encoded)

# Build the two-column submission frame and write it to the working directory.
output_rf = pd.DataFrame(
    data={'PassengerId': test_df.PassengerId, 'Survived': predictions_rf}
)
output_rf.to_csv('submission_random_forest_expanded.csv', index=False)
print("Your RandomForestClassifier submission with expanded features was successfully saved!")

Your RandomForestClassifier submission with expanded features was successfully saved!
