In [61]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240712%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240712T060946Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8d54dfd21b3a07fce4d55abe5fa773b0a4dc0847dfd82a8aed55b1cca1031505fc47ee9d235a789aacc56854b1dae793650ea4dea932692bfab937cf187ae1099e9721737d0bf43d6c8a4150c85e71e3cd4055b8cd1808d02ff2c5e1e9a2972164e2770084002ca3fb8b82725f6182930305beb5bf433a42e1dc0b1b11ef94dca92f4ae633ddb25d70d9aca165216291ac228eda46f3223fcca33174d28ec36e05eeb305d7c30cb3ee802682b3c24fab45b6eeefa437ffabf4dda84f656e82b9892a3ac786ffd49dd9741736de22690300d21f86a10f4c02968f68e55318fc68266714ba0d21aec2d88659a05390cd4b1ae2a4f0c5fa693c76b60506e07ca08b'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every entry of DATA_SOURCE_MAPPING into a temporary file and
# unpack it under KAGGLE_INPUT_PATH/<directory>. A failed download is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split only on the first ':' -- everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent; only draw the progress
            # bar when the total size is known.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the `tarfile` module is not shadowed
                # for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading titanic, 34877 bytes compressed
Downloaded and uncompressed: titanic
Data source import complete.


In [62]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [63]:
import numpy as np
import pandas as pd
# `plt` conventionally names the pyplot interface; importing the bare
# `matplotlib` package under that alias would make calls like plt.plot() fail.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Raw frames exactly as read from disk.
train1 = pd.read_csv('/kaggle/input/titanic/train.csv')
test1 = pd.read_csv('/kaggle/input/titanic/test.csv')

# Working copies for preprocessing, so the raw frames stay available.
train = train1.copy()
test = test1.copy()

In [64]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
train.info()
print()
test.info()
print()
# Missing-value count per column.
print(train.isnull().sum())
print()
print(test.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pc

# 결측값

< train >
 - Age : 177
 - Cabin : 687
 - Embarked : 2

< test >
 - Age : 86
 - Fare : 1
 - Cabin : 327

In [65]:
# Preview the first rows of the training frame as loaded.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [66]:
# Preview the first rows of the test frame as loaded (it has no Survived column).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# **직업(Title)추출 후 Age 결측값 처리**


In [67]:
# Start from fresh copies of the raw frames. Binding `train` directly to
# `train1` would make every in-place step below modify the raw data too, and
# the cell could not be re-run (the 'Name' column would already be gone).
train = train1.copy()
test = test1.copy()

# Later cells modify both frames in place by iterating over this list.
train_test_data = [train, test]

# Numeric code per honorific; several honorifics share one code.
# NOTE(review): 'Mlle' and 'Dr' share code 3 with 'Master' -- confirm intended.
title_mapping = {
    'Mr': 0, 'Rev': 0, 'Don': 0, 'Capt': 0, 'Jonkheer': 0,
    'Miss': 1, 'Ms': 1,
    'Mrs': 2, 'Lady': 2, 'Dona': 2, 'Mme': 2, 'Countess': 2,
    'Master': 3, 'Dr': 3, 'Mlle': 3,
    'Col': 4, 'Major': 4, 'Sir': 4
}

for two_data in train_test_data:
    # The honorific is the run of letters after a space and before a '.'.
    # A raw string keeps '\.' a regex escape instead of an invalid string escape.
    two_data['Title'] = two_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Honorifics missing from the mapping become NaN.
    two_data['Title'] = two_data['Title'].map(title_mapping)

# Fill missing ages with the mean age of the passenger's title group, computed
# separately per frame. Assigning the result back avoids the chained
# `df[col].fillna(..., inplace=True)` form, which newer pandas versions warn
# about and which has no effect under Copy-on-Write.
train['Age'] = train['Age'].fillna(train.groupby('Title')['Age'].transform('mean'))
test['Age'] = test['Age'].fillna(test.groupby('Title')['Age'].transform('mean'))

# Dropped in place so the objects referenced by train_test_data stay current.
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

In [68]:
# Check the training frame after adding Title and dropping Name.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,male,35.0,0,0,373450,8.05,,S,0


In [69]:
# Check the test frame after adding Title and dropping Name.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,female,47.0,1,0,363272,7.0,,S,2
2,894,2,male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,male,27.0,0,0,315154,8.6625,,S,0
4,896,3,female,22.0,1,1,3101298,12.2875,,S,2


In [70]:
# Age를 0,1,2,3,4 로 바꾸기.


# for two_data in train_test_data:
#     two_data.loc[two_data['Age'] <=18, 'Age']=0
#     two_data.loc[(two_data['Age']>18)&(two_data['Age']<=35), 'Age']=1
#     two_data.loc[(two_data['Age']>35)&(two_data['Age']<=45), 'Age']=2
#     two_data.loc[(two_data['Age']>45)&(two_data['Age']<=60), 'Age']=3
#     two_data.loc[(two_data['Age']>60), 'Age']=4

# **Embarked 결측값 처리**
 - S가 가장 많으므로 그냥 2개 없는건 S로 처리

In [71]:
# Port of embarkation -> integer code.
embarked_mapping = {'S':0, 'C':1, 'Q':2}
# Sex -> integer code.
sex_mapping = {'male':0, 'female':1}

for two_data in train_test_data:
    # Missing ports are treated as 'S' before encoding.
    two_data['Embarked'] = two_data['Embarked'].fillna('S').map(embarked_mapping)
    two_data['Sex'] = two_data['Sex'].map(sex_mapping)

In [72]:
# Check the training frame after encoding Sex and Embarked as integers.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,0,0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,1,2
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,0,1
3,4,1,1,1,35.0,1,0,113803,53.1,C123,0,2
4,5,0,3,0,35.0,0,0,373450,8.05,,0,0


In [73]:
# Number of training passengers per encoded port (0 = S, 1 = C, 2 = Q).
train['Embarked'].value_counts()

Embarked
0    646
1    168
2     77
Name: count, dtype: int64

# **Fare(요금) 결측치 중앙값으로 대체**

In [74]:
# Fill missing fares with the median fare of the passenger's class, computed
# separately per frame. Assigning the result back avoids the chained
# `df[col].fillna(..., inplace=True)` form, which newer pandas versions warn
# about and which has no effect under Copy-on-Write.
train['Fare'] = train['Fare'].fillna(train.groupby('Pclass')['Fare'].transform('median'))
test['Fare'] = test['Fare'].fillna(test.groupby('Pclass')['Fare'].transform('median'))

In [75]:
# Fare를 0,1,2,3,4로 분류

# for two_data in train_test_data:
#     two_data.loc[two_data['Fare'] <=5, 'Fare']=0
#     two_data.loc[(two_data['Fare']>5)&(two_data['Fare']<=15), 'Fare']=1
#     two_data.loc[(two_data['Fare']>15)&(two_data['Fare']<=30), 'Fare']=2
#     two_data.loc[(two_data['Fare']>30)&(two_data['Fare']<=100), 'Fare']=3
#     two_data.loc[(two_data['Fare']>100), 'Fare']=4

# **Cabin 결측치 대치 및 매핑**

In [76]:
# Reduce each cabin value to its first character; missing values stay missing.
for two_data in train_test_data:
    two_data['Cabin'] = two_data['Cabin'].str.slice(stop=1)

# How many training passengers fall under each letter.
train['Cabin'].value_counts()

Cabin
C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: count, dtype: int64

In [77]:
# Cabin letter -> integer code, built group by group.
cabin_mapping = {
    **dict.fromkeys('ABCT', 2),
    **dict.fromkeys('DG', 1),
    **dict.fromkeys('EF', 0),
}

# Replace the letters with their codes in both frames.
for two_data in train_test_data:
    two_data['Cabin'] = two_data['Cabin'].map(cabin_mapping)

In [78]:
# Fill missing cabin codes with the median code of the passenger's class,
# computed separately per frame. Assigning the result back avoids the chained
# `df[col].fillna(..., inplace=True)` form, which newer pandas versions warn
# about and which has no effect under Copy-on-Write.
train['Cabin'] = train['Cabin'].fillna(train.groupby('Pclass')['Cabin'].transform('median'))
test['Cabin'] = test['Cabin'].fillna(test.groupby('Pclass')['Cabin'].transform('median'))

# **Familysize 구하기. Sibsp + Parch + 1**

In [79]:
# Family size = siblings/spouses + parents/children + the passenger.
# Each frame must use its own SibSp column: taking it from `train` for the
# test frame would index-align unrelated passengers' values onto test rows.
train['Familysize'] = train['SibSp'] + train['Parch'] + 1
test['Familysize'] = test['SibSp'] + test['Parch'] + 1

In [80]:
# Familysize 0,1,2,3으로 분류

# for two_data in train_test_data:
#     two_data.loc[two_data['Familysize'] <=1, 'Familysize']=0
#     two_data.loc[(two_data['Familysize']>1)&(two_data['Familysize']<=2), 'Familysize']=1
#     two_data.loc[(two_data['Familysize']>2)&(two_data['Familysize']<=5), 'Familysize']=2
#     two_data.loc[(two_data['Familysize']>5), 'Familysize']=3

In [81]:
# Check the training frame after adding Familysize.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Familysize
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,0.0,0,0,2
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,2.0,1,2,2
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,0.0,0,1,1
3,4,1,1,1,35.0,1,0,113803,53.1,2.0,0,2,2
4,5,0,3,0,35.0,0,0,373450,8.05,0.0,0,0,1


In [82]:
# Missing-value handling is done: count the remaining nulls per column.
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Familysize     0
dtype: int64

In [84]:
# Columns removed from both frames before modelling.
feature_drop = ['Ticket', 'SibSp', 'Parch']

# PassengerId is removed from the training frame only; the test frame keeps it.
train = train.drop(columns=feature_drop + ['PassengerId'])
test = test.drop(columns=feature_drop)

# Split the training frame into the target and the feature matrix.
train_y = train['Survived']
train_x = train.drop(columns='Survived')

In [85]:
# Preview the feature matrix used for training.
train_x.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,Familysize
0,3,0,22.0,7.25,0.0,0,0,2
1,1,1,38.0,71.2833,2.0,1,2,2
2,3,1,26.0,7.925,0.0,0,1,1
3,1,1,35.0,53.1,2.0,0,2,2
4,3,0,35.0,8.05,0.0,0,0,1


In [86]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 10-fold splitter with a fixed seed, reused by the model cells below.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# k-nearest-neighbours with 50 neighbours: mean cross-validated accuracy in percent.
scoring1 = 'accuracy'
K_clf = KNeighborsClassifier(n_neighbors=50)
score = cross_val_score(K_clf, train_x, train_y, scoring=scoring1, cv=k_fold)
print(round(score.mean() * 100, 2))

69.36


In [87]:
# Decision tree: mean cross-validated accuracy in percent.
# random_state is fixed so the printed score is the same on every run.
d_clf = DecisionTreeClassifier(random_state=0)
scoring1 = 'accuracy'
score = cross_val_score(d_clf, train_x, train_y, cv=k_fold, scoring=scoring1)
print(round(np.mean(score) * 100, 2))

78.23


In [88]:
# Random forest with 15 trees: mean cross-validated accuracy in percent.
# random_state is fixed so the printed score is the same on every run.
r_clf = RandomForestClassifier(n_estimators=15, random_state=0)
scoring1 = 'accuracy'
score = cross_val_score(r_clf, train_x, train_y, cv=k_fold, scoring=scoring1)
print(round(np.mean(score) * 100, 2))

81.37


In [89]:
# Gaussian naive Bayes: mean cross-validated accuracy in percent.
scoring1 = 'accuracy'
g_clf = GaussianNB()
score = cross_val_score(g_clf, train_x, train_y, scoring=scoring1, cv=k_fold)
print(round(score.mean() * 100, 2))

79.68


In [90]:
# Support vector classifier with default settings: mean cross-validated
# accuracy in percent.
scoring1 = 'accuracy'
s_clf = SVC()
score = cross_val_score(s_clf, train_x, train_y, scoring=scoring1, cv=k_fold)
print(round(score.mean() * 100, 2))

67.9


In [94]:
# Test features: everything except the identifier column. `test_x` is also
# used by the XGBoost prediction cell.
test_x = test.drop(columns='PassengerId').copy()

# Fit the SVC on the full training set and predict the test passengers.
s_clf.fit(train_x, train_y)
prediction = s_clf.predict(test_x)

# submission = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':prediction})
# submission.to_csv('submission.csv', index=False)

In [91]:
from xgboost import XGBClassifier
# XGBoost with 10 boosting rounds and a fixed seed: mean cross-validated
# accuracy in percent.
scoring1 = 'accuracy'
x_clf = XGBClassifier(n_estimators=10, random_state=123)
score = cross_val_score(x_clf, train_x, train_y, scoring=scoring1, cv=k_fold)
print(round(score.mean() * 100, 2))

84.4


In [95]:
# Fit the final XGBoost model on the full training set.
# NOTE(review): this uses n_estimators=20 while the cross-validated model used
# 10 -- confirm the difference is intentional.
x_model = XGBClassifier(n_estimators=20, random_state=123)
x_model.fit(train_x, train_y)

# Probability of the positive class (second column) for each test passenger.
pred = x_model.predict_proba(test_x)[:, 1]

# Label 1 when the probability is strictly above 0.5, otherwise 0.
pred_label = (pred > 0.5).astype(int)

# submission = pd.DataFrame({'PassengerId' : test['PassengerId'], 'Survived' : pred_label})
# submission.to_csv('submission_xgboost_chang_hyeon_park.csv',index=False)

In [15]:
# Count of passengers per distinct age, listed from oldest to youngest.
# NOTE(review): this cell's execution count (15) is lower than the
# preprocessing cells above, so the recorded output presumably predates the
# Age imputation -- confirm before relying on it.
train['Age'].value_counts().sort_index(ascending=False)

Age
80.00    1
74.00    1
71.00    2
70.50    1
70.00    2
        ..
0.92     1
0.83     2
0.75     2
0.67     1
0.42     1
Name: count, Length: 88, dtype: int64

In [27]:
# 18세 이하의 사람들 보기 위한 코드

# # Filter rows where Age is 18 or less
# young_children = train[train['Age'] <= 18]

# # Print the filtered DataFrame
# young_children

In [26]:
# Filter rows where Age is 18 or less
# Read the raw training data again: by this point `train` has had its missing
# Cabin values filled in, so counting nulls on it would always report zero.
raw_train = pd.read_csv('/kaggle/input/titanic/train.csv')

# Passengers aged 18 or younger (rows with a missing Age are excluded by the comparison).
young_children = raw_train[raw_train['Age'] <= 18]

# How many of them lack / have a recorded cabin.
nan_cabin_count = young_children['Cabin'].isnull().sum()
non_nan_cabin_count = young_children['Cabin'].notnull().sum()

print("Number of young children with NaN in Cabin:", nan_cabin_count)
print("Number of young children with non-NaN in Cabin:", non_nan_cabin_count)


Number of young children with NaN in Cabin: 116
Number of young children with non-NaN in Cabin: 23


In [17]:
# Filter rows where Age is strictly less than 1
# Read the raw training data again: the preprocessed `train` no longer has the
# PassengerId, Parch and Name columns this cell needs.
raw_train = pd.read_csv('/kaggle/input/titanic/train.csv')

# Passengers younger than one year.
infants = raw_train[raw_train['Age'] < 1]

# Get the PassengerIds of infants
infant_ids = infants['PassengerId'].tolist()

# Despite the variable name, this selects the infants themselves that have
# Parch > 0 (at least one parent/child aboard), not the parents' rows.
parents_of_infants = raw_train[(raw_train['Parch'] > 0) & (raw_train['PassengerId'].isin(infant_ids))]

# Print the selected infants
print(parents_of_infants[['PassengerId', 'Name', 'Age', 'Parch']])


     PassengerId                             Name   Age  Parch
78            79    Caldwell, Master. Alden Gates  0.83      2
305          306   Allison, Master. Hudson Trevor  0.92      2
469          470    Baclini, Miss. Helene Barbara  0.75      1
644          645           Baclini, Miss. Eugenie  0.75      1
755          756        Hamalainen, Master. Viljo  0.67      1
803          804  Thomas, Master. Assad Alexander  0.42      1
831          832  Richards, Master. George Sibley  0.83      1


In [18]:
# Read the raw training data again: the preprocessed `train` no longer has the
# PassengerId, Parch and Name columns this cell needs.
raw_train = pd.read_csv('/kaggle/input/titanic/train.csv')

# Filter rows where Age is less than 18 (children)
children = raw_train[raw_train['Age'] < 18]

# Get the PassengerIds of children
child_ids = children['PassengerId'].tolist()

# Despite the variable name, this selects the children themselves that have
# Parch > 0 (at least one parent/child aboard), not the parents' rows.
parents_of_children = raw_train[(raw_train['Parch'] > 0) & (raw_train['PassengerId'].isin(child_ids))]

# Print the selected children
print(parents_of_children[['PassengerId', 'Name', 'Age', 'Parch']])


     PassengerId                                      Name    Age  Parch
7              8            Palsson, Master. Gosta Leonard   2.00      1
10            11           Sandstrom, Miss. Marguerite Rut   4.00      1
16            17                      Rice, Master. Eugene   2.00      1
24            25             Palsson, Miss. Torborg Danira   8.00      1
43            44  Laroche, Miss. Simonne Marie Anne Andree   3.00      2
..           ...                                       ...    ...    ...
831          832           Richards, Master. George Sibley   0.83      1
850          851   Andersson, Master. Sigvard Harald Elias   4.00      2
852          853                   Boulos, Miss. Nourelain   9.00      1
853          854                 Lines, Miss. Mary Conover  16.00      1
869          870           Johnson, Master. Harold Theodor   4.00      1

[81 rows x 4 columns]


In [20]:
# Summary statistics of the Age column.
# NOTE(review): the recorded output has count 714, the raw non-null Age count,
# so it was presumably produced before missing ages were imputed; running
# this after the preprocessing cells will give different numbers -- confirm.
train['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [13]:
# Frequency of each Cabin value.
# NOTE(review): the recorded output lists full cabin strings (e.g. "B96 B98"),
# so it was presumably produced before Cabin was reduced to a numeric code;
# running this after the preprocessing cells will show the codes -- confirm.
train['Cabin'].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [24]:
# Check whether any PassengerId occurs more than once (False = all unique).
# The raw frame `train1` is used because PassengerId has been dropped from
# `train` by the feature-selection cell.
train1['PassengerId'].duplicated().any()
# PassengerId has no duplicate values

False