# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets
import torchvision.transforms as transforms
import os
import time
import sys
import torch.quantization

# Load the training and test splits (paths are hard-coded for the author's machine).
train_df = pd.read_csv('/home/manami_furukawa/taitanic/train.csv')
test_df = pd.read_csv('/home/manami_furukawa/taitanic/test.csv')

# Stack both splits (train rows first, then test rows) so names are parsed once.
all_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

# Fill missing ages using the honorific (title) embedded in each name.
# Names are expected to look like "Family, Title. Given names".  Split once on
# the first comma and once on the first period after it, so that later periods
# (e.g. initials) stay inside the given-name part and the three columns really
# are family name / honorific / given names.
name_parts = all_df["Name"].str.split(",", n=1, expand=True)
title_parts = name_parts[1].str.split(".", n=1, expand=True)
name_df = pd.DataFrame({
    "family_name": name_parts[0].str.strip(),
    "honorific": title_parts[0].str.strip(),
    "name": title_parts[1].str.strip(),
})

all_df = pd.concat([all_df, name_df], axis=1)

# Mean age per honorific over both splits (notebook-style inspection; the
# result is not stored).
all_df[["Age", "honorific"]].groupby("honorific").mean()

# Attach the parsed name columns to each split.  name_df follows all_df's row
# order, so the training rows are its head and the test rows are its tail.
n_train = len(train_df)
train_df = pd.concat([train_df, name_df.iloc[:n_train].reset_index(drop=True)], axis=1)
test_df = pd.concat([test_df, name_df.iloc[n_train:].reset_index(drop=True)], axis=1)


def _fill_age_by_honorific(df):
    """Return a copy of df whose missing Age values are the mean Age of the row's honorific."""
    df = df.copy()
    # transform("mean") gives every row the mean of its own honorific group,
    # computed within this frame only.
    honorific_age = df.groupby("honorific")["Age"].transform("mean")
    df["Age"] = df["Age"].fillna(honorific_age)
    return df


def _collapse_rare_honorifics(df, keep=("Mr", "Miss", "Mrs", "Master")):
    """Return a copy of df where every honorific outside `keep` is replaced by "other"."""
    df = df.copy()
    df.loc[~df["honorific"].isin(keep), "honorific"] = "other"
    return df


# Impute ages per split (each split uses its own per-honorific means), then
# tidy up the honorific column.
train_df = _collapse_rare_honorifics(_fill_age_by_honorific(train_df))
test_df = _collapse_rare_honorifics(_fill_age_by_honorific(test_df))

#print(train_df.columns)


def process_df(df):
    """Return a copy of a passenger frame reduced to numeric model inputs.

    Drops the identifier/text columns, fills missing ``Age`` and ``Fare``
    values with the respective column mean of ``df``, and encodes ``Sex`` as
    0 (male) / 1 (female).

    Args:
        df: DataFrame that contains every dropped column (``PassengerId``,
            ``Name``, ``Ticket``, ``Cabin``, ``Embarked``, ``family_name``,
            ``name``, ``honorific``) and an ``Age`` column.  ``Fare`` and
            ``Sex`` are processed when present.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.drop(
        ["PassengerId", "Name", "Ticket", "Cabin", "Embarked",
         "family_name", "name", "honorific"],
        axis=1,
    )
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    # A missing fare would otherwise reach the network as NaN and turn that
    # row's outputs into NaN as well.
    if "Fare" in df.columns:
        df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    # Encode only the Sex column, with an explicit mapping, instead of a
    # frame-wide string replace that relies on implicit dtype downcasting.
    # Values other than "male"/"female" become NaN.
    if "Sex" in df.columns:
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

    return df

# Run the shared column clean-up (process_df) on both splits; the processed
# frames replace the originals.
train_df = process_df(train_df)
test_df = process_df(test_df)


class Dataset:
    """Row-indexable view of a DataFrame that has a ``Survived`` label column.

    Attributes:
        df: the frame passed to the constructor.
        X: ``df`` without the ``Survived`` column (the features).
        Y: the ``Survived`` column (the labels).
    """

    def __init__(self, df):
        self.df = df
        self.Y = df["Survived"]
        self.X = df.drop(columns=["Survived"])

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``.

        The features come back as a NumPy array and the label as the scalar
        stored in ``Survived``.
        """
        features = self.X.iloc[idx].values
        label = self.Y.iloc[idx]
        return features, label



# Wrap the processed training frame so DataLoader can fetch (features, label)
# pairs by row position.
train_dataset = Dataset(train_df)

# Batches of BATCH_SIZE rows, shuffled; drop_last=True discards the final
# batch when it has fewer than BATCH_SIZE rows.
BATCH_SIZE = 32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

class Net(nn.Module):
    """Two-layer fully connected network with a sigmoid hidden activation.

    Args:
        input_sz: number of input features per sample.
        hidden_sz: width of the hidden layer.
        out_sz: number of output scores per sample.
    """

    def __init__(self, input_sz, hidden_sz, out_sz):
        super().__init__()
        # Layers are created in this order so parameter initialisation
        # consumes the random generator the same way every time.
        self.f1 = nn.Linear(input_sz, hidden_sz)
        self.f2 = nn.Linear(hidden_sz, out_sz)

    def forward(self, x):
        """Return the raw (un-normalised) output scores for the batch ``x``."""
        hidden = torch.sigmoid(self.f1(x))
        return self.f2(hidden)

# Layer sizes handed to Net(input_sz, hidden_sz, out_sz).  input_sz has to
# match the number of feature columns fed to the network; out_sz = 2 gives one
# score per class, and test() picks the larger of the two.
input_sz = 6
hidden_sz = 3
out_sz = 2
net = Net(input_sz, hidden_sz, out_sz)

# Optimisation settings used by train(): summed squared error against one-hot
# targets, Adam with a fixed learning rate, and `epoch` passes over the
# training DataLoader.
learning_rate = 0.01
loss_func = nn.MSELoss(reduction="sum")
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
epoch = 32

def convert_label_to_onehot(labels, num_classes=None):
    """Convert a 1-D tensor of integer class labels to a float one-hot matrix.

    Args:
        labels: tensor of non-negative integer class indices.
        num_classes: width of the result.  When omitted it is inferred as
            ``labels.max() + 1`` (the historical behaviour), which needs a
            non-empty ``labels`` and yields a narrower matrix whenever the
            largest class is absent from the batch.  Pass it explicitly when
            the result must line up with a fixed-width model output.

    Returns:
        A ``torch.float32`` tensor of shape ``(len(labels), num_classes)``
        with a single 1 per row.
    """
    labels = torch.as_tensor(labels).long()
    if num_classes is None:
        num_classes = int(labels.max().item()) + 1
    # F.one_hot does the per-row scatter in one vectorised call.
    return F.one_hot(labels, num_classes=num_classes).float()

def train():
    """Train the module-level ``net`` for ``epoch`` passes over ``train_dataloader``.

    Uses the module-level ``loss_func`` and ``optimizer``.  Each batch's
    integer labels are expanded to one-hot targets with the same width as the
    network output.  Returns nothing; ``net`` is updated in place.
    """
    for _ in range(epoch):
        for X, labels in train_dataloader:
            y = net(X.float())
            # Size the target from the network output rather than from the
            # largest label in the batch: a batch that happens to lack the
            # highest class would otherwise produce a narrower target that
            # the loss silently broadcasts against y.
            T = F.one_hot(labels.long(), num_classes=y.shape[1]).float()
            loss = loss_func(y, T)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


# Fit the network before predictions are made; train() updates `net` in place.
train()

def test():
    """Predict survival for the module-level ``test_df`` and build the submission frame.

    Runs the module-level ``net`` on every row of ``test_df``, takes the index
    of the larger output score as the predicted class, and pairs it with the
    ``PassengerId`` column read from ``gender_submission.csv``.

    Returns:
        DataFrame with columns ``PassengerId`` and ``Survived``; it is also
        printed.
    """
    test_X = torch.from_numpy(test_df.to_numpy(dtype=np.float32))
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        test_Y = net(test_X)
    # Give pandas a NumPy integer array rather than a torch.Tensor so the
    # Survived column holds plain integers.
    survived = test_Y.argmax(dim=1).numpy()
    test_paID = pd.read_csv('/home/manami_furukawa/taitanic/gender_submission.csv')['PassengerId']
    sub_df = pd.DataFrame({"PassengerId": test_paID.values, "Survived": survived})
    print(sub_df)
    return sub_df

#print(train_df.columns)
# Build the prediction table and write it, without the index column, to
# submission.csv in the current working directory.
sub_df = test()
sub_df.to_csv("./submission.csv", index=False)

