In [1]:
import pandas as pd   #data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Print the full path of every file found beneath the Kaggle input directory.
import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
# Read the Kaggle Titanic training and test CSV files.
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'
data = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Keep the test passenger ids aside: clean() drops the PassengerId column,
# but the ids are needed again when the submission frame is built.
test_ids = test["PassengerId"]

def clean(data):
    """Return a cleaned copy of a raw Titanic passenger frame.

    The Ticket, PassengerId, Name and Cabin columns are dropped, missing
    values in SibSp, Parch, Fare and Age are replaced by that column's
    median, and missing Embarked values are replaced by the placeholder "U".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns Ticket, PassengerId, Name,
        Cabin, SibSp, Parch, Fare, Age and Embarked.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Notes
    -----
    Medians are computed from the frame that is passed in, so each call
    imputes with its own statistics.
    """
    # drop() returns a new frame, so the caller's frame is left untouched.
    # These columns were judged to carry little useful information.
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)

    numeric_cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in numeric_cols:
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on the selection: the in-place form acts on a
        # temporary object, which warns in pandas 2.x and silently leaves the
        # NaNs in place once Copy-on-Write is enabled.
        data[col] = data[col].fillna(data[col].median())

    # "U" marks an unknown port of embarkation.
    data["Embarked"] = data["Embarked"].fillna("U")
    return data

# Run both splits through the same cleaning routine.
data, test = clean(data), clean(test)

In [4]:
data.head()  # preview the first five rows of the cleaned training frame

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# Map the string-valued columns to integer codes with scikit-learn.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
columns = ["Sex", "Embarked"]

for col in columns:
    # Learn the label -> integer mapping from the training column only, then
    # apply that same mapping to both the training and the test column.
    le.fit(data[col])
    data[col] = le.transform(data[col])
    test[col] = le.transform(test[col])
    # A label's position in classes_ is its integer code,
    # e.g. 'female' -> 0 and 'male' -> 1.
    print(le.classes_)

data.head(5)

['female' 'male']
['C' 'Q' 'S' 'U']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [6]:
# Hold out part of the training data as a validation set so the logistic
# regression model can be scored on rows it was not fitted on.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is "Survived".
X = data.drop(columns=["Survived"])
y = data["Survived"]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the logistic regression classifier with a fixed random state and a
# raised iteration cap, then fit it on the training split.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [8]:
from sklearn.metrics import accuracy_score

# Score the fitted model on the held-out validation rows.
predictions = clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=predictions)

0.8100558659217877

In [9]:
# Predict survival for the cleaned, encoded Kaggle test set.
submission_preds = clf.predict(X=test)

In [10]:
# Assemble the two-column frame for the Kaggle submission:
# the passenger ids saved before cleaning, paired with the predicted labels.
submission_columns = {
    "PassengerId": test_ids.values,
    "Survived": submission_preds,
}
df = pd.DataFrame(submission_columns)

In [11]:
# Write the submission file without the index column.
df.to_csv(path_or_buf="final submission.csv", index=False)