In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("full.csv")

pd.set_option('display.max_columns', None)

In [3]:
titanic = sns.load_dataset("titanic")

In [4]:
# Eliminar columnas
df = df.drop(columns=['Age', 'Name_wiki', 'WikiId', 'Class', 'Body', 'Lifeboat'])

In [5]:
# Pasar la columna sex de titanic a df y cambiarle el nombre a gender
df['Gender'] = titanic['sex']

In [6]:
#Eliminar la columna 'Sex'
df = df.drop(columns=['Sex'])

In [7]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
0,1,0.0,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,S,22.0,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",male
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,C85,C,35.0,"New York, New York, US",Cherbourg,"New York, New York, US",female
2,3,1.0,3,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,,S,26.0,"Jyväskylä, Finland",Southampton,New York City,female
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,C123,S,35.0,"Scituate, Massachusetts, US",Southampton,"Scituate, Massachusetts, US",female
4,5,0.0,3,"Allen, Mr. William Henry",0,0,373450,8.05,,S,35.0,"Birmingham, West Midlands, England",Southampton,New York City,male


In [8]:
# Reemplazar 'New York City' por 'New York, New York, US' en todo el DataFrame
df = df.replace('New York City', 'New York, New York, US')

In [9]:
# Comprobamos
df['Destination'].value_counts()

Destination
New York, New York, US         361
Chicago, Illinois, US           75
Montreal, Quebec, Canada        37
Detroit, Michigan, US           28
Brooklyn, New York, US          25
                              ... 
Gloversville, New York, US       1
Brighton, Massachusetts, US      1
Roachdale, Indiana, US           1
Sherbrooke, Quebec, Canada       1
US                               1
Name: count, Length: 290, dtype: int64

In [10]:
# Cambiar de float a int Age_wiki, Survived para homogeneizar
df['Age_wiki'] = pd.to_numeric(df['Age_wiki'], errors='coerce').fillna(0).astype(int)

In [11]:
df['Survived'] = pd.to_numeric(df['Survived'], errors='coerce').fillna(0).astype(int)

In [12]:
# Comprobamos
df['Age_wiki'].dtype

dtype('int64')

In [13]:
df['Survived'].dtype

dtype('int64')

In [14]:
# Redondear a Fare a 2 decimales
df['Fare'] = df['Fare'].round(2)

In [15]:
# Comprobamos
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
0,1,0,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,S,22,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.28,C85,C,35,"New York, New York, US",Cherbourg,"New York, New York, US",female
2,3,1,3,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.92,,S,26,"Jyväskylä, Finland",Southampton,"New York, New York, US",female
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,C123,S,35,"Scituate, Massachusetts, US",Southampton,"Scituate, Massachusetts, US",female
4,5,0,3,"Allen, Mr. William Henry",0,0,373450,8.05,,S,35,"Birmingham, West Midlands, England",Southampton,"New York, New York, US",male


In [16]:
# Mapear 0 -> 'No' y 1 -> 'Sí'
df['Survived'] = df['Survived'].map({0: 'No', 1: 'Sí'})

In [17]:
# Comrpobamos
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
0,1,No,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,S,22,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",male


In [18]:
# Mapear 'male' -> 'M' y 'female' -> 'F'
df['Gender'] = df['Gender'].map({'male': 'M', 'female': 'F'})

In [19]:
# Comrpobamos
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
0,1,No,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,S,22,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",M


In [20]:
# Mostrar solo las columnas con los valores nulos y el total de ellos
df.isna().sum()[df.isna().sum() > 0]

Fare              1
Cabin          1014
Embarked          2
Hometown          5
Boarded           5
Destination       5
Gender          418
dtype: int64

In [21]:
df['Gender'].value_counts()

Gender
M    577
F    314
Name: count, dtype: int64

In [22]:
# Función para predecir el género en función del título en 'Name'
def predict_gender_from_title(name):
    if 'Mr.' in name:
        return 'M'
    elif 'Mrs.' in name or 'Miss' in name:
        return 'F'
    else:
        return 'Unknown'  # En caso de que no se encuentre un título esperado

# Aplicar la función a las filas donde el valor de 'Gender' es nulo
df['Gender'] = df.apply(lambda row: row['Gender'] if pd.notnull(row['Gender']) else predict_gender_from_title(row['Name']), axis=1)


In [23]:
df['Gender'].value_counts()

Gender
M          817
F          464
Unknown     28
Name: count, dtype: int64

In [None]:
# Cambiar los valores nulos por 'Unknown'
df[''] = df['Cabin'].fillna('unknown')
df['Cabin'] = df['Cabin'].fillna('unknown')
df['Hometown'] = df['Hometown'].fillna('unknown')
df['Destination'] = df['Destination'].fillna('unknown')

In [25]:
# Comprobamos
df.isna().sum()[df.isna().sum() > 0]

Fare        1
Embarked    2
Boarded     5
dtype: int64

In [None]:
# Sacar la fila entera donde Fare tenga el valor nulo para comprobar la clase del pasajero
df[df['Fare'].isnull()]
# Pertenece a tercera clase

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
1043,1044,No,3,"Storey, Mr. Thomas",0,0,3701,,unknown,S,51,"Liverpool, Merseyside, England,",Southampton,"New York, New York, US",M


In [31]:
# Ver los valores de Fare de la 3º clase
fare_class_3 = df[df['Pclass'] == 3]['Fare']
fare_class_3

0        7.25
2        7.92
4        8.05
5        8.46
7       21.08
        ...  
1303     7.78
1304     8.05
1306     7.25
1307     8.05
1308    22.36
Name: Fare, Length: 709, dtype: float64

In [33]:
# Imoutación del nulo de Fare
# Al comprobar que es de 3º clase se decide hacer la media
# de toda al 3º clase al tratarse de 1 valor nulo

mean_fare_class_3 = df[df['Pclass'] == 3]['Fare'].mean()

# Rellenar los valores nulos en la columna 'Fare' con la media calculada
df['Fare'] = df['Fare'].fillna(mean_fare_class_3)

In [34]:
# Comprobamos
df.isna().sum()[df.isna().sum() > 0]

Embarked    2
Boarded     5
dtype: int64

In [None]:
# Comprobar las filas donde se encuentran 
# los valores nulos de 'Embarked' y 'Boarded'

df[df['Embarked'].isnull()]

# Se ve que tendrían que ser una 'S'

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
61,62,Sí,1,"Icard, Miss. Amelie",0,0,113572,80.0,B28,,38,"New York, New York, US",Southampton,"New York, New York, US",F
829,830,Sí,1,"Stone, Mrs. George Nelson (Martha Evelyn)",0,0,113572,80.0,B28,,62,"New York, New York, US",Southampton,"New York, New York, US",F


In [36]:
# Reemplazar los valores nulos en la columna 'Embarque' por 'S'
df['Embarked'] = df['Embarked'].fillna('S')

In [38]:
df[df['Boarded'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_wiki,Hometown,Boarded,Destination,Gender
347,348,Sí,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",1,0,386525,16.1,unknown,S,0,unknown,,unknown,F
557,558,No,1,"Robbins, Mr. Victor",0,0,PC 17757,227.52,unknown,C,0,unknown,,unknown,M
1041,1042,No,1,"Earnshaw, Mrs. Boulton (Olive Potter)",0,1,11767,83.16,C54,C,0,unknown,,unknown,F
1048,1049,No,3,"Lundin, Miss. Olga Elida",0,0,347469,7.85,unknown,S,0,unknown,,unknown,F
1228,1229,No,3,"Elias, Mr. Joseph",0,2,2675,7.23,unknown,C,0,unknown,,unknown,M


In [39]:
# Reemplazar los valores nulos en 'A bordo' basándonos en 'Embarque'
df['Boarded'] = df.apply(lambda row: 'Southampton' if row['Embarked'] == 'S' and pd.isnull(row['Boarded']) 
                         else ('Cherbourg' if row['Embarked'] == 'C' and pd.isnull(row['Boarded']) 
                               else row['Boarded']), axis=1)

In [41]:
# Comprobamos
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Age_wiki       0
Hometown       0
Boarded        0
Destination    0
Gender         0
dtype: int64

In [42]:
# Al tener Boarded completa eliminamos la de Embarked
df = df.drop(columns=['Embarked'])

In [44]:
# Cambiar los nombres a las columnas Pclass y Age_wiki

# Cambiar algunos nombres de columnas utilizando un diccionario
df = df.rename(columns={'Pclass': 'Class', 'Age_wiki': 'Age'})

In [45]:
df.columns

Index(['PassengerId', 'Survived', 'Class', 'Name', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Age', 'Hometown', 'Boarded', 'Destination', 'Gender'],
      dtype='object')

In [46]:
# Guardar el DataFrame limpio y pasarlo a csv
df.to_csv('titanic_limpio.csv', index=False)
