In [4]:
import pandas as pd
import numpy as np

# Load a sample dataset

In [5]:
df = pd.read_csv('data/titanic.csv')

# 1. Exploring the Data
## Show the first few rows of the dataframe

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.shape

(891, 12)

In [7]:
len(df)

891

### Get all row indexes


In [8]:
indexes = df.index
print(indexes)

RangeIndex(start=0, stop=891, step=1)


### Get all column indexes (names)

In [9]:
col_indexes = df.columns
print(col_indexes)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


### Get the name of the first column

In [10]:
first_column_name = df.columns[0]
print(first_column_name)

PassengerId


### transfer all columns to a list

In [11]:
df.columns.to_list()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

## Show a summary of the dataframe (rows, columns, missing values)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
# Check data types
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [15]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.to_list()
numerical_cols

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [29]:
categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.to_list()
categorical_cols

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [17]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [18]:
df['Embarked'].nunique()

3

In [19]:
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [20]:
df['Sex'].nunique()

2

### Continuous and Discrete columns 

In [21]:
continuous_cols = [col for col in numerical_cols if df[col].nunique() > 20 ]
discrete_cols = [col for col in numerical_cols if df[col].nunique() <= 20 ]

In [22]:
continuous_cols

['PassengerId', 'Age', 'Fare']

In [23]:
discrete_cols

['Survived', 'Pclass', 'SibSp', 'Parch']

# Identifying Numerical and Categorical Data
## List all numerical columns

In [24]:

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical columns:", numerical_cols)

Numerical columns: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


## List all categorical columns

In [30]:
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("Categorical columns:", categorical_cols)

Categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


# Homework 1: Introduction to Tabular Data

 **Create a Feature Type Dictionary:**
   - Load the Titanic dataset into a DataFrame.
   - Create a dictionary that classifies features:
     - **Numerical (continuous or discrete):** List of numerical features.
     - **Categorical (nominal or ordinal):** List of categorical features.

   ```python
   feature_types = {
       'numerical': {
           'continuous': [],  # Fill with continuous numerical features
           'discrete': []  # Fill with discrete numerical features
       },
       'categorical': {
           'nominal': [],  # Fill with nominal categorical features
           'ordinal': []  # Fill with ordinal categorical features
       }
   }


In [34]:
df = pd.read_csv('data/titanic.csv')
numerical_cols = df.select_dtypes(include = ['int64','float64']).columns.tolist()
categorical_cols = df.select_dtypes(include = ['object', 'category', 'bool']).columns.to_list()
feature_types = {
    'numerical': {
        'continuous': [],
        'discrete': [] 
    },
    'categorical': {
        'nominal': [], 
        'ordinal': [] 
    }
}
continuous_cols = [col for col in numerical_cols if df[col].nunique() > 20 ]
discrete_cols = [col for col in numerical_cols if df[col].nunique() <= 20 ]
feature_types['numerical']['continuous'] = continuous_cols
feature_types['numerical']['discrete'] = discrete_cols
nominal_cols = [col for col in categorical_cols if col in ['Name', 'Sex', 'Embarked']]
ordinal_cols = [col for col in categorical_cols if col in ['Pclass']]
feature_types['categorical']['nominal'] = nominal_cols
feature_types['categorical']['ordinal'] = ordinal_cols
print(feature_types)


{'numerical': {'continuous': ['PassengerId', 'Age', 'Fare'], 'discrete': ['Survived', 'Pclass', 'SibSp', 'Parch']}, 'categorical': {'nominal': ['Name', 'Sex', 'Embarked'], 'ordinal': []}}


# Homework 2: Create a DataFrame with Only Numerical Features:
### Use DataFrame selection to create a new DataFrame that contains only numerical features from the Titanic dataset.
```python
numerical_df = df[['list', 'of', 'numerical', 'features']]



In [41]:
df = pd.read_csv('data/titanic.csv')
numerical_df = df.select_dtypes(include=[np.number])
numerical_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.925
3,4,1,1,35.0,1,0,53.1
4,5,0,3,35.0,0,0,8.05


# Homework 3: Display Unique Values for Categorical Columns:
### Select the categorical features and display the unique values for each categorical column.




In [45]:
df = pd.read_csv('data/titanic.csv')
categorical_cols = df.select_dtypes(include = ['object', 'category', 'bool']).columns.to_list()
for col in categorical_cols:
    unique_values = df[col].unique()
    print(f"Unique values in '{col}': {unique_values}\n")

Unique values in 'Name': ['Braund, Mr. Owen Harris'
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
 'Heikkinen, Miss. Laina' 'Futrelle, Mrs. Jacques Heath (Lily May Peel)'
 'Allen, Mr. William Henry' 'Moran, Mr. James' 'McCarthy, Mr. Timothy J'
 'Palsson, Master. Gosta Leonard'
 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)'
 'Nasser, Mrs. Nicholas (Adele Achem)' 'Sandstrom, Miss. Marguerite Rut'
 'Bonnell, Miss. Elizabeth' 'Saundercock, Mr. William Henry'
 'Andersson, Mr. Anders Johan' 'Vestrom, Miss. Hulda Amanda Adolfina'
 'Hewlett, Mrs. (Mary D Kingcome) ' 'Rice, Master. Eugene'
 'Williams, Mr. Charles Eugene'
 'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)'
 'Masselmani, Mrs. Fatima' 'Fynney, Mr. Joseph J' 'Beesley, Mr. Lawrence'
 'McGowan, Miss. Anna "Annie"' 'Sloper, Mr. William Thompson'
 'Palsson, Miss. Torborg Danira'
 'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)'
 'Emir, Mr. Farred Chehab' 'Fortune, Mr. Charles Alexander'
 'O\'Dwyer, Miss

# Homework 4: Create a Simple Summary Table:
### Create a summary DataFrame with columns such as Feature Name, Data Type, Number of Unique Values, and Has Missing Values?.


In [47]:
df = pd.read_csv('data/titanic.csv')
summary_df = pd.DataFrame({'Feature Name':df.columns,
                           'Data Type':df.dtypes.values,
                           'Number of Unique Values':[df[col].nunique() for col in df.columns],
                           'Has Missing Values>':[df[col].isnull().any() for col in df.columns]
                          })
print(summary_df)

   Feature Name Data Type  Number of Unique Values  Has Missing Values>
0   PassengerId     int64                      891                False
1      Survived     int64                        2                False
2        Pclass     int64                        3                False
3          Name    object                      891                False
4           Sex    object                        2                False
5           Age   float64                       88                 True
6         SibSp     int64                        7                False
7         Parch     int64                        7                False
8        Ticket    object                      681                False
9          Fare   float64                      248                False
10        Cabin    object                      147                 True
11     Embarked    object                        3                 True


In [9]:
def load_titanic_data(filepath: str) -> pd.DataFrame:
    """
    Loads the Titanic dataset from the specified file path.
    
    Args:
        filepath (str): Path to the Titanic CSV file.
    
    Returns:
        pd.DataFrame: Loaded Titanic dataset as a DataFrame.
    """
    df = pd.read_csv(filepath)
    return df
print(load_titanic_data('data/titanic.csv'))

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [14]:
def create_feature_type_dict(df):
    """
    Classifies features into numerical (continuous or discrete) and categorical (nominal or ordinal).
    
    Args:
        df (pd.DataFrame): The Titanic dataset as a DataFrame.
    
    Returns:
        dict: A dictionary classifying features into numerical and categorical types.
    """
    feature_types = {
        'numerical': {
            'continuous': [],
            'discrete': [] 
            },
        'categorical': {
            'nominal': [],  
            'ordinal': []  
            }
        }
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.to_list()
    continuous_cols = [col for col in numerical_cols if df[col].nunique()>20]
    discrete_cols = [col for col in numerical_cols if df[col].nunique()<=20]
    feature_types['numerical']['continuous'] = continuous_cols
    feature_types['numerical']['discrete'] = discrete_cols
    
    categorical_cols = df.select_dtypes(include = ['object', 'category', 'bool']).columns.to_list()
    nominal_cols = [col for col in categorical_cols if col in ['Name', 'Sex', 'Embarked']]
    ordinal_cols = [col for col in categorical_cols if col in ['Pclass']]
    feature_types['categorical']['nominal'] = nominal_cols
    feature_types['categorical']['ordinal'] = ordinal_cols
        
    return feature_types

print(create_feature_type_dict(df))


{'numerical': {'continuous': ['PassengerId', 'Age', 'Fare'], 'discrete': ['Survived', 'Pclass', 'SibSp', 'Parch']}, 'categorical': {'nominal': ['Name', 'Sex', 'Embarked'], 'ordinal': []}}


In [17]:
def get_numerical_df(df, numerical_features):
    """
    Creates a DataFrame containing only numerical features.
    
    Args:
        df (pd.DataFrame): The Titanic dataset as a DataFrame.
        numerical_features (list): List of numerical feature names.
    
    Returns:
        pd.DataFrame: DataFrame containing only numerical features.
    """
    numerical_df = df[numerical_features].select_dtypes(include=[np.number])
    return numerical_df

numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
print(get_numerical_df(df, numerical_features))

     PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare
0              1         0       3  22.0      1      0   7.2500
1              2         1       1  38.0      1      0  71.2833
2              3         1       3  26.0      0      0   7.9250
3              4         1       1  35.0      1      0  53.1000
4              5         0       3  35.0      0      0   8.0500
..           ...       ...     ...   ...    ...    ...      ...
886          887         0       2  27.0      0      0  13.0000
887          888         1       1  19.0      0      0  30.0000
888          889         0       3   NaN      1      2  23.4500
889          890         1       1  26.0      0      0  30.0000
890          891         0       3  32.0      0      0   7.7500

[891 rows x 7 columns]


In [18]:
def create_summary_table(df):
    """
    Creates a summary DataFrame with feature name, data type, number of unique values, and if it has missing values.
    
    Args:
        df (pd.DataFrame): The Titanic dataset as a DataFrame.
    
    Returns:
        pd.DataFrame: A summary DataFrame.
    """
    summary_df = pd.DataFrame({
        'Feature Name': df.columns,
        'Data Type': df.dtypes.values,
        'Number of Unique Values': [df[col].nunique() for col in df.columns],
        'Has Missing Values': [df[col].isnull().any() for col in df.columns]
        })
    return summary_df
print(create_summary_table(df))



   Feature Name Data Type  Number of Unique Values  Has Missing Values
0   PassengerId     int64                      891               False
1      Survived     int64                        2               False
2        Pclass     int64                        3               False
3          Name    object                      891               False
4           Sex    object                        2               False
5           Age   float64                       88                True
6         SibSp     int64                        7               False
7         Parch     int64                        7               False
8        Ticket    object                      681               False
9          Fare   float64                      248               False
10        Cabin    object                      147                True
11     Embarked    object                        3                True


In [4]:
import pandas as pd
df = pd.read_csv('data/titanic.csv')
categorical_features = df.select_dtypes(include = ['object', 'category','bool']).columns.to_list()
def display_unique_values(df, categorical_features):
    """
    Displays unique values for each categorical feature in the DataFrame.
    
    Args:
        df (pd.DataFrame): The Titanic dataset as a DataFrame.
        categorical_features (list): List of categorical feature names.
    
    Returns:
        dict: A dictionary where keys are feature names and values are the unique values.
    """
    unique_values_dict = {}
    for col in categorical_features:
        unique_values = df[col].unique()
        unique_values_dict[col] = unique_values
    return unique_values_dict
print(display_unique_values(df, categorical_features))

{'Name': array(['Braund, Mr. Owen Harris',
       'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
       'Heikkinen, Miss. Laina',
       'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
       'Allen, Mr. William Henry', 'Moran, Mr. James',
       'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard',
       'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
       'Nasser, Mrs. Nicholas (Adele Achem)',
       'Sandstrom, Miss. Marguerite Rut', 'Bonnell, Miss. Elizabeth',
       'Saundercock, Mr. William Henry', 'Andersson, Mr. Anders Johan',
       'Vestrom, Miss. Hulda Amanda Adolfina',
       'Hewlett, Mrs. (Mary D Kingcome) ', 'Rice, Master. Eugene',
       'Williams, Mr. Charles Eugene',
       'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)',
       'Masselmani, Mrs. Fatima', 'Fynney, Mr. Joseph J',
       'Beesley, Mr. Lawrence', 'McGowan, Miss. Anna "Annie"',
       'Sloper, Mr. William Thompson', 'Palsson, Miss. Torborg Danira',
       'Asplund, Mrs. Ca