# In [None]:
#Import the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Read the Titanic passenger data from the CSV file into a DataFrame
df_titanic = pd.read_csv(filepath_or_buffer='titanic-Indivi.csv')
# Preview the first five rows
df_titanic.head(n=5)

# Data preparation
## Count the missing values in every column
df_titanic.isna().sum()

#Dealing With Missing Data
##Fill missing values in the Embarked column with the mode of the column.
##The filled column is assigned back to the frame: calling
##fillna(inplace=True) on a selected column is a chained in-place update,
##which newer pandas versions warn about and which does not modify the
##frame when Copy-on-Write is enabled.
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(df_titanic['Embarked'].mode()[0])

##Extract the title prefix (letters followed by a period, e.g. "Mr.") from the name.
##A raw string keeps the backslash in the pattern without an invalid-escape
##warning; expand=False returns a Series, so a single column is assigned.
df_titanic['Title'] = df_titanic['Name'].str.extract(r'([A-Za-z]+\.)', expand=False)

##Convert 'Title' to upper case so differently-cased spellings of the same
##title fall into one group when the mean age per title is computed
df_titanic['Title'] = df_titanic['Title'].str.upper()

##Count how often each title occurs in the column
title_counts = df_titanic['Title'].value_counts()
print(title_counts)

##Fill missing values in the "Age" column with the mean age of passengers
##sharing the same title. transform('mean') returns one value per row, so it
##lines up with the rows being filled. A row whose title group has no known
##age at all stays missing.
df_titanic['Age'] = df_titanic['Age'].fillna(df_titanic.groupby('Title')['Age'].transform('mean'))

#Feature Engineering
##Create a new column FamilySize. Two columns relate to family size:
##Parch is the number of parents/children and SibSp is the number of
##siblings/spouses travelling with the passenger.
##FamilySize = Parch + SibSp + 1 (the + 1 presumably counts the passenger themselves)
df_titanic['FamilySize'] = df_titanic['Parch'] + df_titanic['SibSp'] + 1

##Build a dataframe containing only the rows whose "Name" contains the string 'Asplund'.
##na=False treats a missing name as "no match"; without it the mask would
##contain NaN for such rows and boolean indexing would raise.
asplund_passengers = df_titanic[df_titanic['Name'].str.contains('Asplund', na=False)]

#Modeling
##Construct a regression model with statsmodels. Fare is the dependent variable;
##Pclass, Embarked and FamilySize are the independent variables.

#Convert the 'Fare' column to numeric: strip '$' and ',' characters first,
#then parse. The pattern is a raw string so the backslash reaches the regex
#engine without an invalid-escape warning. errors='coerce' turns any value
#that still cannot be parsed into NaN instead of raising.
df_titanic['Fare'] = pd.to_numeric(df_titanic['Fare'].replace(r'[\$,]', '', regex=True), errors='coerce')

#Fit the regression model using the formula API.
#C(...) marks Pclass and Embarked as categorical predictors; FamilySize enters as a numeric term.
result = smf.ols(formula="Fare ~ C(Pclass) + C(Embarked) + FamilySize", data=df_titanic).fit()

#Show results
print(result.summary())