In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('F:\\Titanic\\train.csv')

In [3]:
print('First few rows of the training dataset: ')
print(train_df.head())

First few rows of the training dataset: 
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0      

In [4]:
print("\nSummary statistics of the training dataset:")
print(train_df.describe())


Summary statistics of the training dataset:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [5]:
print("\nInformation about the training dataset:")
print(train_df.info())


Information about the training dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


# Data Preprocessing

#### Handle Missing Values

In [8]:
# Step 1: Impute missing ages with the mean age
mean_age = train_df['Age'].mean()
train_df['Age'].fillna(mean_age, inplace=True)

In [9]:
# Step 2: Drop the 'Cabin' column
train_df.drop('Cabin', axis=1, inplace=True)

In [10]:
# Step 3: Impute missing 'Embarked' values with the most common port
most_common_embarked = train_df['Embarked'].mode()[0]
train_df['Embarked'].fillna(most_common_embarked, inplace=True)

In [11]:
# Check if there are any remaining missing values
print("Number of missing values after preprocessing:")
print(train_df.isnull().sum())

Number of missing values after preprocessing:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [12]:
# Step 1: Convert categorical variables using one-hot encoding
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)

In [13]:
# Step 2: Feature Engineering (Example: FamilySize)
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

In [14]:
# Check the updated dataframe
print("Updated dataframe after converting categorical variables and feature engineering:")
print(train_df.head())

Updated dataframe after converting categorical variables and feature engineering:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   
2                             Heikkinen, Miss. Laina  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   
4                           Allen, Mr. William Henry  35.0      0      0   

             Ticket     Fare  Sex_male  Embarked_Q  Embarked_S  FamilySize  
0         A/5 21171   7.2500         1           0           1           2  
1          PC 17599  71.2833         0           0           0           2  
2  STON/O2. 3

# Model Selection and training

### Logistic regression model using scikit-learn

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Splitting the dataset into features and target variable
X = train_df.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1)
y = train_df['Survived']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Logistic Regression model:", accuracy)


Accuracy of Logistic Regression model: 0.7988826815642458


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision tree model

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree model
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = decision_tree_model.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy of Decision Tree model:", accuracy_dt)

Accuracy of Decision Tree model: 0.7821229050279329


### Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = random_forest_model.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest model:", accuracy_rf)

Accuracy of Random Forest model: 0.8212290502793296


### Support Vector Machine

In [19]:
from sklearn.svm import SVC

# Initialize and train the Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)

# Calculate accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy of Support Vector Machine (SVM) model:", accuracy_svm)

Accuracy of Support Vector Machine (SVM) model: 0.7821229050279329


### Gradient Boosting

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train the Gradient Boosting model
gradient_boosting_model = GradientBoostingClassifier(random_state=42)
gradient_boosting_model.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gradient_boosting_model.predict(X_test)

# Calculate accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Accuracy of Gradient Boosting model:", accuracy_gb)

Accuracy of Gradient Boosting model: 0.7988826815642458


# Improving the model

### Feature engineering 

In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Read the original training and test data
train_df = pd.read_csv("F:\\Titanic\\train.csv")

# Feature engineering
# Title from Name
train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
train_df['Title'] = train_df['Title'].replace(['Lady', 'Countess', 'Dona'], 'Royalty')
train_df['Title'] = train_df['Title'].replace(['Mme'], 'Mrs')
train_df['Title'] = train_df['Title'].replace(['Mlle', 'Ms'], 'Miss')
train_df['Title'] = train_df['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Special')

# Family Size
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

# Is Alone
train_df['IsAlone'] = 0
train_df.loc[train_df['FamilySize'] == 1, 'IsAlone'] = 1

# Age Group
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=[0, 12, 18, 60, 200], labels=['Child', 'Teenager', 'Adult', 'Elderly'])

# Fare per Person
train_df['FarePerPerson'] = train_df['Fare'] / train_df['FamilySize']

# Cabin Deck
train_df['Deck'] = train_df['Cabin'].str[:1]
train_df['Deck'].fillna('Unknown', inplace=True)

# Convert categorical variables into dummy/indicator variables
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked', 'Title', 'AgeGroup', 'Deck'])

# Drop unnecessary columns
train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_df)
train_df_imputed = pd.DataFrame(imputer.transform(train_df), columns=train_df.columns)

# Split the data into features and target variable
X = train_df_imputed.drop('Survived', axis=1)
y = train_df_imputed['Survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model with the new features
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = random_forest_model.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest model with new features:", accuracy_rf)

Accuracy of Random Forest model with new features: 0.8324022346368715


# Model evaluation

In [23]:
from sklearn.metrics import classification_report, confusion_matrix

In [24]:
# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))



Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.87      0.86       105
         1.0       0.81      0.78      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179



In [25]:
# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


Confusion Matrix:
[[91 14]
 [16 58]]
