In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [18]:
dataset = pd.read_csv("titanic_data.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [20]:
missing_values = dataset.isnull().sum()
print(missing_values)

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [21]:
dataset.dropna(subset=['Survived'], inplace=True)
dataset.dropna(subset=['Embarked'], inplace=True)

In [22]:
dataset['Age'].fillna(dataset['Age'].median(), inplace=True)

In [23]:
dataset.drop(['Cabin'] , axis=1, inplace=True)
dataset.drop(['Name'] , axis=1, inplace=True)
dataset.drop(['Ticket'] , axis=1, inplace=True)
dataset.drop(['PassengerId'] , axis=1, inplace=True)

In [24]:
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,3,male,22.0,1,0,7.25,S
1,1.0,1,female,38.0,1,0,71.2833,C
2,1.0,3,female,26.0,0,0,7.925,S
3,1.0,1,female,35.0,1,0,53.1,S
4,0.0,3,male,35.0,0,0,8.05,S


In [25]:
l = LabelEncoder()
dataset['Sex'] = l.fit_transform(dataset['Sex'])
dataset['Embarked'] = l.fit_transform(dataset['Embarked'])
dataset

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,3,1,22.0,1,0,7.2500,2
1,1.0,1,0,38.0,1,0,71.2833,0
2,1.0,3,0,26.0,0,0,7.9250,2
3,1.0,1,0,35.0,1,0,53.1000,2
4,0.0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0.0,2,1,27.0,0,0,13.0000,2
887,1.0,1,0,19.0,0,0,30.0000,2
888,0.0,3,0,28.0,1,2,23.4500,2
889,1.0,1,1,26.0,0,0,30.0000,0


In [26]:
scaler = MinMaxScaler(feature_range=(0, 5))
fare_column = dataset['Fare'].values.reshape(-1, 1)
scaled_fare_column = scaler.fit_transform(fare_column)
dataset['Fare'] = scaled_fare_column.flatten()

In [27]:
dataset

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,3,1,22.0,1,0,0.070755,2
1,1.0,1,0,38.0,1,0,0.695679,0
2,1.0,3,0,26.0,0,0,0.077343,2
3,1.0,1,0,35.0,1,0,0.518221,2
4,0.0,3,1,35.0,0,0,0.078563,2
...,...,...,...,...,...,...,...,...
886,0.0,2,1,27.0,0,0,0.126872,2
887,1.0,1,0,19.0,0,0,0.292781,2
888,0.0,3,0,28.0,1,2,0.228857,2
889,1.0,1,1,26.0,0,0,0.292781,0


In [28]:
X = dataset.drop('Survived', axis=1)
y = dataset['Survived']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
from sklearn.linear_model import Ridge

linridge = Ridge(alpha=0.5)
linridge.fit(X_train, y_train)

print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))


ridge regression linear model intercept: 1.4145827953815309
ridge regression linear model coeff:
[-0.17970697 -0.5063396  -0.00645546 -0.04533632 -0.01187719  0.01854518
 -0.04268842]
R-squared score (training): 0.397
R-squared score (test): 0.385
Number of non-zero features: 7


In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

clf1=DecisionTreeClassifier()
clf1.fit(X_train, y_train)

# Predict on the test set
y_pred = clf1.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7415730337078652
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.73      0.78       109
         1.0       0.64      0.75      0.69        69

    accuracy                           0.74       178
   macro avg       0.73      0.74      0.74       178
weighted avg       0.75      0.74      0.74       178

Confusion Matrix:
 [[80 29]
 [17 52]]


In [41]:
from sklearn.linear_model import Lasso

linlasso = Lasso(alpha=0.001)
linlasso.fit(X_train, y_train)

print('lasso regression linear model intercept: {}'
     .format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'
     .format(linlasso.coef_))
print('Non-zero features: {}'
     .format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'
     .format(linlasso.score(X_test, y_test)))


lasso regression linear model intercept: 1.4122406917201578
lasso regression linear model coeff:
[-0.18031314 -0.50338718 -0.00640968 -0.04471932 -0.00943899  0.01301486
 -0.04210677]
Non-zero features: 7
R-squared score (training): 0.397
R-squared score (test): 0.385



In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

clf=DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7359550561797753
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.73      0.77       109
         1.0       0.64      0.74      0.68        69

    accuracy                           0.74       178
   macro avg       0.73      0.74      0.73       178
weighted avg       0.75      0.74      0.74       178

Confusion Matrix:
 [[80 29]
 [18 51]]
