In [1]:
import pandas as pd

# Load the raw train/test CSV files into DataFrames.
# NOTE(review): these are absolute, machine-specific paths. They are kept
# verbatim so the notebook still runs where it was authored, and are gathered
# here so pointing the notebook at another data directory is a one-place change.
TRAIN_CSV = r'C:\Users\Shrey\OneDrive\Desktop\data mining\train.csv'
TEST_CSV = r'C:\Users\Shrey\OneDrive\Desktop\data mining\test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Verifying data types
print(type(train_data))  # Should output: <class 'pandas.core.frame.DataFrame'>
print(type(test_data))   # Should output: <class 'pandas.core.frame.DataFrame'>

# Predictor columns kept for modelling; every other column is discarded by the
# selection below, so no separate drop() call on test_data is needed.
selected_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Selecting columns.
# .copy() makes each result an independent DataFrame rather than a selection
# of the loaded frame, so the column assignments performed further down the
# notebook modify these frames directly and do not raise SettingWithCopyWarning.
train_data = train_data[selected_features + ['Survived']].copy()
test_data = test_data[selected_features].copy()

# Displaying the first few rows to ensure correct selection
print(train_data.head())


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked  Survived
0       3    male  22.0      1      0   7.2500        S         0
1       1  female  38.0      1      0  71.2833        C         1
2       3  female  26.0      0      0   7.9250        S         1
3       1  female  35.0      1      0  53.1000        S         1
4       3    male  35.0      0      0   8.0500        S         0


In [2]:
# Per-column count of missing entries in the training frame (before imputation)
print(train_data.isna().sum())

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64


In [3]:
# Impute missing values.
#
# Two deliberate choices here:
#   * Results are assigned back to the column instead of calling
#     `df[col].fillna(..., inplace=True)`. The in-place form operates on a
#     selection of the frame; it is deprecated in pandas 2.x and does not
#     update the frame at all when Copy-on-Write is enabled.
#   * The fill values are computed once on the training set and reused for the
#     test set, so both frames are imputed with the same statistics.

# Force Age to numeric; anything unparsable becomes NaN and is imputed below.
train_data['Age'] = pd.to_numeric(train_data['Age'], errors='coerce')
test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

# Fill values learned from the training data only.
age_fill = train_data['Age'].median()
embarked_fill = train_data['Embarked'].mode()[0]
fare_fill = train_data['Fare'].median()

train_data['Age'] = train_data['Age'].fillna(age_fill)
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_fill)

test_data['Age'] = test_data['Age'].fillna(age_fill)
test_data['Embarked'] = test_data['Embarked'].fillna(embarked_fill)
# NOTE(review): the training frame shows no missing Fare, but the test frame is
# never inspected in this notebook; this is a no-op if it has no gaps either.
test_data['Fare'] = test_data['Fare'].fillna(fare_fill)


In [4]:
# Re-check the per-column missing counts now that imputation has run
print(train_data.isna().sum())

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64


In [5]:
# One-hot encode the categorical predictors.
#
# Both frames are first cast to a categorical dtype whose levels come from the
# training data. get_dummies then emits one column per declared level, so
# train and test always end up with the same dummy columns and `drop_first`
# drops the same reference level in both, even if a level is absent from one
# frame. A test value never seen in training becomes NaN on the cast and is
# encoded as all zeros.
categorical_columns = ['Sex', 'Embarked']
for column in categorical_columns:
    levels = pd.CategoricalDtype(sorted(train_data[column].dropna().unique()))
    train_data[column] = train_data[column].astype(levels)
    test_data[column] = test_data[column].astype(levels)

# dtype=int pins the dummies to 0/1 integers; the default dtype differs between
# pandas versions (bool on pandas >= 2).
train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True, dtype=int)
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True, dtype=int)

print(train_data.head())


   Pclass   Age  SibSp  Parch     Fare  Survived  Sex_male  Embarked_Q  \
0       3  22.0      1      0   7.2500         0         1           0   
1       1  38.0      1      0  71.2833         1         0           0   
2       3  26.0      0      0   7.9250         1         0           0   
3       1  35.0      1      0  53.1000         1         0           0   
4       3  35.0      0      0   8.0500         0         1           0   

   Embarked_S  
0           1  
1           0  
2           1  
3           1  
4           1  


In [6]:
# Derived feature: FamilySize = SibSp + Parch + 1
# (the +1 presumably counts the passenger themselves; confirm against the data dictionary)
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)
test_data['FamilySize'] = test_data['SibSp'].add(test_data['Parch']).add(1)

# Show the two source columns next to the derived one as a quick sanity check
print(train_data.loc[:, ['SibSp', 'Parch', 'FamilySize']].head())


   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1


In [7]:
# Preview the first five rows of the training frame, including the new FamilySize column
print(train_data.head(n=5))

   Pclass   Age  SibSp  Parch     Fare  Survived  Sex_male  Embarked_Q  \
0       3  22.0      1      0   7.2500         0         1           0   
1       1  38.0      1      0  71.2833         1         0           0   
2       3  26.0      0      0   7.9250         1         0           0   
3       1  35.0      1      0  53.1000         1         0           0   
4       3  35.0      0      0   8.0500         0         1           0   

   Embarked_S  FamilySize  
0           1           2  
1           0           2  
2           1           1  
3           1           2  
4           1           1  


In [8]:
# Discretise Age into four ordinal bins labelled 0-3:
#   0: [0, 12]   1: (12, 18]   2: (18, 60]   3: (60, inf)
# Edges and labels are defined once so train and test cannot drift apart.
# The last edge is open-ended and include_lowest=True closes the first bin on
# the left, so an age above 80 or exactly 0 gets a bin instead of silently
# becoming NaN. Ages inside the original (0, 80] range are binned as before.
AGE_BIN_EDGES = [0, 12, 18, 60, float('inf')]
AGE_BIN_LABELS = [0, 1, 2, 3]

train_data['AgeBin'] = pd.cut(
    train_data['Age'], bins=AGE_BIN_EDGES, labels=AGE_BIN_LABELS, include_lowest=True
)
test_data['AgeBin'] = pd.cut(
    test_data['Age'], bins=AGE_BIN_EDGES, labels=AGE_BIN_LABELS, include_lowest=True
)

print(train_data[['Age', 'AgeBin']].head())


    Age AgeBin
0  22.0      2
1  38.0      2
2  26.0      2
3  35.0      2
4  35.0      2


In [9]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Build the target vector and feature matrix from the prepared training frame
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardise the features: the scaler is fitted on the training part only
# and then applied unchanged to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Logistic regression with the iteration cap raised to 500
model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)

# Predicted labels for the held-out rows
y_pred = model.predict(X_val_scaled)

# Validation metrics
conf_matrix = confusion_matrix(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy: {accuracy}")
print(conf_matrix)


Accuracy: 0.7988826815642458
[[89 16]
 [20 54]]


In [16]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the validation predictions
print(classification_report(y_true=y_val, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

