In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Confirmation message: this line is only reached if every import above succeeded.
print("Libraries Imported Successfully!")

Libraries Imported Successfully!


In [3]:
# Location of the Titanic passenger CSV that the rest of the notebook works on.
dataset_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(dataset_url)

# Plain-text preview: a heading line followed by the first five rows.
print("Data Loaded. First 5 rows:", df.head(n=5), sep="\n")

Data Loaded. First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

In [4]:
# Remove the columns that are not used as model features.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

# Fill the missing values in the Age column with the median age.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed value. Consider
# computing it on the training rows only.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

# Embarked has 2 missing values; fill them with 'S' (the most common port).
df['Embarked'] = df['Embarked'].fillna('S')

# Verify the result: every column should now report zero missing values.
print("\nData Cleaned. Missing values check:")
print(df.isnull().sum())


Data Cleaned. Missing values check:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [5]:
# Integer codes for the two categorical feature columns.
# Sex:      male -> 0, female -> 1
# Embarked: S -> 0, C -> 1, Q -> 2
# NOTE(review): Embarked is a nominal category; integer codes impose an
# artificial ordering on a linear model. One-hot encoding may be a better fit.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

for _column, _codes in (('Sex', SEX_CODES), ('Embarked', EMBARKED_CODES)):
    # Already numeric means this cell has run before. Mapping again would
    # silently turn every value into NaN, so leave the column untouched.
    if pd.api.types.is_numeric_dtype(df[_column]):
        continue

    _encoded = df[_column].map(_codes)

    # .map() yields NaN for any value missing from the mapping. Fail loudly on
    # unexpected categories instead of passing NaN to the model. Values that
    # were already missing are left as they are.
    _unmapped = df.loc[_encoded.isna() & df[_column].notna(), _column].unique()
    if len(_unmapped) > 0:
        raise ValueError(f"Unexpected values in column {_column!r}: {list(_unmapped)}")

    df[_column] = _encoded

print("\nData Encoded. First 5 rows:")
print(df.head())


Data Encoded. First 5 rows:
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    0  22.0      1      0   7.2500         0
1         1       1    1  38.0      1      0  71.2833         1
2         1       3    1  26.0      0      0   7.9250         0
3         1       1    1  35.0      1      0  53.1000         0
4         0       3    0  35.0      0      0   8.0500         0


In [6]:
# X = features: every column except the target, used as the model inputs.
X = df.drop(columns=['Survived'])

# y = target: the column the model is asked to predict.
y = df.loc[:, 'Survived']

# Report the dimensions of both pieces.
print(f"\nFeatures (X) shape: {X.shape}", f"Target (y) shape: {y.shape}", sep="\n")


Features (X) shape: (891, 7)
Target (y) shape: (891,)


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show how many rows and columns ended up in each feature partition.
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (712, 7)
Testing data shape: (179, 7)


In [8]:
# Create the logistic regression model.
# max_iter is raised above scikit-learn's default of 100: the features are
# used unscaled, so the default solver can hit the iteration cap before it
# converges (ConvergenceWarning), leaving the coefficients short of the optimum.
model = LogisticRegression(max_iter=1000)

# Train (fit) the model on the training data.
model.fit(X_train, y_train)

print("\nModel Trained Successfully!")


Model Trained Successfully!


In [10]:
# Predict a label for every row of the held-out test features.
predictions = model.predict(X_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Report the score as a percentage with two decimal places.
print(f"Final Model Accuracy: {accuracy * 100:.2f}%")

Final Model Accuracy: 79.89%
