# Cleaning the Data

In [1]:
import matplotlib as mlt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the Titanic training data; the path is relative to the working directory.
df = pd.read_csv("train.csv")

In [3]:
# Preview the first three rows to check how the file was parsed.
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Number of missing values in each column, before any cleaning.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Impute missing ages with the column median. The median is only defined when
# at least one age is known, so fall back to 0 (with a notice) otherwise.
has_known_age = df['Age'].notna().any()
if not has_known_age:
    print("All values in 'Age' are NaN. Using a default value (e.g., 0).")
    age_fill_value = 0
else:
    age_fill_value = df['Age'].median()
df['Age'] = df['Age'].fillna(age_fill_value)


In [7]:
# Show the dtype of the Age column after filling.
df['Age'].dtype

dtype('float64')

In [8]:
# Sanity-check Age: leading values, remaining NaN count, and dtype.
age = df['Age']
print(age.head())
print(age.isna().sum())
print(age.dtype)


0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
0
float64


In [9]:
# List the distinct values present in Age.
distinct_ages = df['Age'].unique()
print(distinct_ages)


[22.   38.   26.   35.   28.   54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.    8.   19.   40.   66.   42.   21.   18.
  3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.   16.
 25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.   14.5
 70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.   56.
 50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.   60.
 10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.    0.67
 30.5   0.42 34.5  74.  ]


In [10]:
# Force Age to a numeric dtype; any value that cannot be parsed becomes NaN.
age_as_number = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = age_as_number


In [11]:
# Fill any NaN left in Age after the numeric coercion: median when at least
# one age is known, otherwise the default 0.
age_column = df['Age']
if age_column.isna().all():
    print("All values in 'Age' are NaN. Using a default value (e.g., 0).")
    df['Age'] = age_column.fillna(0)
else:
    df['Age'] = age_column.fillna(age_column.median())


In [12]:
# Count the NaN values remaining in Age.
df['Age'].isna().sum()

np.int64(0)

In [13]:
# Missing values per column after the Age fill.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
# Fill the missing Embarked entries with the most frequent port.
most_common_port = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

In [15]:
# Count the NaN values remaining in Embarked.
df['Embarked'].isna().sum()

np.int64(0)

In [16]:
# Cabin is missing for 687 of the 891 rows, so drop the column rather than impute it.
# errors='ignore' keeps this cell re-runnable: a second run finds no Cabin
# column and would otherwise raise KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

In [17]:
# Missing values per column after the Embarked fill and Cabin drop.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [26]:
 unique_value = df['PassengerId'].unique()
unique_value.shape

(891,)

In [None]:

# Split Ticket into an optional textual prefix and a trailing ticket number.
# The number is the final all-digit token (e.g. "A/5 21171" -> "21171"); taking
# the first digit run instead would return "5" for that ticket. The prefix is
# whatever precedes that token ("A/5"), kept intact including digits and dots.
ticket_text = df['Ticket'].str.strip()

# TicketNumber stays a string column; it is NaN when the ticket has no
# trailing numeric token (e.g. a ticket that is only letters).
df['TicketNumber'] = ticket_text.str.extract(r'(?:^|\s)(\d+)$', expand=False)

# Remove the trailing numeric token to get the prefix. Purely numeric tickets
# leave an empty string, which is labelled 'None' like any missing prefix.
df['TicketPrefix'] = (
    ticket_text
    .str.replace(r'(?:^|\s+)\d+$', '', regex=True)
    .replace('', 'None')
    .fillna('None')
)

# Show a sample of the derived columns rather than the whole frame.
df[['Ticket', 'TicketPrefix', 'TicketNumber']].head(10)


# Performing Descriptive Statistics

In [None]:
# Summary statistics from pandas' describe() with its default settings.
df.describe()

In [None]:
# Survival rate per sex: the mean of the 0/1 Survived flag within each group.
df['Survived'].groupby(df['Sex']).mean()

In [None]:
# Survival rate per passenger class (mean of the 0/1 Survived flag).
survived_by_class = df.groupby('Pclass')['Survived']
class_survival = survived_by_class.agg('mean')
print(class_survival)

In [None]:
# Overlay the age histograms of survivors and non-survivors, then compare
# the two groups' mean ages.
survived_ages = df.loc[df['Survived'] == 1, 'Age']
not_survived_ages = df.loc[df['Survived'] == 0, 'Age']

# Compute one set of bin edges from all ages and reuse it for both groups.
# With a bare bins=20 each call derives edges from its own subset, so the
# overlaid bars would cover different age intervals and not be comparable.
age_bin_edges = np.histogram_bin_edges(df['Age'].dropna(), bins=20)

fig, ax = plt.subplots()
sns.histplot(survived_ages, bins=age_bin_edges, kde=True, color='green', label='Survived', ax=ax)
sns.histplot(not_survived_ages, bins=age_bin_edges, kde=True, color='red', label='Not Survived', ax=ax)
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Age Distribution: Survivors vs Non-Survivors')
ax.legend()
plt.show()

# Compare the average age of survivors and non-survivors
avg_age_survived = survived_ages.mean()
avg_age_not_survived = not_survived_ages.mean()

print(f"Average Age of Survivors: {avg_age_survived:.2f}")
print(f"Average Age of Non-Survivors: {avg_age_not_survived:.2f}")


In [None]:
# Passenger counts per embarkation port, followed by a count plot that
# splits each port's passengers by survival outcome.
embarked_counts = df['Embarked'].value_counts()
print(embarked_counts)

ax = sns.countplot(data=df, x='Embarked', hue='Survived')
ax.set_title('Survival by Embarkation Port')
plt.show()


In [None]:
# Survival rate for every (class, sex) combination, printed as a table and
# then drawn as grouped bars.
grouped_survival = df.groupby(['Pclass', 'Sex'])['Survived']
survival_by_class_and_sex = grouped_survival.mean()
print(survival_by_class_and_sex)

ax = sns.barplot(data=df, x='Pclass', y='Survived', hue='Sex')
ax.set_title('Survival by Class and Gender')
plt.show()

# Predictive Analysis

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline survival classifier: logistic regression on four features.
feature_cols = ['Pclass', 'Sex', 'Age', 'Fare']
target_col = 'Survived'

# Reload the raw CSV and keep only the modelling columns; rows with a missing
# value in any of them are dropped.
df = pd.read_csv("train.csv")
df = df[feature_cols + [target_col]].dropna()

# Encode Sex numerically (male -> 0, female -> 1) for the model.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X = df[feature_cols]
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split and predict the held-out rows.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7552447552447552
              precision    recall  f1-score   support

           0       0.81      0.78      0.80        87
           1       0.68      0.71      0.70        56

    accuracy                           0.76       143
   macro avg       0.74      0.75      0.75       143
weighted avg       0.76      0.76      0.76       143

