In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [7]:
import os
import pandas as pd

print("Working dir:", os.getcwd())
print("Contents:", os.listdir('.'))

# Try typical paths in order
for path in ['train.csv', 'data/train.csv', '../input/titanic/train.csv']:
    if os.path.exists(path):
        df = pd.read_csv(path)
        print(f"Loaded successfully from {path}")
        break
else:
    raise FileNotFoundError("train.csv not found in default locations. Please check path.")

df.info()
df.head()


Working dir: C:\Users\DELL
Contents: ['.anaconda', '.conda', '.condarc', '.continuum', '.cursor', '.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.ms-ad', '.recently-used.xbel', '.spyder-py3', '.vscode', 'anaconda3', 'anaconda_projects', 'AppData', 'Application Data', 'call_center_stats.csv', 'cheatSheets.zip', 'Contacts', 'Cookies', 'Correlation_HiP-FA_SMILESeq.xlsx', 'crypto_etl_project', 'Desktop', 'Documents', 'Downloads', 'earthquake_severity_model.pkl', 'edb_mtk.exe', 'edb_npgsql.exe', 'edb_pem_agent.exe', 'edb_pem_agent.exe-20250526235327', 'edb_pem_agent_8.exe', 'edb_pem_server.exe', 'edb_pem_server.exe-20250526235357', 'edb_pem_server_8.exe', 'edb_pgagent_pg17.exe', 'edb_pgbouncer.exe', 'edb_pgjdbc.exe', 'edb_psqlodbc.exe', 'edb_psqlodbc.exe-20250526235449', 'edb_sqlprofiler_pg17.exe', 'edb_xdb_62.exe', 'edb_xdb_7.exe', 'Favorites', 'IntelGraphicsProfiles', 'Jedi', 'Links', 'Local Settings', 'Microsoft', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.d

FileNotFoundError: train.csv not found in default locations. Please check path.

In [None]:
# 🔍 2. Quick Data Overview
print(df.info(), '\n')
print(df.describe(), '\n')
print(df.describe(include=['O']), '\n')
print("Survived value counts:\n", df['Survived'].value_counts(), '\n')


In [None]:
# 🧩 3. Missing Data Visualization
print("Missing values:\n", df.isnull().sum(), '\n')
plt.figure(figsize=(10,6))
sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()


In [None]:
# 📊 4. Univariate Analysis – Categorical
fig, axs = plt.subplots(1, 3, figsize=(18,5))

sns.countplot(x='Sex', hue='Survived', data=df, ax=axs[0])
axs[0].set_title('Survival by Sex')

sns.countplot(x='Pclass', hue='Survived', data=df, ax=axs[1])
axs[1].set_title('Survival by Pclass')

sns.countplot(x='Embarked', hue='Survived', data=df, ax=axs[2])
axs[2].set_title('Survival by Embarked')

plt.tight_layout()


In [None]:
# 📉 5. Univariate Analysis – Numerical
fig, axs = plt.subplots(1, 2, figsize=(14,5))

sns.histplot(df['Age'].dropna(), kde=True, ax=axs[0], color='steelblue')
axs[0].set_title('Age Distribution')

sns.histplot(df['Fare'], kde=True, ax=axs[1], color='darkgreen')
axs[1].set_title('Fare Distribution')

plt.tight_layout()


In [None]:
# 🎻 6. Violin Plot: Age vs Sex vs Survival
plt.figure(figsize=(10,6))
sns.violinplot(x='Sex', y='Age', hue='Survived', split=True, data=df, palette='muted')
plt.title('Age Distribution by Sex & Survival')


In [None]:
# 🧮 7. Feature Engineering: Family Size
df['Family_Size'] = df['SibSp'] + df['Parch']
df['Alone'] = np.where(df['Family_Size']==0, 1, 0)

fig, axs = plt.subplots(1, 2, figsize=(14,5))
sns.barplot(x='Family_Size', y='Survived', data=df, ax=axs[0], palette='Blues')
axs[0].set_title('Survival vs Family Size')
sns.barplot(x='Alone', y='Survived', data=df, ax=axs[1], palette='Blues')
axs[1].set_title('Survival: Alone vs With Family')
plt.tight_layout()


In [None]:
# 🎟 8. Binning Age & Fare
df['Age_Range'] = pd.cut(df['Age'], bins=[0,12,20,40,60,80], labels=['Child','Teen','Adult','Mature','Senior'])
df['Fare_Range'] = pd.qcut(df['Fare'], 4)

fig, axs = plt.subplots(1, 2, figsize=(14,5))
sns.barplot(x='Age_Range', y='Survived', data=df, ax=axs[0], palette='Oranges')
axs[0].set_title('Survival by Age Range')
sns.barplot(x='Fare_Range', y='Survived', data=df, ax=axs[1], palette='Greens')
axs[1].set_title('Survival by Fare Range')
plt.xticks(rotation=45)
plt.tight_layout()


In [None]:
# 🔗 9. Correlation & Pair Overview
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Feature Correlations')

sns.pairplot(df[['Survived','Age','Fare','Pclass']], hue='Survived', diag_kind='kde', palette='muted')
plt.suptitle('Pairplot: Numerical Features by Survival', y=1.02)
