## Step 1: Load data + quick overview

- Read CSVs

- Shape, columns, dtypes

- Missing value counts

In [1]:
import pandas as pd

train = pd.read_csv(r"C:\Users\hp\Codveda Projects\Task 2 Data Cleaning and Preprocessing\data\train.csv")
test  = pd.read_csv(r"C:\Users\hp\Codveda Projects\Task 2 Data Cleaning and Preprocessing\data\test.csv")

print("Train shape:", train.shape)
print("Test shape :", test.shape)
print("\nTrain columns:", train.columns.tolist())

# Quick peek
display(train.head(3))

# Missing values
print("\nMissing values (train):\n", train.isna().sum().sort_values(ascending=False))
print("\nMissing values (test):\n", test.isna().sum().sort_values(ascending=False))


Train shape: (891, 12)
Test shape : (418, 11)

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S



Missing values (train):
 Cabin          687
Age            177
Embarked         2
PassengerId      0
Name             0
Pclass           0
Survived         0
Sex              0
Parch            0
SibSp            0
Fare             0
Ticket           0
dtype: int64

Missing values (test):
 Cabin          327
Age             86
Fare             1
Name             0
Pclass           0
PassengerId      0
Sex              0
Parch            0
SibSp            0
Ticket           0
Embarked         0
dtype: int64


## Step 2: Clean missing values

From the dataset, we know:

- Age → many missing

- Cabin → mostly missing

- Embarked → few missing

In [2]:
# Fill missing ages with median
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age']  = test['Age'].fillna(test['Age'].median())

# Fill Embarked with mode
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

# Drop Cabin if it exists
if 'Cabin' in train.columns:
    train = train.drop(columns=['Cabin'])

if 'Cabin' in test.columns:
    test = test.drop(columns=['Cabin'])


# For Fare in test (there’s 1 missing)
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

## Step 2: Summary statistics (mean, median, mode, std)

- We’ll compute mean, median, mode, std for numeric features (Age, Fare, SibSp, Parch):

- We will report these for the numeric columns (overall), and optionally by Survived.

In [3]:
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']

summary = train[numeric_cols].agg(['mean','median','std']).T
summary['mode'] = train[numeric_cols].mode().iloc[0]
print(summary)


numeric_cols = train.select_dtypes(include="number").columns.tolist()

summary = pd.DataFrame({
    "mean":   train[numeric_cols].mean(),
    "median": train[numeric_cols].median(),
    "std":    train[numeric_cols].std(),
})

# Mode can be multi-valued; we’ll take the first mode per column if it exists
modes = {}
for col in numeric_cols:
    m = train[col].mode(dropna=True)
    modes[col] = m.iloc[0] if len(m) else None
summary["mode"] = pd.Series(modes)

print("Overall Summary (numeric):")
display(summary)

# (Optional) Summary by Survived
grouped_summary = train.groupby("Survived")[numeric_cols].agg(["mean","median","std"])
print("Summary by Survived:")
display(grouped_summary)


            mean   median        std   mode
Age    29.361582  28.0000  13.019697  28.00
Fare   32.204208  14.4542  49.693429   8.05
SibSp   0.523008   0.0000   1.102743   0.00
Parch   0.381594   0.0000   0.806057   0.00
Overall Summary (numeric):


Unnamed: 0,mean,median,std,mode
PassengerId,446.0,446.0,257.353842,1.0
Survived,0.383838,0.0,0.486592,0.0
Pclass,2.308642,3.0,0.836071,3.0
Age,29.361582,28.0,13.019697,28.0
SibSp,0.523008,0.0,1.102743,0.0
Parch,0.381594,0.0,0.806057,0.0
Fare,32.204208,14.4542,49.693429,8.05


Summary by Survived:


Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId,Survived,Survived,Survived,Pclass,Pclass,Pclass,Age,Age,Age,SibSp,SibSp,SibSp,Parch,Parch,Parch,Fare,Fare,Fare
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std,mean,...,std,mean,median,std,mean,median,std,mean,median,std
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,447.016393,455.0,260.640469,0.0,0.0,0.0,2.531876,3.0,0.735805,30.028233,...,12.499986,0.553734,0.0,1.288399,0.32969,0.0,0.823166,22.117887,10.5,31.388207
1,444.368421,439.5,252.35884,1.0,1.0,0.0,1.950292,2.0,0.863321,28.291433,...,13.764425,0.473684,0.0,0.708688,0.464912,0.0,0.771712,48.395408,26.0,66.596998


## Step 4: Remove outliers (Age + Fare)

In [4]:
def remove_outliers(df, col):
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    return df[(df[col] >= lower) & (df[col] <= upper)]

# Apply to train
train = remove_outliers(train, 'Age')
train = remove_outliers(train, 'Fare')

# Apply to test
test = remove_outliers(test, 'Age')
test = remove_outliers(test, 'Fare')


### Step 5: Mapping Numerical Codes to Categorical Labels

In [5]:
import pandas as pd

# 1. Change Survived column (0 -> No, 1 -> Yes)
train["Survived"] = train["Survived"].map({0: "No", 1: "Yes"})

# 2. Change Pclass column (1 -> First Class, 2 -> Second Class, 3 -> Third Class)
train["Pclass"] = train["Pclass"].map({
    1: "First Class",
    2: "Second Class",
    3: "Third Class"
})

# Preview updated DataFrame
print(train[["Survived", "Pclass"]].head())


  Survived       Pclass
0       No  Third Class
2      Yes  Third Class
3      Yes  First Class
4       No  Third Class
5       No  Third Class


##  Encode Categorical Variables

In [6]:
# Label Encoding (for ordinal data like Sex):

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train["Sex"] = label_encoder.fit_transform(train["Sex"])


In [8]:
#'Sex' column contains 0 for female and 1 for male
train["Sex"] = train["Sex"].map({0: "Female", 1: "Male"})

# Preview the updated column
print(train["Sex"].head())


0      Male
2    Female
3    Female
4      Male
5      Male
Name: Sex, dtype: object


In [9]:
# Save cleaned train dataset
train.to_csv(r"C:\Users\hp\Codveda Projects\Task 2 Data Cleaning and Preprocessing\cleaned_train.csv", index=False)

# Save cleaned test dataset
test.to_csv(r"C:\Users\hp\Codveda Projects\Task 2 Data Cleaning and Preprocessing\cleaned_test.csv", index=False)
