## Assignment on Preprocessing 

### Import the necessary libraries:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

### Load the dataset into the Python environment

In [None]:
# Read the Titanic CSV with the default integer row index and preview the first rows:
data0 = pd.read_csv(filepath_or_buffer=r'C:\Users\sajee\Downloads\AATHIRA DSA\titanic_dataset.csv')
data0.head(n=5)

### Make ‘PassengerId’ the index column

In [None]:
# Read the same CSV again, this time using 'PassengerId' as the row index,
# then preview the first rows:
data1 = pd.read_csv(
    filepath_or_buffer=r'C:\Users\sajee\Downloads\AATHIRA DSA\titanic_dataset.csv',
    index_col='PassengerId',
)
data1.head(n=5)

### Check the basic details of the dataset

In [None]:
# Check the dimensions of the dataset; `shape` is a (rows, columns) tuple.
# 'PassengerId' was made the index when data1 was loaded, so it is not counted as a column:

data1.shape

In [None]:
# Print a summary of the dataset: column names, data types and non-null counts.
# A column whose non-null count is below the number of rows contains missing values:

data1.info()

In [None]:
# Compute descriptive statistics (count, mean, std, min, quartiles, max).
# Called without arguments on this mixed-type frame, describe() summarises the numeric columns only:

data1.describe()

In [None]:
# Count the missing (NaN) values in each column; any non-zero count marks a
# column that needs to be filled in before further processing:

data1.isna().sum()

In [None]:
# Plot frequency histograms for every numeric column in the dataset.
# 'number' selects all numeric dtypes, so a numeric column is not silently
# skipped just because it is stored in a non-default int/float width:

freqgraph = data1.select_dtypes(include='number')
freqgraph.hist(figsize=(8, 8))
plt.show()

In [None]:
# Plot a frequency histogram of the "Age" column to see whether the values are
# roughly normally distributed or skewed.
# 'Age' still contains missing values at this point (they are filled in later),
# so drop them explicitly instead of relying on how the plotting library treats NaN:

plt.figure(figsize=(4,3))
plt.hist(data1['Age'].dropna())
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Histogram of Age')
plt.show()

### Fill in all the Missing Values present in all the columns in the dataset

In [None]:
# Impute the missing 'Age' values with the median of the column:

age_median = data1['Age'].median()
data1['Age'] = data1['Age'].fillna(age_median)

In [None]:
# Impute the categorical columns 'Cabin' and 'Embarked' with their own most
# frequent value; mode() returns a Series, and [0] takes its first entry:

for col in ('Cabin', 'Embarked'):
    most_common = data1[col].mode()[0]
    data1[col] = data1[col].fillna(most_common)

In [None]:
# Re-check the per-column null counts after imputation; 'Age', 'Cabin' and
# 'Embarked' were filled above, so they should now report 0:

data1.isna().sum()

### Check and Handle Outliers in at least 3 columns in the dataset

In [None]:
# Draw one box plot per numerical column to inspect each of them for outliers:

num_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

for i in num_cols:
    plt.figure(figsize=(5,2.92))
    plt.boxplot(data1[i])
    plt.title(i)

# Render the figures explicitly, as the other plotting cells do, so they also
# appear when the code runs outside an inline notebook backend:
plt.show()

In [None]:
# Ignore every DeprecationWarning raised after this point.
# NOTE(review): presumably added to hide the NumPy `interpolation=` deprecation
# triggered by the percentile calls below; confirm, since it hides all other
# deprecations as well.
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [None]:
# Calculate the quartiles Q1, Q2 (median) and Q3 of the 'Age' column in one call.
# `method=` replaces the `interpolation=` keyword, which has been deprecated
# since NumPy 1.22; 'midpoint' averages the two neighbouring data points when
# a percentile falls between them:

Q1, Q2, Q3 = np.percentile(data1['Age'], [25, 50, 75], method='midpoint')

print('Q1 =',Q1)
print('Q2 =',Q2)
print('Q3 =',Q3)

In [None]:
# Calculate the interquartile range (IQR) of 'Age': the distance between the
# third and first quartiles computed in the previous cell. It is used below
# to derive the outlier limits:

IQR = Q3 - Q1

print (' IQR =', IQR)

In [None]:
# Derive the outlier limits for 'Age': values lying more than 1.5 * IQR
# above Q3 or below Q1 are treated as outliers:

whisker = 1.5*IQR
up_lim, low_lim = Q3 + whisker, Q1 - whisker

print('up_lim = ', up_lim)
print('low_lim = ', low_lim)

In [None]:
# Handling Outliers

# Flooring and capping is a limit-based technique for handling outliers:
# Flooring sets a minimum value for the low values:
# values below the lower limit are replaced with the lower limit.
# Capping sets a maximum value for the high values:
# values above the upper limit are replaced with the upper limit.
# The limits may be chosen percentiles; here they are the IQR-based low_lim and up_lim computed above.

In [None]:
# Floor and cap the 'Age' column: values below low_lim become low_lim, values
# above up_lim become up_lim, and everything in between is left unchanged:

data1['Age'] = np.clip(data1['Age'].to_numpy(), low_lim, up_lim)

In [None]:
# Re-draw the 'Age' box plot to inspect the column after flooring and capping:
plt.figure(figsize=(4, 3))
plt.title('Boxplot of Age with Handled Outliers')
plt.boxplot(x=data1['Age'])
plt.show()

In [None]:
# Calculate the quartiles Q1, Q2 (median) and Q3 of the 'SibSp' column in one call.
# `method=` replaces the `interpolation=` keyword, which has been deprecated
# since NumPy 1.22; 'midpoint' averages the two neighbouring data points when
# a percentile falls between them:

Q1, Q2, Q3 = np.percentile(data1['SibSp'], [25, 50, 75], method='midpoint')

print('Q1 =',Q1)
print('Q2 =',Q2)
print('Q3 =',Q3)

In [None]:
# Calculate the interquartile range (IQR) of 'SibSp': the distance between the
# third and first quartiles computed in the previous cell. It is used below
# to derive the outlier limits:

IQR = Q3 - Q1

print (' IQR =', IQR)

In [None]:
# Derive the outlier limits for 'SibSp': values lying more than 1.5 * IQR
# above Q3 or below Q1 are treated as outliers:

whisker = 1.5*IQR
up_lim, low_lim = Q3 + whisker, Q1 - whisker

print('up_lim = ', up_lim)
print('low_lim = ', low_lim)

In [None]:
# Floor and cap the 'SibSp' column: values below low_lim become low_lim, values
# above up_lim become up_lim, and everything in between is left unchanged.
# NOTE(review): if the limits are fractional, the integer counts in this
# column become floats; confirm that is acceptable downstream.

data1['SibSp'] = np.clip(data1['SibSp'].to_numpy(), low_lim, up_lim)

In [None]:
# Re-draw the 'SibSp' box plot to inspect the column after flooring and capping:
plt.figure(figsize=(4, 3))
plt.title('Boxplot of SibSp with Handled Outliers')
plt.boxplot(x=data1['SibSp'])
plt.show()

In [None]:
# Calculate the quartiles Q1, Q2 (median) and Q3 of the 'Fare' column in one call.
# `method=` replaces the `interpolation=` keyword, which has been deprecated
# since NumPy 1.22; 'midpoint' averages the two neighbouring data points when
# a percentile falls between them:

Q1, Q2, Q3 = np.percentile(data1['Fare'], [25, 50, 75], method='midpoint')

print('Q1 =',Q1)
print('Q2 =',Q2)
print('Q3 =',Q3)

In [None]:
# Calculate the interquartile range (IQR) of 'Fare': the distance between the
# third and first quartiles computed in the previous cell. It is used below
# to derive the outlier limits:

IQR = Q3 - Q1

print (' IQR =', IQR)

In [None]:
# Derive the outlier limits for 'Fare': values lying more than 1.5 * IQR
# above Q3 or below Q1 are treated as outliers:

whisker = 1.5*IQR
up_lim, low_lim = Q3 + whisker, Q1 - whisker

print('up_lim = ', up_lim)
print('low_lim = ', low_lim)

In [None]:
# Floor and cap the 'Fare' column: values below low_lim become low_lim, values
# above up_lim become up_lim, and everything in between is left unchanged:

data1['Fare'] = np.clip(data1['Fare'].to_numpy(), low_lim, up_lim)

In [None]:
# Re-draw the 'Fare' box plot to inspect the column after flooring and capping.
# The title names 'Fare', the column actually plotted (it previously said 'SibSp'):
plt.figure(figsize=(4, 3))
plt.boxplot(data1['Fare'])
plt.title('Boxplot of Fare with Handled Outliers')
plt.show()

### Do Min Max Scaling on the Feature set (take ‘Survived’ as Target)

In [None]:
# Scaler used in the next step to rescale the numerical features:
from sklearn.preprocessing import MinMaxScaler

# Split the data: 'Survived' is the target (y), every other column is a feature (X):
y = data1['Survived']
X = data1.drop(columns='Survived')

# Leave out the text/categorical columns so that only numerical features are scaled:
X1 = X.drop(columns=['Name','Sex','Ticket','Cabin','Embarked'])

# Preview the first rows of the remaining numerical columns:
X1.head()

In [None]:
# Create the scaler, learn the per-column minimum and maximum from X1,
# then rescale X1 with them:
min_max = MinMaxScaler()
min_max.fit(X1)
X_scaled = min_max.transform(X1)

# Show the scaled feature values:
print('X_scaled:\n', X_scaled)

# The scaler hands back a NumPy array rather than a DataFrame,
# which the printed type confirms:
print('Type(X_scaled):\n',type(X_scaled))

In [None]:
# Wrap the scaled NumPy array back into a DataFrame.
# Reuse X1's own column labels and index instead of a hard-coded name list and
# a fresh 0..n-1 index: the labels can then never drift from the scaled
# columns, and the rows stay keyed by PassengerId so they still align with y:
X_scaled = pd.DataFrame(X_scaled, columns=X1.columns, index=X1.index)

# Generate descriptive statistics; after Min-Max scaling each non-constant
# column is expected to have min 0 and max 1:
X_scaled.describe()