AIM #1: Loading the dataset and printing basic information 
1. Import the Titanic dataset using pandas
2. Create a Dataframe from the dataset
3. Print the first 10 rows of the dataset
4. Print the last 20 rows of the dataset
5. Print dataset's information
6. Describe the dataset
7. Make sure all the information returned by the different functions is displayed in a single table and not on multiple lines

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import io

# AIM #1: load the Titanic CSV, collect the basic summaries and print them.

# Show wide frames as a single table: list every column and never wrap the
# frame repr onto several stacked chunks of columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_df = pd.read_csv(url)

first_10_rows = titanic_df.head(10)
last_20_rows = titanic_df.tail(20)

# info() writes its report to a stream instead of returning it, so capture
# the text in an in-memory buffer.
buffer = io.StringIO()
titanic_df.info(buf=buffer)
dataset_info = buffer.getvalue()

description = titanic_df.describe()

# Histogram of the known ages (missing values are dropped before plotting).
plt.figure(figsize=(8,6))
plt.hist(titanic_df['Age'].dropna(), bins=30, edgecolor='black')
plt.title('Age Distribution of Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Frequency')

plt.show()

result = {
    "First 10 Rows": first_10_rows,
    "Last 20 Rows": last_20_rows,
    "Dataset Info": dataset_info,
    "Description": description
}

# Print every section separately. Printing the dict itself would show the
# repr() of each value, i.e. the info text with escaped "\n" sequences and
# all tables run together in one blob.
for section_title, section_content in result.items():
    print(f"\n{section_title}:")
    print(section_content)

AIM #2: Finding issues (empty, NAs, incorrect value, incorrect format, outliers, etc.) 
1. Find out how many missing values there are in the dataset
2. For the 'Age' column, find the best way to handle the missing values
    2.1. Use an appropriate plot to study the nature of the 'Age' column
    2.2. Figure out what is the best way to calculate the central tendency of the 'Age' column based on the above plot
    2.3. Using the most suitable central tendency measure, fill the missing values in the age column
3. Decide what is the best way to handle the missing values in the 'Cabin' column
4. Similarly, decide what is the best way to handle the missing values in the 'Embarked' column
5. Handle the incorrect data under the 'Survived' column using an appropriate measure
6. Handle the incorrectly formatted data under the 'Fare' column

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AIM #2: count the missing values, clean the affected columns, and report
# the missing-value counts before and after cleaning.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_df = pd.read_csv(url)

# Per-column count of missing entries before any cleaning.
missing_values = titanic_df.isnull().sum()

# Inspect the shape of the 'Age' distribution to choose a central-tendency
# measure for imputation.
plt.figure(figsize=(8,6))
sns.histplot(titanic_df['Age'], kde=True, bins=30)
plt.title('Age Distribution with Missing Values')
plt.show()

median_age = titanic_df['Age'].median()

# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: the chained in-place form is deprecated in recent
# pandas and leaves the frame unchanged when Copy-on-Write is enabled.
titanic_df['Age'] = titanic_df['Age'].fillna(median_age)

# 'Cabin' is removed from the frame entirely.
titanic_df.drop('Cabin', axis=1, inplace=True)

# Fill missing 'Embarked' entries with the most frequent value (same
# assign-back pattern as for 'Age').
mode_embarked = titanic_df['Embarked'].mode()[0]
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(mode_embarked)

# Clamp 'Survived' into [0, 1]: values above 1 become 1, values below 0
# become 0, everything else is kept as is.
titanic_df['Survived'] = titanic_df['Survived'].apply(lambda x: 1 if x > 1 else (0 if x < 0 else x))

# Force 'Fare' to a numeric dtype; entries that cannot be parsed become NaN
# and therefore show up in the "after" missing-value counts below.
titanic_df['Fare'] = pd.to_numeric(titanic_df['Fare'], errors='coerce')

final_missing_values = titanic_df.isnull().sum()

print("Missing Values Before Handling:")
print(missing_values)
print("\nMissing Values After Handling:")
print(final_missing_values)

print(titanic_df.head())

AIM #3: Grouping 
1. Find out the average fare grouped by Pclass
    1.1. Plot the above using a suitable plot
2. Find out the average fare grouped by Sex
    2.1. Plot the above using a suitable plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# AIM #3: mean fare per passenger class and per sex, each shown as a bar chart.

# seaborn's example Titanic dataset (its columns are lower-case: 'pclass',
# 'sex', 'fare', ...).
titanic = sns.load_dataset('titanic')

# Mean fare for each passenger class, as a two-column frame.
avg_fare_pclass = titanic.groupby('pclass', as_index=False)['fare'].mean()

# Bar chart: one bar per class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=avg_fare_pclass, x='pclass', y='fare', palette='Blues_d', ax=ax)
ax.set_title('Average Fare by Pclass')
ax.set_xlabel('Pclass')
ax.set_ylabel('Average Fare')
plt.show()

# Mean fare for each sex, as a two-column frame.
avg_fare_sex = titanic.groupby('sex', as_index=False)['fare'].mean()

# Bar chart: one bar per sex.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=avg_fare_sex, x='sex', y='fare', palette='coolwarm', ax=ax)
ax.set_title('Average Fare by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Average Fare')
plt.show()


AIM #4: Dataset visualization using pandas

1. Plot the distribution of 'Age' using a suitable plot
2. Plot the distribution of 'Fare' using a suitable plot
3. Plot the distribution of 'Pclass' using a suitable plot
4. Plot the distribution of 'Survived' using a suitable plot
5. Plot the distribution of 'Embarked' using a suitable plot
6. Plot the distribution of 'Fare' grouped by 'Survived'
7. Plot the distribution of 'Fare' grouped by 'Pclass'
8. Plot the distribution of 'Age' grouped by 'Survived'
9. Plot the distribution of 'Age' grouped by 'Pclass'
10. Combine the 'SibSp' and 'Parch' and plot its distribution grouped by 'Survived'
11. Combine the 'SibSp' and 'Parch' and plot its distribution grouped by 'Pclass'
12. Plot a distribution between 'Age' and 'Fare' to see if there's any relationship
13. Are there any other possibilities to show relationships?

In [None]:
#1. Histogram with a KDE overlay of the known passenger ages.
known_ages = titanic['age'].dropna()
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(known_ages, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
plt.show()

In [None]:
#2. Histogram with a KDE overlay of the known fares.
known_fares = titanic['fare'].dropna()
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(known_fares, bins=30, kde=True, color='orange', ax=ax)
ax.set_title('Distribution of Fare')
ax.set_xlabel('Fare')
plt.show()

In [None]:
#3. Number of passengers in each class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic, x='pclass', palette='Blues_d', ax=ax)
ax.set_title('Distribution of Pclass')
ax.set_xlabel('Pclass')
plt.show()

In [None]:
#4. Number of passengers per value of 'survived'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic, x='survived', palette='coolwarm', ax=ax)
ax.set_title('Distribution of Survived')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
plt.show()

In [None]:
#5. Number of passengers per value of 'embarked'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic, x='embarked', palette='Set2', ax=ax)
ax.set_title('Distribution of Embarked')
ax.set_xlabel('Embarked')
plt.show()

In [None]:
#6. Box plot of fare, one box per value of 'survived'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=titanic, x='survived', y='fare', palette='coolwarm', ax=ax)
ax.set_title('Fare Distribution Grouped by Survived')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
ax.set_ylabel('Fare')
plt.show()

In [None]:
#7. Box plot of fare, one box per passenger class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=titanic, x='pclass', y='fare', palette='Blues_d', ax=ax)
ax.set_title('Fare Distribution Grouped by Pclass')
ax.set_xlabel('Pclass')
ax.set_ylabel('Fare')
plt.show()

In [None]:
#8. Box plot of age, one box per value of 'survived'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=titanic, x='survived', y='age', palette='coolwarm', ax=ax)
ax.set_title('Age Distribution Grouped by Survived')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
ax.set_ylabel('Age')
plt.show()

In [None]:
#9. Box plot of age, one box per passenger class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=titanic, x='pclass', y='age', palette='Blues_d', ax=ax)
ax.set_title('Age Distribution Grouped by Pclass')
ax.set_xlabel('Pclass')
ax.set_ylabel('Age')
plt.show()

In [None]:
#10. Family size (siblings/spouses plus parents/children) by survival.
# Store the element-wise sum of 'sibsp' and 'parch' as a new column.
titanic['family_size'] = titanic['sibsp'].add(titanic['parch'])

# One box per value of 'survived'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=titanic, x='survived', y='family_size', palette='coolwarm', ax=ax)
ax.set_title('Family Size Distribution Grouped by Survived')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
ax.set_ylabel('Family Size (SibSp + Parch)')
plt.show()

In [None]:
#11. Box plot of the 'family_size' column, one box per passenger class.
# Requires the 'family_size' column to already exist on `titanic`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=titanic, x='pclass', y='family_size', palette='Blues_d', ax=ax)
ax.set_title('Family Size Distribution Grouped by Pclass')
ax.set_xlabel('Pclass')
ax.set_ylabel('Family Size (SibSp + Parch)')
plt.show()

In [None]:
#12. Scatter of fare against age, points coloured by passenger class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=titanic, x='age', y='fare', hue='pclass', palette='coolwarm', ax=ax)
ax.set_title('Relationship Between Age and Fare')
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
plt.show()

In [None]:
#13. Pairwise relationships between a few columns, coloured by 'survived'.
pair_columns = ['age', 'fare', 'pclass', 'survived']
sns.pairplot(data=titanic[pair_columns], hue='survived', palette='coolwarm')
plt.show()

AIM #5: Correlation

1. Generate a correlation matrix for the entire dataset
2. Find correlation between 'Age' and 'Fare'
3. What other possible correlations can be found in the dataset?

In [None]:
import pandas as pd

# AIM #5: correlation matrix of the numeric columns and the Age/Fare
# correlation.

# Load the dataset
df = pd.read_csv('titanic.csv')

# Check data types of all columns
print(df.dtypes)

# Check for non-numeric values in 'Age' column
print(df['Age'].unique())

# Replace the textual 'Zero' with 0, then force a numeric dtype. replace()
# alone leaves the column as object dtype if it held strings, which would
# keep 'Age' out of the correlation matrix. Any other unparseable entry
# becomes NaN.
df['Age'] = pd.to_numeric(df['Age'].replace('Zero', 0), errors='coerce')

# Same numeric coercion for 'Fare', so it is guaranteed to take part in the
# correlation below.
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')

# Check again for non-numeric values
print(df['Age'].unique())

# Generate the correlation matrix from the numeric columns only: calling
# corr() on a frame that still contains text columns raises on recent
# pandas versions.
correlation_matrix = df.select_dtypes(include='number').corr()

# Display the correlation matrix
print(correlation_matrix)

# Find the correlation between 'Age' and 'Fare' (single label-based lookup
# instead of chained indexing)
age_fare_correlation = correlation_matrix.loc['Age', 'Fare']
print(f"Correlation between 'Age' and 'Fare': {age_fare_correlation}")