In [1]:
# Exercise 1 - open the Titanic dataset and see descriptive info
import pandas as pd

# Step 1: Read the Titanic dataset into a pandas DataFrame
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv"
titanic = pd.read_csv(url)

# Step 2: Define the summary function
def summary(df):
    print("First 5 rows of the dataset:")
    display(df.head())
    print("\nTechnical summary of the data:")
    display(df.info())
    print("\nNumerical summary of the data:")
    display(df.describe())

# Step 3: Call the summary function with the Titanic dataset
summary(titanic)

First 5 rows of the dataset:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Technical summary of the data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None


Numerical summary of the data:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Exercise 2 - displaying other statistics
import pandas as pd

# Step 1: Read the Titanic dataset into a pandas DataFrame
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv"
titanic = pd.read_csv(url)

# Step 2: Define the get_statistics function
def get_statistics(df):
    # Total number of passengers
    total_passengers = len(df)
    
    # Age of the youngest passenger
    youngest_age = df['Age'].min()
    
    # The most expensive ticket price
    most_expensive_ticket = df['Fare'].max()
    
    # The range of ticket prices
    fare_range = df['Fare'].max() - df['Fare'].min()
    
    # The number of passengers with cabins
    passengers_with_cabins = df['Cabin'].notna().sum()
    
    # The code for the port where the highest number of passengers embarked
    most_common_port = df['Embarked'].mode()[0]
    
    # The most populous gender
    most_populous_gender = df['Sex'].mode()[0]
    
    # The standard deviation for age
    age_std_dev = df['Age'].std()
    
    # Return the statistics in a dictionary
    return {
        'Total Passengers': total_passengers,
        'Youngest Age': youngest_age,
        'Most Expensive Ticket': most_expensive_ticket,
        'Fare Range': fare_range,
        'Passengers with Cabins': passengers_with_cabins,
        'Most Common Port': most_common_port,
        'Most Populous Gender': most_populous_gender,
        'Age Standard Deviation': age_std_dev
    }

# Step 3: Call the get_statistics function with the Titanic dataset
statistics = get_statistics(titanic)
for stat, value in statistics.items():
    print(f"{stat}: {value}")

# Step 4: Test
actual = get_statistics(titanic)
expected = (891, 0.42, 512.3292, 512.3292, 204, 'S', 'male', 14.526497332334042)

if actual == expected:
    print("Test passed", actual)
else:
    print("Test failed, expected", expected, "but got", actual)


Total Passengers: 891
Youngest Age: 0.42
Most Expensive Ticket: 512.3292
Fare Range: 512.3292
Passengers with Cabins: 204
Most Common Port: S
Most Populous Gender: male
Age Standard Deviation: 14.526497332334044
Test failed, expected (891, 0.42, 512.3292, 512.3292, 204, 'S', 'male', 14.526497332334042) but got {'Total Passengers': 891, 'Youngest Age': 0.42, 'Most Expensive Ticket': 512.3292, 'Fare Range': 512.3292, 'Passengers with Cabins': 204, 'Most Common Port': 'S', 'Most Populous Gender': 'male', 'Age Standard Deviation': 14.526497332334044}


In [11]:
# Exercise 3 - aggregating statistics grouped by category

import pandas as pd

# Step 1: Read the Titanic dataset into a pandas DataFrame
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv"
titanic = pd.read_csv(url)

# Step 2: Define the get_grouped function
def get_grouped():
    df = titanic
    
    # The mean age for male versus female Titanic passengers
    mean_age_by_sex = df.groupby('Sex')['Age'].mean()
    print("Mean Age by Sex:")
    print(mean_age_by_sex)
    print()
    
    # The mean ticket fare price for each of the sex and cabin class combinations
    mean_fare_by_class_and_sex = df.groupby(['Pclass', 'Sex'])['Fare'].mean()
    print("Mean Fare by Class and Sex:")
    print(mean_fare_by_class_and_sex)
    print()
    
    # The mean ticket fare price for passengers who embarked at each port
    mean_fare_by_embarked = df.groupby('Embarked')['Fare'].mean()
    print("Mean Fare by Embarked Port:")
    print(mean_fare_by_embarked)
    print()
    
    # The number of survivors in each passenger class
    survivors_by_class = df[df['Survived'] == 1].groupby('Pclass')['Survived'].count()
    print("Number of Survivors by Class:")
    print(survivors_by_class)

# Step 3: Call the get_grouped function to display the results
get_grouped()


Mean Age by Sex:
Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

Mean Fare by Class and Sex:
Pclass  Sex   
1       female    106.125798
        male       67.226127
2       female     21.970121
        male       19.741782
3       female     16.118810
        male       12.661633
Name: Fare, dtype: float64

Mean Fare by Embarked Port:
Embarked
C    59.954144
Q    13.276030
S    27.079812
Name: Fare, dtype: float64

Number of Survivors by Class:
Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64


In [22]:
# Exercise 4 - using iloc
import pandas as pd

# Step 1: Read the Titanic dataset into a pandas DataFrame
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv"
titanic = pd.read_csv(url)

# Step 2: Define the get_middle function
def get_middle():
    df = titanic
    num_records = df.shape[0]
    
    # Calculate the starting index for the middle 20 records
    start_index = (num_records - 20) // 2
    
    # Select the middle 20 records using iloc
    middle_records = df.iloc[start_index:start_index + 20]
    
    return middle_records

# Step 3: Call the get_middle function to test it
actual = get_middle().index[0]
expected = 435

if actual == expected:
    print("Test passed", actual)
else:
    print("Test failed, expected index of", expected, "but got", actual)

# Display the middle 20 records
display(get_middle())


Test passed 435


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
435,436,1,1,"Carter, Miss Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S
436,437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21.0,2,2,W./C. 6608,34.375,,S
437,438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24.0,2,3,29106,18.75,,S
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
440,441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45.0,1,1,F.C.C. 13529,26.25,,S
441,442,0,3,"Hampe, Mr. Leon",male,20.0,0,0,345769,9.5,,S
442,443,0,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,S
443,444,1,2,"Reynaldo, Ms. Encarnacion",female,28.0,0,0,230434,13.0,,S
444,445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S


In [54]:
# Exercise 5 - migration to and from

import pandas as pd

# Define the URL for the Excel file
url = "https://github.com/futureCodersSE/working-with-data/blob/main/Data%20sets/public_use-talent-migration.xlsx?raw=true"

# Read the "Country Migration" sheet into a DataFrame
def read_country_migration():
    return pd.read_excel(url, sheet_name="Country Migration")

# Function to clean column names by stripping any extra spaces
def clean_column_names(df):
    df.columns = df.columns.str.strip()
    return df

# Inspect and clean column names
def inspect_cleaned_columns():
    country_df = read_country_migration()
    country_df = clean_column_names(country_df)
    print("Cleaned Columns in the DataFrame:", country_df.columns.tolist())
    print("\nFirst few rows of the DataFrame:")
    print(country_df.head())

# Check column names and data availability
inspect_cleaned_columns()

# Define the function to get rows where migration is to the UK
def get_uk_mig():
    country_df = read_country_migration()
    country_df = clean_column_names(country_df)
    
    # Print columns to ensure the correct column names are used
    print("Cleaned Columns in the DataFrame (inside get_uk_mig):", country_df.columns.tolist())
    
    # Filter rows where migration is to the United Kingdom
    uk_migration = country_df[country_df['target_country_name'] == 'United Kingdom']
    return uk_migration

# Run and test the function
uk_migration_df = get_uk_mig()
actual = len(uk_migration_df)
expected = 122

if actual == expected:
    print("Test passed", actual)
else:
    print("Test failed, expected", expected, "but got", actual)



Cleaned Columns in the DataFrame: ['base_country_code', 'base_country_name', 'base_lat', 'base_long', 'base_country_wb_income', 'base_country_wb_region', 'target_country_code', 'target_country_name', 'target_lat', 'target_long', 'target_country_wb_income', 'target_country_wb_region', 'net_per_10K_2015', 'net_per_10K_2016', 'net_per_10K_2017', 'net_per_10K_2018', 'net_per_10K_2019']

First few rows of the DataFrame:
  base_country_code     base_country_name   base_lat  base_long  \
0                ae  United Arab Emirates  23.424076  53.847818   
1                ae  United Arab Emirates  23.424076  53.847818   
2                ae  United Arab Emirates  23.424076  53.847818   
3                ae  United Arab Emirates  23.424076  53.847818   
4                ae  United Arab Emirates  23.424076  53.847818   

  base_country_wb_income      base_country_wb_region target_country_code  \
0            High Income  Middle East & North Africa                  af   
1            High Income  

In [56]:
# Exercise 6 - how many countries are migrated to

import pandas as pd

# Define the URL for the Excel file
url = "https://github.com/futureCodersSE/working-with-data/blob/main/Data%20sets/public_use-talent-migration.xlsx?raw=true"

# Read the "Country Migration" sheet into a DataFrame
def read_country_migration():
    return pd.read_excel(url, sheet_name="Country Migration")

# Function to clean column names by stripping any extra spaces
def clean_column_names(df):
    df.columns = df.columns.str.strip()
    return df

# Define the function to get the number of unique countries migrated from
def migration():
    country_df = read_country_migration()
    country_df = clean_column_names(country_df)
    
    # Inspect columns to get the exact name for the 'from' country
    print("Cleaned Columns in the DataFrame (inside migration):", country_df.columns.tolist())
    
    # Column that contains the country names from which people have migrated
    from_countries = country_df['base_country_name']
    
    # Count unique countries
    unique_from_countries = from_countries.nunique()
    return unique_from_countries

# Run and test the function
actual = migration()
expected = 140

if actual == expected:
    print("Test passed", actual)
else:
    print("Test failed, expected", expected, "but got", actual)


Cleaned Columns in the DataFrame (inside migration): ['base_country_code', 'base_country_name', 'base_lat', 'base_long', 'base_country_wb_income', 'base_country_wb_region', 'target_country_code', 'target_country_name', 'target_lat', 'target_long', 'target_country_wb_income', 'target_country_wb_region', 'net_per_10K_2015', 'net_per_10K_2016', 'net_per_10K_2017', 'net_per_10K_2018', 'net_per_10K_2019']
Test passed 140
