In [3]:
# Exercise 1 - combine the two data sets

import pandas as pd

# Load the county-level and state-level sheets; both live in the same workbook,
# so the URL is written once and the sheet name is the only thing that varies
county_level_df, state_level_df = (
    pd.read_excel("https://github.com/futureCodersSE/python-programming-for-data/blob/main/Datasets/Income-Data.xlsx?raw=true", sheet_name=sheet)
    for sheet in ('county-level', 'state-level')
)

def combine_whole():
    """Stack the county-level rows on top of the state-level rows.

    Reads the module-level ``county_level_df`` and ``state_level_df``.

    Returns:
        A new DataFrame holding every column found in either input, with a
        fresh 0..n-1 index. Cells for columns an input does not have are NaN.
    """
    frames = [county_level_df, state_level_df]
    return pd.concat(frames, sort=False, ignore_index=True)

# Build the combined DataFrame by calling the function
income_df = combine_whole()

# Preview each input frame, then the combined result
print("County Level DataFrame:", county_level_df.head(), sep="\n")
print("\nState Level DataFrame:", state_level_df.head(), sep="\n")
print("\nCombined DataFrame:", income_df.head(), sep="\n")

# Check: the combined DataFrame is expected to contain at least one null value
expected = True
actual = income_df.isnull().values.any()

if actual != expected:
    print("Test failed, expected", expected, "got", actual)
else:
    print("Test passed", actual)


County Level DataFrame:
  State  County  Population  Age  Income
0    TX       1          72   34      65
1    TX       2          33   42      45
2    TX       5          25   23      46
3    TX       6          54   36      65
4    TX       7          11   42      53

State Level DataFrame:
  State    Pop  Age  Income  Education
0    TX  23543   32      54       10.2
1    MD  10343   29      69       10.3
2    IN   5231   41      35       10.1
3    CA  29587   35      67       10.4
4    NY  18142   34      78       10.2

Combined DataFrame:
  State  County  Population  Age  Income  Pop  Education
0    TX     1.0        72.0   34      65  NaN        NaN
1    TX     2.0        33.0   42      45  NaN        NaN
2    TX     5.0        25.0   23      46  NaN        NaN
3    TX     6.0        54.0   36      65  NaN        NaN
4    TX     7.0        11.0   42      53  NaN        NaN
5    TX     8.0        28.0   25      62  NaN        NaN
6    TX     9.0        82.0   35      66  NaN       

In [5]:
# Exercise 2 - ignoring index to get a new indexing system

import pandas as pd

# Load the county-level and state-level sheets from the shared workbook
county_level_df, state_level_df = (
    pd.read_excel("https://github.com/futureCodersSE/python-programming-for-data/blob/main/Datasets/Income-Data.xlsx?raw=true", sheet_name=sheet)
    for sheet in ('county-level', 'state-level')
)

def combine_common():
    """Concatenate county and state rows, keeping only the columns both share.

    Reads the module-level ``county_level_df`` and ``state_level_df``.

    Returns:
        A new DataFrame with the county rows followed by the state rows,
        restricted to their common columns and re-indexed from 0.
    """
    frames = [county_level_df, state_level_df]
    return pd.concat(frames, join='inner', ignore_index=True)

# Create the combined DataFrame from the function
income_df = combine_common()

# Preview the combined DataFrame and summarise its index and columns
print("Combined DataFrame:")
print(income_df.head())  # Display the first few rows
print("\nCombined DataFrame Info:")
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output
income_df.info()

# Check that the new index ends at 22 and that exactly 3 columns remain
actual_last_index = income_df.index[-1]
expected_last_index = 22

if actual_last_index == expected_last_index and len(income_df.columns) == 3:
    print("Test passed", actual_last_index, len(income_df.columns))
else:
    print("Test failed, expected last row index of", expected_last_index, "got", actual_last_index, "and expected 3 columns but got", len(income_df.columns))


Combined DataFrame:
  State  Age  Income
0    TX   34      65
1    TX   42      45
2    TX   23      46
3    TX   36      65
4    TX   42      53

Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   State   23 non-null     object
 1   Age     23 non-null     int64 
 2   Income  23 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 684.0+ bytes
None
Test passed 22 3


In [7]:
# Exercise 3 - add new rows to the end of the income dataframe

import pandas as pd

# Load the county-level and state-level sheets from the shared workbook
county_level_df, state_level_df = (
    pd.read_excel("https://github.com/futureCodersSE/python-programming-for-data/blob/main/Datasets/Income-Data.xlsx?raw=true", sheet_name=sheet)
    for sheet in ('county-level', 'state-level')
)

# Combine the county and state DataFrames
def combine_common():
    """Return county rows followed by state rows, limited to shared columns.

    Reads the module-level ``county_level_df`` and ``state_level_df`` and
    gives the result a fresh 0..n-1 index.
    """
    return pd.concat(
        [county_level_df, state_level_df],
        join='inner',
        ignore_index=True,
    )

# Build the county + state combination that the new records will be appended to
income_df = combine_common()

# Load the additional income records from the 'income' sheet of the same workbook
income_new = pd.read_excel(
    io="https://github.com/futureCodersSE/python-programming-for-data/blob/main/Datasets/Income-Data.xlsx?raw=true",
    sheet_name='income',
)

def combine_income():
    """Append the new income records to the end of the combined income data.

    Reads the module-level ``income_df`` and ``income_new``.

    Returns:
        A new DataFrame with the ``income_df`` rows followed by the
        ``income_new`` rows, re-indexed from 0. Columns present in only one
        of the inputs are kept and filled with NaN for the other's rows.
    """
    stacked = pd.concat((income_df, income_new), ignore_index=True)
    return stacked

# Save the returned DataFrame in a variable
income_df_v2 = combine_income()

# Preview the extended DataFrame and summarise its structure
print("Combined DataFrame with New Records:")
print(income_df_v2.head())  # Display the first few rows

print("\nCombined DataFrame Info:")
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output
income_df_v2.info()

# Check that the new index ends at 32 and that the DataFrame has 4 columns
actual_last_index = income_df_v2.index[-1]
expected_last_index = 32

if actual_last_index == expected_last_index and len(income_df_v2.columns) == 4:
    print("Test passed", actual_last_index)
else:
    print("Test failed, expected last row index of", expected_last_index, "got", actual_last_index, "and expected 4 columns but got", len(income_df_v2.columns))


Combined DataFrame with New Records:
  State  Age  Income  Pop
0    TX   34      65  NaN
1    TX   42      45  NaN
2    TX   23      46  NaN
3    TX   36      65  NaN
4    TX   42      53  NaN

Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   State   33 non-null     object 
 1   Age     33 non-null     int64  
 2   Income  33 non-null     int64  
 3   Pop     10 non-null     float64
dtypes: float64(1), int64(2), object(1)
memory usage: 1.2+ KB
None
Test passed 32


In [9]:
#Exercise 4 - Create a new Pop column in county_level_df

import pandas as pd

# Load the county-level sheet, which create_pop() reads as its input
county_level_df = pd.read_excel(
    io="https://github.com/futureCodersSE/python-programming-for-data/blob/main/Datasets/Income-Data.xlsx?raw=true",
    sheet_name='county-level',
)

def create_pop():
    """Return the county-level data with ``Population`` replaced by ``Pop``.

    Reads the module-level ``county_level_df`` and leaves it unmodified, so
    re-running the cell (or calling the function again from a later cell)
    always starts from the same input.

    Returns:
        A new DataFrame without the 'Population' column and with a 'Pop'
        column holding the same values, placed as the last column.
    """
    # Drop first, then attach the values under the new name, so that 'Pop'
    # ends up at the end of the column order rather than in Population's slot.
    return (
        county_level_df
        .drop(columns=['Population'])
        .assign(Pop=county_level_df['Population'])
    )

# Build the renamed DataFrame; the checks below read it as `actual`
actual = create_pop()

# Preview the first few rows of the result
print("Updated DataFrame with Pop Column:", actual.head(), sep="\n")

# Check that the Pop column is present and that there are 5 columns in total
if 'Pop' not in actual.columns:
    print("Test not passed, column Pop not present")
elif len(actual.columns) == 5:
    print("Test passed, contains 5 columns including Pop column")
else:
    print("Test not passed, expected 5 columns, instead got", len(actual.columns))


Updated DataFrame with Pop Column:
  State  County  Age  Income  Pop
0    TX       1   34      65   72
1    TX       2   42      45   33
2    TX       5   23      46   25
3    TX       6   36      65   54
4    TX       7   42      53   11
Test passed, contains 5 columns including Pop column


In [11]:
# Exercise 5 - clean up the format of the Pop column in state_level_df

import pandas as pd

# Load the state-level and county-level sheets from the shared workbook
state_level_df, county_level_df = (
    pd.read_excel("https://github.com/futureCodersSE/python-programming-for-data/blob/main/Datasets/Income-Data.xlsx?raw=true", sheet_name=sheet)
    for sheet in ('state-level', 'county-level')
)

# Keep a separate deep copy of the state-level data for clean_pop() to read
state_level_df_copy = state_level_df.copy(deep=True)

def clean_pop():
    """Return the state-level data with ``Pop`` multiplied by 1000 and rounded.

    Reads the module-level ``state_level_df_copy`` without modifying it. The
    scaled values go into a new DataFrame, so calling the function more than
    once (it is called again in a later cell) yields the same result each time
    instead of scaling the column repeatedly.

    Returns:
        A new DataFrame with the same columns as ``state_level_df_copy`` and
        the 'Pop' column scaled by 1000.
    """
    # NOTE(review): the check in this cell expects a max Pop of 29000, which a
    # x1000 scaling of the loaded values does not produce — confirm the
    # intended conversion against the exercise text.
    scaled_pop = (state_level_df_copy['Pop'] * 1000).round()
    return state_level_df_copy.assign(Pop=scaled_pop)

# Build the cleaned state-level DataFrame
state_level_df_v2 = clean_pop()

# Preview the first few rows to verify the change
print("Updated state_level_df_v2:", state_level_df_v2.head(), sep="\n")

# Check the largest Pop value and that the DataFrame still has 5 columns
expected = 29000
actual = state_level_df_v2['Pop'].max()

if actual != expected or len(state_level_df_v2.columns) != 5:
    print("Test failed, expected max of", expected, "got", actual)
else:
    print("Test passed, max Pop is", actual)


Updated state_level_df_v2:
  State       Pop  Age  Income  Education
0    TX  23543000   32      54       10.2
1    MD  10343000   29      69       10.3
2    IN   5231000   41      35       10.1
3    CA  29587000   35      67       10.4
4    NY  18142000   34      78       10.2
Test failed, expected max of 29000 got 29587000


In [17]:
# Exercise 6 - combine the two v2 dataframes using concat

import pandas as pd

# create_pop() and clean_pop() are defined in the Exercise 4 and Exercise 5
# cells, so those cells must have been run before this one.

# Build both v2 DataFrames (county first, then state)
county_level_df_v2, state_level_df_v2 = create_pop(), clean_pop()

def combine_v2():
    """Concatenate the two v2 DataFrames, keeping only their shared columns.

    Reads the module-level ``county_level_df_v2`` and ``state_level_df_v2``.

    Returns:
        A new DataFrame with the county rows followed by the state rows,
        restricted to the columns both frames have and re-indexed from 0.
    """
    v2_frames = [county_level_df_v2, state_level_df_v2]
    return pd.concat(v2_frames, ignore_index=True, join='inner')

# Build the combined v2 DataFrame, then check its dimensions
income_df_v3 = combine_v2()

# Expected size: 23 rows and 4 columns
expected, expected2 = 23, 4
actual, actual2 = income_df_v3.shape

if actual != expected or actual2 != expected2:
    print("Test failed, expected", expected, "rows", expected2, "columns but got", actual, "rows and", actual2, "columns")
else:
    print("Test passed", actual, "rows and 4 columns")


Test passed 23 rows and 4 columns


In [19]:
# Exercise 7 - Combining dataframes using merge

import pandas as pd

# Load the skill and industry migration sheets; both live in the same workbook
skill_df, industry_df = (
    pd.read_excel("https://github.com/futureCodersSE/working-with-data/blob/main/Data%20sets/public_use-talent-migration.xlsx?raw=true", sheet_name=sheet)
    for sheet in ("Skill Migration", "Industry Migration")
)

def get_combine():
    """Merge the skill and industry migration tables and print a summary.

    Reads the module-level ``skill_df`` and ``industry_df``. Only four columns
    are taken from each table. Rows are paired with an inner merge where
    ``country_name`` agrees and the skill table's ``skill_group_name`` equals
    the industry table's ``industry_name``. The ``wb_income`` and
    ``net_per_10K_2019`` columns exist in both tables, so the merged frame
    carries them with ``_x`` (skill) and ``_y`` (industry) suffixes.

    Side effect:
        Prints the merged DataFrame's ``.info()`` report.

    Returns:
        The merged DataFrame.
    """
    skill_columns = ['country_name', 'skill_group_name', 'wb_income', 'net_per_10K_2019']
    industry_columns = ['country_name', 'industry_name', 'wb_income', 'net_per_10K_2019']

    migration_df = pd.merge(
        skill_df[skill_columns],
        industry_df[industry_columns],
        how='inner',
        left_on=['country_name', 'skill_group_name'],
        right_on=['country_name', 'industry_name'],
    )

    # info() writes its report to stdout itself and returns None, so it is
    # called directly; print()-ing its result would emit a stray "None" line.
    migration_df.info()

    return migration_df

# Build the merged DataFrame, then check its dimensions
merged = get_combine()

# Expected size: 873 rows and 7 columns
actual, actual2 = merged.shape
expected, expected2 = 873, 7

if actual != expected or actual2 != expected2:
    print("Test failed, expected", expected, "rows", expected2, "columns but got", actual, "rows and", actual2, "columns")
else:
    print("Test passed", actual, "rows", actual2, "columns")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 873 entries, 0 to 872
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        873 non-null    object 
 1   skill_group_name    873 non-null    object 
 2   wb_income_x         873 non-null    object 
 3   net_per_10K_2019_x  873 non-null    float64
 4   industry_name       873 non-null    object 
 5   wb_income_y         873 non-null    object 
 6   net_per_10K_2019_y  873 non-null    float64
dtypes: float64(2), object(5)
memory usage: 47.9+ KB
None
Test passed 873 rows 7 columns


In [25]:
# Exercise 8 - pivot table of state and population

import pandas as pd
import numpy as np

def create_pivot(df):
    """Build a pivot table of the mean ``Pop`` value for each ``State``.

    Args:
        df: DataFrame containing at least the 'State' and 'Pop' columns.

    Returns:
        A DataFrame indexed by 'State' with a single 'Pop' column holding the
        mean of 'Pop' over that state's rows.
    """
    # The aggregation is passed by name: recent pandas releases emit a
    # FutureWarning when given the NumPy callable np.mean and recommend the
    # string 'mean', which computes the same result.
    df_pivot = pd.pivot_table(
        df,
        values='Pop',
        index='State',
        aggfunc='mean',
    )
    return df_pivot

# Rebuild income_df_v3 here; combine_v2() is defined in the Exercise 6 cell
income_df_v3 = combine_v2()

# Keep the pivot table in its own variable
population_pivot = create_pivot(income_df_v3)

# Check that the pivot table has the expected number of rows
expected = 5
actual = len(population_pivot)

if actual != expected:
    print("Test failed expected", expected, "got", actual)
else:
    print("Test passed", actual)


Test passed 5
