# Transforming DataFrames

The homelessness DataFrame contains estimates of homelessness in each U.S. state in 2018. 

The `individuals` column is the number of homeless individuals who are not part of a family with children. 

The `family_members` column is the number of homeless individuals who are part of a family with children. 

The state_pop column is the state's total population.

In [None]:
import pandas as pd

# Location of the homelessness CSV, held in one named constant so it is easy to spot and retarget.
# NOTE(review): this is an absolute, environment-specific path - confirm it exists where the notebook runs.
HOMELESSNESS_CSV = '/work/data_science_notes/3. Data manipulation with pandasw/data/homelessness.csv'

# index_col=0 makes the first CSV column the row index instead of loading it as a data column.
homelessness = pd.read_csv(HOMELESSNESS_CSV, index_col=0)

## Introducing DataFrames

### Inspecting a DataFrame


In [None]:
# Preview the first five rows of the homelessness DataFrame
homelessness.head(n=5)

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


In [None]:
# Print information about the column types and missing values in homelessness
# (.info() prints the index range, each column's non-null count and dtype, and the memory usage)
homelessness.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   region          51 non-null     object 
 1   state           51 non-null     object 
 2   individuals     51 non-null     float64
 3   family_members  51 non-null     float64
 4   state_pop       51 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 4.4+ KB


In [None]:
# Print the number of rows and columns in homelessness, as a (rows, columns) tuple
homelessness.shape

(51, 5)

In [None]:
# Print some summary statistics that describe the homelessness DataFrame
# (count, mean, std, min, quartiles and max of the numeric columns)
homelessness.describe()

Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0
mean,7225.784314,3504.882353,6405637.0
std,15991.025083,7805.411811,7327258.0
min,434.0,75.0,577601.0
25%,1446.5,592.0,1777414.0
50%,3082.0,1482.0,4461153.0
75%,6781.5,3196.0,7340946.0
max,109008.0,52070.0,39461590.0


### Parts of a DataFrame


In [None]:
# Print a 2D NumPy array of the values in homelessness
# .to_numpy() is the accessor pandas recommends over the older .values attribute; it returns the same array here
homelessness.to_numpy()

array([['East South Central', 'Alabama', 2570.0, 864.0, 4887681],
       ['Pacific', 'Alaska', 1434.0, 582.0, 735139],
       ['Mountain', 'Arizona', 7259.0, 2606.0, 7158024],
       ['West South Central', 'Arkansas', 2280.0, 432.0, 3009733],
       ['Pacific', 'California', 109008.0, 20964.0, 39461588],
       ['Mountain', 'Colorado', 7607.0, 3250.0, 5691287],
       ['New England', 'Connecticut', 2280.0, 1696.0, 3571520],
       ['South Atlantic', 'Delaware', 708.0, 374.0, 965479],
       ['South Atlantic', 'District of Columbia', 3770.0, 3134.0, 701547],
       ['South Atlantic', 'Florida', 21443.0, 9587.0, 21244317],
       ['South Atlantic', 'Georgia', 6943.0, 2556.0, 10511131],
       ['Pacific', 'Hawaii', 4131.0, 2399.0, 1420593],
       ['Mountain', 'Idaho', 1297.0, 715.0, 1750536],
       ['East North Central', 'Illinois', 6752.0, 3891.0, 12723071],
       ['East North Central', 'Indiana', 3776.0, 1482.0, 6695497],
       ['West North Central', 'Iowa', 1711.0, 1038.0, 3148618]

In [None]:
# Print the column names of homelessness (returned as a pandas Index of labels)
homelessness.columns

Index(['region', 'state', 'individuals', 'family_members', 'state_pop'], dtype='object')

In [None]:
# Print the index (the row labels) of homelessness
homelessness.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
            50],
           dtype='int64')

## Sorting and subsetting


### Sorting rows


In [None]:
# Sort homelessness by the number of homeless individuals, from smallest to largest, and save this as homelessness_ind
# Print the head of the sorted DataFrame

# Keep the complete sorted DataFrame in homelessness_ind; only the displayed preview is limited to the head
homelessness_ind = homelessness.sort_values('individuals')
homelessness_ind.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
50,Mountain,Wyoming,434.0,205.0,577601
34,West North Central,North Dakota,467.0,75.0,758080
7,South Atlantic,Delaware,708.0,374.0,965479
39,New England,Rhode Island,747.0,354.0,1058287
45,New England,Vermont,780.0,511.0,624358


In [None]:
# Order the states by homeless family members, largest first, and keep the result as homelessness_fam
# Then preview the first rows of the sorted DataFrame
homelessness_fam = homelessness.sort_values(by='family_members', ascending=False)
homelessness_fam.head(n=5)

Unnamed: 0,region,state,individuals,family_members,state_pop
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
4,Pacific,California,109008.0,20964.0,39461588
21,New England,Massachusetts,6811.0,13257.0,6882635
9,South Atlantic,Florida,21443.0,9587.0,21244317
43,West South Central,Texas,19199.0,6111.0,28628666


In [None]:
# Two-level sort: region A-Z first, then family_members from largest to smallest within each region
# The result is stored as homelessness_reg_fam
# Preview the first ten rows of the sorted DataFrame

homelessness_reg_fam = homelessness.sort_values(
    by=['region', 'family_members'],
    ascending=[True, False],
)
homelessness_reg_fam.head(n=10)

Unnamed: 0,region,state,individuals,family_members,state_pop
13,East North Central,Illinois,6752.0,3891.0,12723071
35,East North Central,Ohio,6929.0,3320.0,11676341
22,East North Central,Michigan,5209.0,3142.0,9984072
49,East North Central,Wisconsin,2740.0,2167.0,5807406
14,East North Central,Indiana,3776.0,1482.0,6695497
42,East South Central,Tennessee,6139.0,1744.0,6771631
17,East South Central,Kentucky,2735.0,953.0,4461153
0,East South Central,Alabama,2570.0,864.0,4887681
24,East South Central,Mississippi,1024.0,328.0,2981020
32,Mid-Atlantic,New York,39827.0,52070.0,19530351


### Subsetting columns

In [None]:
# Select only the individuals column of homelessness and store it as individuals
# Note: single-bracket selection returns a pandas Series, not a DataFrame
# Print the head of the result
individuals = homelessness['individuals']
individuals.head() 

0      2570.0
1      1434.0
2      7259.0
3      2280.0
4    109008.0
Name: individuals, dtype: float64

In [None]:
# Build state_fam: a DataFrame holding just the state and family_members columns of homelessness, in that order
# Then preview the first rows of the result
state_fam = homelessness.loc[:, ['state', 'family_members']]
state_fam.head(n=5)

Unnamed: 0,state,family_members
0,Alabama,864.0
1,Alaska,582.0
2,Arizona,2606.0
3,Arkansas,432.0
4,California,20964.0


In [None]:
# Build ind_state: a DataFrame holding the individuals and state columns of homelessness, in that order
# Then preview the first rows of the result

ind_state = homelessness.loc[:, ['individuals', 'state']]
ind_state.head(n=5)

Unnamed: 0,individuals,state
0,2570.0,Alabama
1,1434.0,Alaska
2,7259.0,Arizona
3,2280.0,Arkansas
4,109008.0,California


In [None]:
# Keep only the rows with more than ten thousand homeless individuals and store them as ind_gt_10k
# .gt(10000) builds the same boolean row mask as the `> 10000` comparison
# Display the filtered rows
ind_gt_10k = homelessness[homelessness['individuals'].gt(10000)]
ind_gt_10k

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588
9,South Atlantic,Florida,21443.0,9587.0,21244317
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
37,Pacific,Oregon,11139.0,3337.0,4181886
43,West South Central,Texas,19199.0,6111.0,28628666
47,Pacific,Washington,16424.0,5880.0,7523869


### Subsetting rows by categorical variables

In [None]:
# Keep only the rows whose USA Census region is "Mountain" and store them as mountain_reg
# .eq('Mountain') builds the same boolean row mask as the `== 'Mountain'` comparison
# Display the filtered rows
mountain_reg = homelessness[homelessness['region'].eq('Mountain')]
mountain_reg

Unnamed: 0,region,state,individuals,family_members,state_pop
2,Mountain,Arizona,7259.0,2606.0,7158024
5,Mountain,Colorado,7607.0,3250.0,5691287
12,Mountain,Idaho,1297.0,715.0,1750536
26,Mountain,Montana,983.0,422.0,1060665
28,Mountain,Nevada,7058.0,486.0,3027341
31,Mountain,New Mexico,1949.0,602.0,2092741
44,Mountain,Utah,1904.0,972.0,3153550
50,Mountain,Wyoming,434.0,205.0,577601


In [None]:
# Keep the rows that satisfy BOTH conditions and store them as fam_lt_1k_pac:
#   - fewer than one thousand homeless family members
#   - region is "Pacific"
# Each condition is parenthesised because & binds more tightly than the comparison operators
fam_lt_1k_pac = homelessness[
    (homelessness['family_members'] < 1000)
    & (homelessness['region'] == 'Pacific')
]
fam_lt_1k_pac

Unnamed: 0,region,state,individuals,family_members,state_pop
1,Pacific,Alaska,1434.0,582.0,735139


In [None]:
# Keep the rows whose USA census region is either "South Atlantic" or "Mid-Atlantic"
# and store them as south_mid_atlantic
# The two equality masks are combined with | (element-wise OR)
south_mid_atlantic = homelessness[
    (homelessness['region'] == 'South Atlantic')
    | (homelessness['region'] == 'Mid-Atlantic')
]
south_mid_atlantic

Unnamed: 0,region,state,individuals,family_members,state_pop
7,South Atlantic,Delaware,708.0,374.0,965479
8,South Atlantic,District of Columbia,3770.0,3134.0,701547
9,South Atlantic,Florida,21443.0,9587.0,21244317
10,South Atlantic,Georgia,6943.0,2556.0,10511131
20,South Atlantic,Maryland,4914.0,2230.0,6035802
30,Mid-Atlantic,New Jersey,6048.0,3350.0,8886025
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
33,South Atlantic,North Carolina,6451.0,2817.0,10381615
38,Mid-Atlantic,Pennsylvania,8163.0,5349.0,12800922
40,South Atlantic,South Carolina,3082.0,851.0,5084156


In [None]:
# Filter homelessness for cases where the USA census region is "South Atlantic" or "Mid-Atlantic",
# this time using .isin(), assigning to south_mid_atlantic_isin
# Note: .isin() returns a boolean Series, so it is used as a row mask to index homelessness
south_mid_atlantic_isin = homelessness[homelessness['region'].isin(['South Atlantic', 'Mid-Atlantic'])]
south_mid_atlantic_isin

Unnamed: 0,region,state,individuals,family_members,state_pop
7,South Atlantic,Delaware,708.0,374.0,965479
8,South Atlantic,District of Columbia,3770.0,3134.0,701547
9,South Atlantic,Florida,21443.0,9587.0,21244317
10,South Atlantic,Georgia,6943.0,2556.0,10511131
20,South Atlantic,Maryland,4914.0,2230.0,6035802
30,Mid-Atlantic,New Jersey,6048.0,3350.0,8886025
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
33,South Atlantic,North Carolina,6451.0,2817.0,10381615
38,Mid-Atlantic,Pennsylvania,8163.0,5349.0,12800922
40,South Atlantic,South Carolina,3082.0,851.0,5084156


In [None]:
# canu lists the Mojave states; keep the rows of homelessness whose state appears in that list
# and store them as mojave_homelessness
# Display the filtered rows
canu = ['California', 'Arizona', 'Nevada', 'Utah']
in_mojave = homelessness['state'].isin(canu)  # boolean row mask
mojave_homelessness = homelessness[in_mojave]
mojave_homelessness

Unnamed: 0,region,state,individuals,family_members,state_pop
2,Mountain,Arizona,7259.0,2606.0,7158024
4,Pacific,California,109008.0,20964.0,39461588
28,Mountain,Nevada,7058.0,486.0,3027341
44,Mountain,Utah,1904.0,972.0,3153550


## New columns

In [None]:
# New column total = individuals + family_members, computed row by row
# Note: this adds the column to homelessness itself
homelessness['total'] = homelessness['individuals'].add(homelessness['family_members'])
homelessness['total'].head(n=5)

0      3434.0
1      2016.0
2      9865.0
3      2712.0
4    129972.0
Name: total, dtype: float64

In [None]:
# New column p_individuals: the share of each state's homeless people (total) who are individuals
# Requires the total column to exist on homelessness
homelessness['p_individuals'] = homelessness['individuals'].div(homelessness['total'])
homelessness['p_individuals'].head(n=5)

0    0.748398
1    0.711310
2    0.735834
3    0.840708
4    0.838704
Name: p_individuals, dtype: float64

In [None]:
# New column indiv_per_10k: homeless individuals per ten thousand residents of each state
# (individuals divided by state_pop, then scaled by 10000)
homelessness['indiv_per_10k'] = homelessness['individuals'].div(homelessness['state_pop']).mul(10000)

In [None]:
# Keep the rows where indiv_per_10k exceeds 20 and store them as high_homelessness
# Display the filtered rows
high_homelessness = homelessness[homelessness['indiv_per_10k'] > 20]
high_homelessness

Unnamed: 0,region,state,individuals,family_members,state_pop,total,p_individuals,indiv_per_10k
4,Pacific,California,109008.0,20964.0,39461588,129972.0,0.838704,27.623825
8,South Atlantic,District of Columbia,3770.0,3134.0,701547,6904.0,0.54606,53.738381
11,Pacific,Hawaii,4131.0,2399.0,1420593,6530.0,0.632619,29.079406
28,Mountain,Nevada,7058.0,486.0,3027341,7544.0,0.935578,23.314189
32,Mid-Atlantic,New York,39827.0,52070.0,19530351,91897.0,0.433387,20.392363
37,Pacific,Oregon,11139.0,3337.0,4181886,14476.0,0.769481,26.636307
47,Pacific,Washington,16424.0,5880.0,7523869,22304.0,0.73637,21.829195


In [None]:
# Order high_homelessness from the highest indiv_per_10k to the lowest, storing the result as high_homelessness_srt
high_homelessness_srt = high_homelessness.sort_values(by='indiv_per_10k', ascending=False)

In [None]:
# Reduce high_homelessness_srt to just the state and indiv_per_10k columns and store that as result
# Display the result
result = high_homelessness_srt.loc[:, ['state', 'indiv_per_10k']]
result

Unnamed: 0,state,indiv_per_10k
8,District of Columbia,53.738381
11,Hawaii,29.079406
4,California,27.623825
37,Oregon,26.636307
28,Nevada,23.314189
47,Washington,21.829195
32,New York,20.392363


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8c91f87e-48d2-4010-b7ff-56410ab49fe5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>