In [1]:
import pandas as pd
homelessness = pd.read_csv("homelessness.csv")
print(homelessness.head())

               region       state  individuals  family_members  state_pop
0  East South Central     Alabama       2570.0           864.0    4887681
1             Pacific      Alaska       1434.0           582.0     735139
2            Mountain     Arizona       7259.0          2606.0    7158024
3  West South Central    Arkansas       2280.0           432.0    3009733
4             Pacific  California     109008.0         20964.0   39461588


# Sorting
- The first thing you can do is change the order of the rows by sorting them so that the most interesting data is at the top of the DataFrame. You can sort rows using the sort_values method, passing in a column name that you want to sort by.

In [2]:
# Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals")

# Print the top few rows
print(homelessness_ind.head())

                region         state  individuals  family_members  state_pop
50            Mountain       Wyoming        434.0           205.0     577601
34  West North Central  North Dakota        467.0            75.0     758080
7       South Atlantic      Delaware        708.0           374.0     965479
39         New England  Rhode Island        747.0           354.0    1058287
45         New England       Vermont        780.0           511.0     624358


# Sorting in descending order
- Setting the ascending argument to False will sort the data the other way around

In [3]:
# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
print(homelessness_fam.head())

                region          state  individuals  family_members  state_pop
32        Mid-Atlantic       New York      39827.0         52070.0   19530351
4              Pacific     California     109008.0         20964.0   39461588
21         New England  Massachusetts       6811.0         13257.0    6882635
9       South Atlantic        Florida      21443.0          9587.0   21244317
43  West South Central          Texas      19199.0          6111.0   28628666


# Sorting by multiple variables
- We can sort by multiple variables by passing a list of column names to sort_values.
- To change the direction values are sorted in, pass a list to the ascending argument to specify which direction sorting should be done for each variable.

In [4]:
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending=[True, False])

# Print the top few rows
print(homelessness_fam.head())

                region          state  individuals  family_members  state_pop
32        Mid-Atlantic       New York      39827.0         52070.0   19530351
4              Pacific     California     109008.0         20964.0   39461588
21         New England  Massachusetts       6811.0         13257.0    6882635
9       South Atlantic        Florida      21443.0          9587.0   21244317
43  West South Central          Texas      19199.0          6111.0   28628666


# Subsetting columns
- We may want to zoom in on just one column. We can do this using the name of the DataFrame, followed by square brackets with a column name inside. Here, we can look at just the name column.

In [5]:
# Select the individuals column
individuals= homelessness["individuals"]
print(individuals.head())

0      2570.0
1      1434.0
2      7259.0
3      2280.0
4    109008.0
Name: individuals, dtype: float64


# Subsetting multiple columns
- To select multiple columns, you need two pairs of square brackets. In this code, the inner and outer square brackets are performing different tasks. The outer square brackets are responsible for subsetting the DataFrame, and the inner square brackets are creating a list of column names to subset. This means you could provide a separate list of column names as a variable and then use that list to perform the same subsetting. Usually, it's easier to do in one line.

In [6]:
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
print(state_fam.head())

        state  family_members
0     Alabama           864.0
1      Alaska           582.0
2     Arizona          2606.0
3    Arkansas           432.0
4  California         20964.0


In [7]:
# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals","state"]]
print(ind_state.head())

   individuals       state
0       2570.0     Alabama
1       1434.0      Alaska
2       7259.0     Arizona
3       2280.0    Arkansas
4     109008.0  California


# Subsetting rows
- There are lots of different ways to subset rows. The most common way to do this is by creating a logical condition to filter against.
- We can use the logical condition inside of square brackets to subset the rows we're interested in.

In [8]:
# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"]>10000]
print(ind_gt_10k)

                region       state  individuals  family_members  state_pop
4              Pacific  California     109008.0         20964.0   39461588
9       South Atlantic     Florida      21443.0          9587.0   21244317
32        Mid-Atlantic    New York      39827.0         52070.0   19530351
37             Pacific      Oregon      11139.0          3337.0    4181886
43  West South Central       Texas      19199.0          6111.0   28628666
47             Pacific  Washington      16424.0          5880.0    7523869


# Subsetting rows based on text data
- We can also subset rows based on text data. Here, we use the double equal sign in the logical condition to filter
- This also possible substting based on dates.

In [9]:
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"] == "Mountain"]
print(mountain_reg)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
5   Mountain    Colorado       7607.0          3250.0    5691287
12  Mountain       Idaho       1297.0           715.0    1750536
26  Mountain     Montana        983.0           422.0    1060665
28  Mountain      Nevada       7058.0           486.0    3027341
31  Mountain  New Mexico       1949.0           602.0    2092741
44  Mountain        Utah       1904.0           972.0    3153550
50  Mountain     Wyoming        434.0           205.0     577601


# Subsetting based on multiple conditions
- To subset the rows that meet multiple conditions, you can combine conditions using logical operators, such as the "and" operator seen here. This means that only rows that meet both of these conditions will be subsetted. You could also do this in one line of code, but you'll also need to add parentheses around each condition.

In [10]:
# Filter for rows where family_members is less than 1000 and region is Pacific
fam_lt_1k_pac= homelessness[(homelessness["family_members"]<1000) & (homelessness["region"]=="Pacific")]
print(fam_lt_1k_pac)

    region   state  individuals  family_members  state_pop
1  Pacific  Alaska       1434.0           582.0     735139


# Subsetting using .isin()
- If you want to filter on multiple values of a categorical variable, the easiest way is to use the isin method. This takes in a list of values to filter for.

In [11]:
# Subset for rows in South Atlantic or Mid-Atlantic regions
# south_mid_atlantic =homelessness[(homelessness["region"] == "South Atlantic") | (homelessness["region"]=="Mid-Atlantic")]

south_mid_atlantic = homelessness[homelessness["region"].isin(["South Atlantic","Mid-Atlantic"])]
print(south_mid_atlantic)

            region                 state  individuals  family_members  \
7   South Atlantic              Delaware        708.0           374.0   
8   South Atlantic  District of Columbia       3770.0          3134.0   
9   South Atlantic               Florida      21443.0          9587.0   
10  South Atlantic               Georgia       6943.0          2556.0   
20  South Atlantic              Maryland       4914.0          2230.0   
30    Mid-Atlantic            New Jersey       6048.0          3350.0   
32    Mid-Atlantic              New York      39827.0         52070.0   
33  South Atlantic        North Carolina       6451.0          2817.0   
38    Mid-Atlantic          Pennsylvania       8163.0          5349.0   
40  South Atlantic        South Carolina       3082.0           851.0   
46  South Atlantic              Virginia       3928.0          2047.0   
48  South Atlantic         West Virginia       1021.0           222.0   

    state_pop  
7      965479  
8      701547  
9 

In [12]:
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]

# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
print(mojave_homelessness)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
4    Pacific  California     109008.0         20964.0   39461588
28  Mountain      Nevada       7058.0           486.0    3027341
44  Mountain        Utah       1904.0           972.0    3153550


# Adding a new column
- Creating and adding new columns can go by many names, including mutating a DataFrame, transforming a DataFrame, and feature engineering.
- On the left-hand side of the equals, we use square brackets with the name of the new column we want to create.
- On the right-hand side, we have the calculation.
- Notice that both the existing column and the new column we just created are in the DataFrame

In [13]:
# Add total col as sum of individuals and family_members
homelessness["total"] = (homelessness["individuals"])+(homelessness["family_members"])

# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = (homelessness["individuals"])/(homelessness["total"])

print(homelessness.head())

               region       state  individuals  family_members  state_pop  \
0  East South Central     Alabama       2570.0           864.0    4887681   
1             Pacific      Alaska       1434.0           582.0     735139   
2            Mountain     Arizona       7259.0          2606.0    7158024   
3  West South Central    Arkansas       2280.0           432.0    3009733   
4             Pacific  California     109008.0         20964.0   39461588   

      total  p_individuals  
0    3434.0       0.748398  
1    2016.0       0.711310  
2    9865.0       0.735834  
3    2712.0       0.840708  
4  129972.0       0.838704  


# Combo-attack!

In [14]:
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 *(homelessness["state_pop"]) / (homelessness["individuals"])

# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"]>20]

# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)

# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state","indiv_per_10k"]]

# See the result
print(result)

                   state  indiv_per_10k
24           Mississippi   2.911152e+07
46              Virginia   2.164279e+07
49             Wisconsin   2.119491e+07
16                Kansas   2.017574e+07
22              Michigan   1.916696e+07
0                Alabama   1.901821e+07
13              Illinois   1.884341e+07
15                  Iowa   1.840221e+07
18             Louisiana   1.834524e+07
14               Indiana   1.773172e+07
48         West Virginia   1.767180e+07
35                  Ohio   1.685141e+07
44                  Utah   1.656276e+07
40        South Carolina   1.649629e+07
17              Kentucky   1.631135e+07
34          North Dakota   1.623298e+07
25              Missouri   1.621193e+07
29         New Hampshire   1.620916e+07
33        North Carolina   1.609303e+07
38          Pennsylvania   1.568164e+07
6            Connecticut   1.566456e+07
10               Georgia   1.513918e+07
43                 Texas   1.491154e+07
30            New Jersey   1.469250e+07
