In [1]:
import pandas as pd
import datetime
import plotly.express as px

In [8]:
def wrangle(file,services=10):
    """
    Load the services CSV and return a cleaned DataFrame indexed by date.

    Parameters
    ----------
    file : str, path object or file-like
        Passed straight to ``pd.read_csv``. The data must contain ``date``,
        ``address`` and ``service_description`` columns.
    services : int, default 10
        Number of most frequent ``service_description`` values to keep as-is;
        every other value is replaced with ``"Other"``.

    Returns
    -------
    pandas.DataFrame
        Rows with a missing date, or without an all-digit zipcode token in the
        address, are dropped. Two columns are added: ``zipcode`` (int64) and
        ``month`` (``"YYYY-MM"`` strings taken from the index).
    """
    df = pd.read_csv(file, parse_dates=["date"], index_col="date")
    df = df[~df.index.isna()]

    # The zipcode is the third-from-last whitespace token of the address, with
    # any trailing comma stripped. The .str accessors yield NaN for missing or
    # too-short addresses instead of raising.
    zip_tokens = df["address"].str.split().str[-3].str.rstrip(",")
    has_zip = zip_tokens.str.fullmatch(r"\d+", na=False)

    df["zipcode"] = zip_tokens
    # Use a positional mask (dates may repeat in the index) and take an explicit
    # copy so the column assignments below never write into a slice of the
    # frame read above.
    df = df.loc[has_zip.to_numpy()].copy()
    df["zipcode"] = df["zipcode"].astype("int64")

    df["month"] = df.index.strftime("%Y-%m")

    # Collapse everything outside the top-N services in one vectorized step;
    # element-wise chained assignment triggers SettingWithCopyWarning and is
    # not guaranteed to write back to the frame.
    service = df["service_description"]
    top_services = service.value_counts().iloc[:services].index
    df["service_description"] = service.where(service.isin(top_services), "Other")

    return df
# Build the cleaned frame used by the cells below; the second argument is
# wrangle's `services` parameter (how many top service descriptions to keep).
data = wrangle("../data/mock_data.csv",10)
# Print the column / dtype / non-null summary of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 996 entries, 2018-05-14 to 2018-03-27
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   client_id            996 non-null    int64  
 1   full_name            996 non-null    object 
 2   gender               996 non-null    object 
 3   age                  996 non-null    int64  
 4   service_description  996 non-null    object 
 5   address              996 non-null    object 
 6   latitude             996 non-null    float64
 7   longitude            996 non-null    float64
 8   zipcode              996 non-null    int64  
 9   month                996 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 85.6+ KB




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
def service_splits(categories,demographic,norm=False):
    """
    Cross-tabulate a categorical series against a demographic series and
    return the result as a Plotly bar chart.

    Parameters
    ----------
    categories : pandas.Series
        Values that become the rows of the cross-tabulation (e.g. services).
    demographic : pandas.Series
        Values that become the columns of the cross-tabulation (e.g. gender).
    norm : bool, default False
        When False the bars show raw counts. When True each row of the
        cross-tabulation is normalized to sum to 1, so the bars show the
        share of each demographic value within a category.

    Returns
    -------
    The figure object produced by ``px.bar``.
    """
    table = pd.crosstab(categories, demographic,
                        normalize="index" if norm else False)

    # Key the axis/legend labels on the names the crosstab actually carries,
    # so the relabelling also applies when the inputs are not named
    # "service_description" / have no string name.
    return px.bar(table,
                  labels={table.index.name: "Services",
                          "value": "Proportion" if norm else "Counts",
                          table.columns.name: str(table.columns.name).capitalize()})

# Chart service counts split by gender, using the cleaned `data` frame.
services = data["service_description"]
demo = data["gender"]
service_splits(services,demo)

In [6]:
def services_by_age(ages):
    """
    Count how many values of ``ages`` fall into each age bracket and show
    the counts as a pie chart.

    Brackets are <18, 18-24, 25-34, 35-44, 45-54 and 55+; each lower bound
    is inclusive and each upper bound exclusive. The return value is
    whatever the figure's ``show()`` call returns.
    """
    # (label, inclusive lower bound, exclusive upper bound); None = open-ended.
    brackets = (
        ("<18", None, 18),
        ("18-24", 18, 25),
        ("25-34", 25, 35),
        ("35-44", 35, 45),
        ("45-54", 45, 55),
        ("55+", 55, None),
    )

    bracket_counts = {}
    for label, lower, upper in brackets:
        if lower is None:
            in_bracket = ages < upper
        elif upper is None:
            in_bracket = ages >= lower
        else:
            in_bracket = (ages >= lower) & (ages < upper)
        bracket_counts[label] = in_bracket.sum()

    age_groups = pd.Series(bracket_counts)

    pie = px.pie(
        age_groups,
        values=age_groups,
        names=age_groups.index,
        labels={"index": "Age Group"},
        title="Service Counts by Age Group",
    )
    # Show both the percentage and the bracket label on each slice.
    pie.update_traces(textinfo='percent+label')

    return pie.show()

services_by_age(data["age"])

In [7]:
# Count services per month first, then hand the table to the bar chart.
monthly_service_counts = pd.crosstab(data["month"], data["service_description"])

fig = px.bar(
    monthly_service_counts,
    title="Service Frequency by Month",
    labels={
        "month": "Month",
        "value": "Service Counts",
        "service_description": "Services",
    },
)

fig.show()

In [265]:
# pd.Series([i.split(", ")[1] for i in data["address"]])

In [6]:
# Load the per-zipcode service counts that the scatter map cell plots.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the session; renaming it also requires updating the plotting cell that reads it.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the CSV was written with its index; consider index_col=0 after
# confirming nothing relies on that column.
map = pd.read_csv("../data/services_by-zipcode.csv")
map.head()

Unnamed: 0,zipcode,counts,city,latitude,longitude
0,99201,16,Spokane,47.663945,-117.43185
1,99217,12,Spokane,47.707542,-117.33776
2,99021,9,Mead,47.81108,-117.22306
3,99037,8,Spokane Valley,47.643346,-117.19993
4,99202,7,Spokane,47.656692,-117.37899


In [9]:
# Plot one marker per row of the zipcode table, positioned by its
# latitude/longitude columns.
# NOTE(review): newer Plotly releases reportedly deprecate scatter_mapbox in
# favour of scatter_map; confirm against the installed version.
fig = px.scatter_mapbox(map, lat="latitude", lon='longitude',
                     color="city", # column that determines marker color
                     hover_name="zipcode", # column used as the hover title
                     size='counts', # column that determines marker size
                     zoom=10, # initial map zoom level
                     )

# Use the "open-street-map" base style for the map.
fig.update_layout(mapbox_style="open-street-map")
fig.show()