<h1>Investment clustering with K-Means learning algorithm<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import plotly.express as px
from skimpy import skim

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import dash
import dash_bootstrap_components as dbc
from dash.dependencies import Output, Input 
from dash import html, dcc
from dash.exceptions import PreventUpdate

In [2]:
# Load the raw rural-investments records. `thousands=','` tells pandas to treat
# commas inside numeric fields as digit-group separators.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
df = pd.read_csv(
    filepath_or_buffer='/Users/alexfil/Desktop/git_hub/dash/friday_plotly/week_30/rural-investments.csv',
    thousands=',',
)

In [3]:
# Peek at the first three raw records to check the columns that were loaded.
df.head(n=3)

Unnamed: 0,Fiscal Year,State Name,County,Congressional District,Program Area,Program,Zip Code,Svi Status,Borrower Name,Project Name,Investment Type,City,Lender Name,Funding Code,NAICS Industry Sector,County FIPS,NAICS Code,NAICS Description,Project Announced Description,Investment Dollars,Number of Investments
0,2024,Delaware,Kent,At-Large,Business Programs,Rural Energy for America Program Renewable Ene...,19904,Socially Vulnerable,905 DOVER LLC,FY23 REAP - 2ND APP,Grant Funds,Dover,Not Applicable,REAP IRA RES GRANT UNRESTRICTED (FY 22/23),Real Estate and Rental and Leasing (53),10001,531120,LESSORS OF NONRESIDENTIAL BUILDINGS (EXCEPT MI...,This Rural Development investment will be used...,62871,1
1,2024,Delaware,Kent,At-Large,Business Programs,Rural Energy for America Program Renewable Ene...,19962,Not Socially Vulnerable,BLUE ORCHID PROPERTIES LLC,FY24 REAP,Grant Funds,Magnolia,Not Applicable,REAP IRA RES GRANT UNRESTRICTED (FY 22/23),Real Estate and Rental and Leasing (53),10001,531210,OFFICES OF REAL ESTATE AGENTS AND BROKERS,This Rural Development Investment will be used...,33534,1
2,2024,Delaware,Kent,At-Large,Business Programs,Rural Energy for America Program Renewable Ene...,19977,Not Socially Vulnerable,"DESTORAGE.COM CLAYTON, LLC",FY23 REAP - 2ND APP,Grant Funds,Smyrna,Not Applicable,REAP IRA RES GRANT UNRESTRICTED (FY 22/23),Other or Not Defined,10001,958160,Not Available,This Rural Development investment will be used...,139950,1


In [4]:
# State-name -> abbreviation lookup table.
# The name column is read with a leading space in its header (' NAME') and its
# values carry stray double quotes, so both are cleaned before the columns are
# renamed to match the 'State Name' key used by the investments data.
states = (
    pd.read_csv('/Users/alexfil/Desktop/git_hub/dash/friday_plotly/week_30/states.csv', sep=',')
    .assign(**{
        ' NAME': lambda frame: frame[' NAME'].str.replace(' "', '').str.replace('"', ''),
    })
    .rename(columns={' NAME': 'State Name', 'ABBREVIATION': 'State Code'})
)

In [5]:
# Keep only the columns the dashboard uses: where the money went, what kind of
# investment it was, and how much. `filter` silently skips any listed column
# that is absent instead of raising.
new_df = df.filter(
    items=['State Name', 'Investment Type', 'Investment Dollars'],
    axis='columns',
)

In [6]:
# Attach the two-letter state code to every investment row. A left join keeps
# rows whose state name has no match in `states`; their 'State Code' is NaN.
new_df = new_df.merge(
    states[['State Name', 'State Code']],
    how='left',
    on='State Name',
)

In [7]:
# Show the first five rows (the default for `head`) to check the merged frame.
new_df.head()

Unnamed: 0,State Name,Investment Type,Investment Dollars,State Code
0,Delaware,Grant Funds,62871,DE
1,Delaware,Grant Funds,33534,DE
2,Delaware,Grant Funds,139950,DE
3,Delaware,Grant Funds,264753,DE
4,Delaware,Grant Funds,206489,DE


In [8]:
# Normalise column labels to snake_case ('State Name' -> 'state_name') so they
# can be referenced directly inside `DataFrame.query` expressions.
new_df.columns = new_df.columns.str.lower().str.replace(' ', '_')

In [9]:
# Show the first five rows (the default for `head`) with the snake_case labels.
new_df.head()

Unnamed: 0,state_name,investment_type,investment_dollars,state_code
0,Delaware,Grant Funds,62871,DE
1,Delaware,Grant Funds,33534,DE
2,Delaware,Grant Funds,139950,DE
3,Delaware,Grant Funds,264753,DE
4,Delaware,Grant Funds,206489,DE


In [20]:
# Dash application styled with the Bootstrap theme.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

# Page structure: title, a row with the two controls, then the clustered map.
app.layout = dbc.Container([
    html.Div([
        html.H1('Rural investments in the US in 2024'),
        html.Br(),
    ]),
    html.Div([
        dbc.Row([
            dbc.Col([
                dbc.Label('Clusters Selection:', className='date-group-labels'),
                # clearable=False: the callback always needs a cluster count,
                # so the user must not be able to empty this control.
                dcc.Dropdown(id='clusters_dropdown',
                             value=2,
                             clearable=False,
                             options=[{'label': cluster, 'value': cluster}
                                      for cluster in range(2, 16)]),
            ]),
            dbc.Col([
                dbc.Label('Investment Type:', className='date-group-labels'),
                # Options are the investment types present in the data; missing
                # values are dropped so NaN never becomes a selectable option.
                dcc.Dropdown(id='type_dropdown',
                             value='Grant Funds',
                             clearable=False,
                             options=[{'label': inv_type, 'value': str(inv_type)}
                                      for inv_type in new_df['investment_type'].dropna().unique()]),
            ]),
        ]),
    ]),
    html.Br(),
    html.Div([
        html.H2('Investments clustering with K-Means learning algorithm'),
        dbc.Row([
            dcc.Graph(id='states_clusters'),
        ]),
    ]),
])

In [21]:
@app.callback(Output('states_clusters', 'figure'),
              Input('clusters_dropdown', 'value'),
              Input('type_dropdown', 'value')
             )
def clustered_map(n_clusters, inv_type):
    """Cluster states by total investment and colour them on a US choropleth.

    Investments of the selected type are summed per state, the totals are
    standardised, and K-Means assigns every state to one cluster. Clustering is
    done on one row per state (not on individual investment records) so that
    each state receives exactly one cluster label on the map.

    Parameters
    ----------
    n_clusters : int or None
        Requested number of clusters from the 'clusters_dropdown' control.
    inv_type : str or None
        Selected value of the 'type_dropdown' control, matched against the
        `investment_type` column of `new_df`.

    Returns
    -------
    plotly.graph_objects.Figure
        Choropleth of US states coloured by cluster label.

    Raises
    ------
    dash.exceptions.PreventUpdate
        If either control is empty or no state has data for `inv_type`; the
        currently displayed figure is then left unchanged.
    """
    # A cleared dropdown delivers None; keep the current figure instead of
    # letting KMeans / query fail inside the callback.
    if n_clusters is None or inv_type is None:
        raise PreventUpdate

    # One row per state. Grouping drops rows whose state_code is missing
    # (state names that found no match in the lookup table), which could not
    # be placed on the map anyway.
    state_totals = (
        new_df.query('investment_type == @inv_type')
        .groupby(['state_code', 'state_name'], as_index=False)['investment_dollars']
        .sum()
    )
    if state_totals.empty:
        raise PreventUpdate

    # KMeans raises if asked for more clusters than there are samples.
    n_clusters = min(int(n_clusters), len(state_totals))

    scaled_totals = StandardScaler().fit_transform(state_totals[['investment_dollars']])
    # Fixed seed and explicit n_init so the same selection always yields the
    # same map, independent of the scikit-learn version's defaults.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(scaled_totals)

    # Labels are cast to strings so plotly treats them as discrete categories.
    state_totals['cluster'] = kmeans.labels_.astype(str)

    fig = px.choropleth(state_totals,
                        locations='state_code',
                        locationmode='USA-states',
                        color='cluster',
                        # Numeric legend order ('2' before '10').
                        category_orders={'cluster': [str(label) for label in range(n_clusters)]},
                        labels={'cluster': 'Clusters',
                                'investment_dollars': 'Investment Dollars'},
                        scope='usa',
                        hover_name='state_name',
                        hover_data={'investment_dollars': ':,.0f'},
                        height=800,
                        title=f'Number of states clusters: {n_clusters}<br>Inertia: {kmeans.inertia_:,.2f}',
                        color_discrete_sequence=px.colors.qualitative.T10
                       )
    fig.update_geos(showframe=False,
                    landcolor='white',
                    bgcolor='whitesmoke',
                    countrycolor='gray',
                    coastlinecolor='gray')
    fig.update_layout(paper_bgcolor='whitesmoke')
    return fig

In [22]:
# Start the development server on localhost:8000 when the notebook/script is
# run directly. The auto-reloader is disabled; debug mode stays on.
if __name__ == '__main__':
    # `app.run` is the current entry point; `app.run_server` is deprecated.
    app.run(debug=True, port=8000, host='127.0.0.1', use_reloader=False)

Dash is running on http://127.0.0.1:8000/

Dash is running on http://127.0.0.1:8000/

Dash is running on http://127.0.0.1:8000/

Dash is running on http://127.0.0.1:8000/

 * Serving Flask app '__main__'
 * Debug mode: on
