In [2]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

file_path = '/Users/alyssaesca/Downloads/expenditures_and_PIT.csv'
data = pd.read_csv(file_path)

X = data.drop(columns=['Program'])  
y = data['Unsheltered PEH']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def lasso_analysis(X, y, standardized=False):
    if standardized:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

lasso = Lasso(alpha=0.1)  
lasso.fit(X, y)

coef_df = pd.DataFrame({'program': X.columns, 'Coefficient': lasso.coef_})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

top_3 = coef_df.head(3)
bottom_3 = coef_df.tail(3)
    
return top_3, bottom_3

top_3_unstandardized, bottom_3_unstandardized = lasso_analysis(X_train, y_train, standardized=False)
top_3_standardized, bottom_3_standardized = lasso_analysis(X_train, y_train, standardized=True)

print("Top 3 Programs (Unstandardized):\n", top_3_unstandardized)
print("Bottom 3 Programs (Unstandardized):\n", bottom_3_unstandardized)
print("Top 3 Programs (Standardized):\n", top_3_standardized)
print("Bottom 3 Programs (Standardized):\n", bottom_3_standardized)

ValueError: could not convert string to float: 'El Cajon'

In [8]:
# had issues because columns contain non numeric values (city names)
# going to try lasso again using only numeric columns

import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

file_path = '/Users/alyssaesca/Downloads/expenditures_and_PIT.csv'
data = pd.read_csv(file_path)

data = pd.get_dummies(data, columns=['Program'], prefix='', prefix_sep='')

X = data.drop(columns=['Unsheltered PEH', 'City', 'Grantee', 'ExpenditureType'])  
y = data['Unsheltered PEH']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def lasso_analysis(X, y, standardized=False):
    if standardized:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    
    lasso = Lasso(alpha=0.1)  
    lasso.fit(X, y)
    
    coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lasso.coef_})
    coef_df = coef_df.sort_values(by='Coefficient', ascending=False)
    
    program_coef_df = coef_df[coef_df['Feature'].str.contains('|'.join(data.columns[data.columns.str.startswith("Program_")]))]
    
    top_3 = program_coef_df.head(3)
    bottom_3 = program_coef_df.tail(3)
    
    return top_3, bottom_3

# unstandardized data
top_3_unstandardized, bottom_3_unstandardized = lasso_analysis(X_train, y_train, standardized=False)

# standardized data
top_3_standardized, bottom_3_standardized = lasso_analysis(X_train, y_train, standardized=True)

print("Top 3 Programs (Unstandardized):\n", top_3_unstandardized)
print("Bottom 3 Programs (Unstandardized):\n", bottom_3_unstandardized)
print("Top 3 Programs (Standardized):\n", top_3_standardized)
print("Bottom 3 Programs (Standardized):\n", bottom_3_standardized)


Top 3 Programs (Unstandardized):
               Feature  Coefficient
26     service center    38.354318
20           outreach    12.099532
23  rental assistance     5.789037
Bottom 3 Programs (Unstandardized):
                         Feature  Coefficient
10           food and nutrition    -3.878115
29         transitional housing   -24.314196
15  housing navigation services   -58.974972
Top 3 Programs (Standardized):
                    Feature  Coefficient
1                Total PEH   686.554210
2               Population    64.120278
4  Unsheltered Per 100,000    60.730896
Bottom 3 Programs (Standardized):
                         Feature  Coefficient
15  housing navigation services    -5.619289
29         transitional housing    -6.250595
3               PEH Per 100,000   -85.170438


In [14]:
# noticed that it wasn't displaying program names for the standardized data

import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('/Users/alyssaesca/Downloads/expenditures_and_PIT.csv')

data = pd.get_dummies(data, columns=['Program'], prefix='', prefix_sep='')

# making sure analysis focuses on program column
X = data.drop(columns=['City', 'Year', 'Total PEH', 'Unsheltered PEH', 'Population', 'PEH Per 100,000', 
                       'Unsheltered Per 100,000', 'Grantee', 'Amount', 'ExpenditureType'])
y = data['Unsheltered PEH']

feature_names = X.columns

def lasso_analysis(X, y, feature_names, standardized=False):
    if standardized:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    
    lasso = Lasso(alpha=0.1, max_iter=10000)
    lasso.fit(X, y)
    
    coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})
    
    program_coef_df = coef_df[coef_df['Feature'].isin(data.columns) & (coef_df['Coefficient'] != 0)]
    
    program_coef_df = program_coef_df.sort_values(by='Coefficient', ascending=False)
    
    top_3 = program_coef_df.head(3)
    bottom_3 = program_coef_df.tail(3)
    
    return top_3, bottom_3

# unstandardized data
top_3_unstandardized, bottom_3_unstandardized = lasso_analysis(X, y, feature_names, standardized=False)

# standardized data
top_3_standardized, bottom_3_standardized = lasso_analysis(X, y, feature_names, standardized=True)

print("Top 3 Programs (Unstandardized):\n", top_3_unstandardized)
print("Bottom 3 Programs (Unstandardized):\n", bottom_3_unstandardized)
print("Top 3 Programs (Standardized):\n", top_3_standardized)
print("Bottom 3 Programs (Standardized):\n", bottom_3_standardized)


Top 3 Programs (Unstandardized):
                  Feature  Coefficient
23  transitional housing  1918.500143
19          safe parking  1640.630989
20        service center  1461.921088
Bottom 3 Programs (Unstandardized):
                        Feature  Coefficient
10  housing stability services  -143.526207
4           food and nutrition  -151.294030
3               flexible funds  -163.227487
Top 3 Programs (Standardized):
                          Feature  Coefficient
23          transitional housing   363.265519
1              emergency shelter   149.870867
2   family reunification program   142.230361
Bottom 3 Programs (Standardized):
                    Feature  Coefficient
6  homelessness prevention   -13.157850
3           flexible funds   -15.058831
4       food and nutrition   -16.742338


In [15]:
# the coefficients are really high so i want to redo the analysis to make sure its accurate
# also going to try unsheltered per 100,000 as response variable
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('/Users/alyssaesca/Downloads/expenditures_and_PIT.csv')

data = pd.get_dummies(data, columns=['Program'], prefix='', prefix_sep='')

X = data.drop(columns=['City', 'Year', 'Total PEH', 'Unsheltered PEH', 'Population', 
                       'PEH Per 100,000', 'Unsheltered Per 100,000', 'Grantee', 'Amount', 'ExpenditureType'])
y = data['Unsheltered Per 100,000']  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso_cv = LassoCV(cv=5, max_iter=10000, random_state=42)
lasso_cv.fit(X_scaled, y)

feature_names = X.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso_cv.coef_})

program_coef_df = coef_df[coef_df['Coefficient'] != 0].sort_values(by='Coefficient', ascending=False)

top_3 = program_coef_df.head(3)
bottom_3 = program_coef_df.tail(3)

print("Optimal alpha (regularization strength):", lasso_cv.alpha_)
print("Top 3 Programs:\n", top_3)
print("Bottom 3 Programs:\n", bottom_3)


Optimal alpha (regularization strength): 0.43794736930676464
Top 3 Programs:
                          Feature  Coefficient
1              emergency shelter    19.787320
23          transitional housing    11.771687
2   family reunification program    10.990542
Bottom 3 Programs:
               Feature  Coefficient
5   homeless services    -1.693695
3      flexible funds    -2.771360
11      motel voucher    -4.892710


In [16]:
# this looks better but i want to compare standardized and unstandardized data
# also going to try to format the results so they are more clear
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('/Users/alyssaesca/Downloads/expenditures_and_PIT.csv')

data = pd.get_dummies(data, columns=['Program'], prefix='', prefix_sep='')

X = data.drop(columns=['City', 'Year', 'Total PEH', 'Unsheltered PEH', 'Population', 
                       'PEH Per 100,000', 'Unsheltered Per 100,000', 'Grantee', 'Amount', 'ExpenditureType'])
y = data['Unsheltered Per 100,000']  

feature_names = X.columns

# standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso_cv_unstandardized = LassoCV(cv=5, max_iter=10000, random_state=42)
lasso_cv_unstandardized.fit(X, y)

lasso_cv_standardized = LassoCV(cv=5, max_iter=10000, random_state=42)
lasso_cv_standardized.fit(X_scaled, y)

coef_df_unstandardized = pd.DataFrame({
    'Program': feature_names, 
    'Coefficient': lasso_cv_unstandardized.coef_
})
coef_df_unstandardized = coef_df_unstandardized[coef_df_unstandardized['Coefficient'] != 0]
coef_df_unstandardized = coef_df_unstandardized.sort_values(by='Coefficient', ascending=False)

coef_df_standardized = pd.DataFrame({
    'Program': feature_names, 
    'Coefficient': lasso_cv_standardized.coef_
})
coef_df_standardized = coef_df_standardized[coef_df_standardized['Coefficient'] != 0]
coef_df_standardized = coef_df_standardized.sort_values(by='Coefficient', ascending=False)

# top and bottom 3 programs for both standardized & unstandardized
top_3_unstandardized = coef_df_unstandardized.head(3)
bottom_3_unstandardized = coef_df_unstandardized.tail(3)

top_3_standardized = coef_df_standardized.head(3)
bottom_3_standardized = coef_df_standardized.tail(3)

print("Top 3 Programs (Unstandardized):\n", top_3_unstandardized)
print("Bottom 3 Programs (Unstandardized):\n", bottom_3_unstandardized)
print("\nTop 3 Programs (Standardized):\n", top_3_standardized)
print("Bottom 3 Programs (Standardized):\n", bottom_3_standardized)

# formatting the results as tables
def format_output(title, df):
    print(f"{title}")
    print("| Program  | Coefficient |")
    print("|----------|-------------|")
    for _, row in df.iterrows():
        print(f"| {row['Program']} | {row['Coefficient']:.3f} |")
    print("\n")

format_output("Top 3 Programs (Unstandardized)", top_3_unstandardized)
format_output("Bottom 3 Programs (Unstandardized)", bottom_3_unstandardized)
format_output("Top 3 Programs (Standardized)", top_3_standardized)
format_output("Bottom 3 Programs (Standardized)", bottom_3_standardized)


Top 3 Programs (Unstandardized):
                          Program  Coefficient
9    housing navigation services    83.413257
2   family reunification program    81.637574
23          transitional housing    55.348228
Bottom 3 Programs (Unstandardized):
               Program  Coefficient
17  rental assistance    -5.143102
5   homeless services    -6.439767
11      motel voucher   -13.940529

Top 3 Programs (Standardized):
                          Program  Coefficient
1              emergency shelter    19.787320
23          transitional housing    11.771687
2   family reunification program    10.990542
Bottom 3 Programs (Standardized):
               Program  Coefficient
5   homeless services    -1.693695
3      flexible funds    -2.771360
11      motel voucher    -4.892710
Top 3 Programs (Unstandardized)
| Program  | Coefficient |
|----------|-------------|
| housing navigation services | 83.413 |
| family reunification program | 81.638 |
| transitional housing | 55.348 |


Bottom 3