# Recommendation Model for Cityspire

This model will create a list of suggested city_states based on user preferences
1. base model will create list based off of a city_state
  - "Newark, New Jersey"
2. final model will create list based off of user-defined preferences
  - population size
  - crime rate
  - rental rate
  - walk score


## Base Model

In [None]:
import pandas as pd

In [None]:
# Load the merged dataset of predicted values: population, crime rate,
# rental rate and walk score, one row per city_state.
# The CSV is fetched over HTTP from GitHub, so this cell needs network access.
URL = "https://raw.githubusercontent.com/JeffreyAsuncion/LambdaLabs/main/cityspire-a-ds/Datasets_csv/pop_cc_rr_ws.csv"
df = pd.read_csv(URL)
df.head()

Unnamed: 0,city_state,id_num,population,crime_rate,rental_rate,walk_score
0,"El Dorado Hills, California",0,46192.0,46.36,2477.6,12.0
1,"Eldridge, California",1,1208.0,46.36,2477.6,44.5
2,"El Granada, California",2,6695.0,46.36,2477.6,44.5
3,"Elizabeth Lake, California",3,1941.0,46.36,2477.6,44.5
4,"Elk Creek, California",4,239.0,46.36,2477.6,44.5


In [None]:
# Look up the row for Newark by name. In the displayed result the left-most
# value is the DataFrame row label, which is NOT the same as the `id_num` column.
df[df["city_state"] == "Newark, New Jersey"]

Unnamed: 0,city_state,id_num,population,crime_rate,rental_rate,walk_score
18127,"Newark, New Jersey",17089,283945.0,27.4,1466.89,79.0


In [None]:
# Count missing values per column.
# `.sum()` adds up the True cells of the boolean mask; `.count()` would only
# count non-NA cells of the mask itself and therefore always equals the row count.
df.isnull().sum()

city_state     29626
id_num         29626
population     29626
crime_rate     29626
rental_rate    29626
walk_score     29626
dtype: int64

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(29626, 6)

In [None]:
from sklearn.neighbors import NearestNeighbors

# NOTE: `state_id` is matched against the `id_num` column, not the DataFrame
# row label. Newark, New Jersey sits at row label 18127 but its id_num is
# 17089; using 18127 here silently queries a different city.
state_id = 17089  # Newark, New Jersey  283945.0  27.4  1466.89  79.0

# Instantiate and fit knn to the correct columns
NN = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')

# [2:] is to ignore the 1st 2 columns 'city_state', 'id_num'
NN.fit(df[df.columns[2:]])

# take 'state id' as INPUT: row label(s) whose id_num matches
state_index = df.index[df['id_num'] == state_id]

# `state_index` holds row labels, so select with .loc (label-based)
state_features = df.loc[state_index, df.columns[2:]].to_numpy()

dist, indices = NN.kneighbors(state_features)

# kneighbors returns positions within the fitted data, so select with .iloc
recommended_list = list(df['id_num'].iloc[indices[0]])
recommended_list

[18127, 17995, 18362, 6809, 25360, 25290, 24503, 6978, 24321, 12237]

In [None]:
# For every recommended id_num, fetch the matching row(s) as a list of
# record dicts; `results` ends up as a list of those lists.
results = [
    df[df['id_num'] == rec_id].to_dict('records')
    for rec_id in recommended_list
]

In [None]:
results[0]

[{'city_state': 'Garden City South, New York',
  'crime_rate': 22.9,
  'id_num': 18127,
  'population': 4442.0,
  'rental_rate': 1361.25,
  'walk_score': 48.41}]

In [None]:
results[1]

[{'city_state': 'Dansville, New York',
  'crime_rate': 22.9,
  'id_num': 17995,
  'population': 4427.0,
  'rental_rate': 1361.25,
  'walk_score': 48.41}]

In [None]:
results[2]

[{'city_state': 'Manlius, New York',
  'crime_rate': 22.9,
  'id_num': 18362,
  'population': 4463.0,
  'rental_rate': 1361.25,
  'walk_score': 48.41}]

In [None]:
results[3]

[{'city_state': 'Buhl, Idaho',
  'crime_rate': 22.1,
  'id_num': 6809,
  'population': 4446.0,
  'rental_rate': 1343.1,
  'walk_score': 33.77}]

base model as a function

In [None]:
def base_model(state_id: int):
    """Recommend the cities most similar to the one whose ``id_num`` is *state_id*.

    A 10-neighbour ``NearestNeighbors`` model is fit on every column of the
    module-level ``df`` after the first two ('city_state' and 'id_num'), and
    the neighbours of the requested city are looked up.

    Args:
        state_id: value to match against the ``id_num`` column. This is not
            the DataFrame row label.

    Returns:
        A list with one entry per neighbour, nearest first. Each entry is
        itself a list of record dicts (``DataFrame.to_dict('records')``) for
        the row(s) carrying that ``id_num``.

    Raises:
        ValueError: if no row has ``id_num == state_id``.
    """
    # columns 0 and 1 are 'city_state' and 'id_num'; the rest are features
    feature_cols = df.columns[2:]

    # Instantiate and fit knn to the feature columns.
    # NOTE(review): the model is re-fit on every call; fit once outside the
    # function if this becomes a hot path.
    NN = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
    NN.fit(df[feature_cols])

    # row label(s) of the requested city
    state_index = df.index[df['id_num'] == state_id]
    if len(state_index) == 0:
        # fail with a readable message instead of sklearn's empty-array error
        raise ValueError(f"no city with id_num == {state_id!r}")

    # `state_index` holds labels, so select with .loc
    state_features = df.loc[state_index, feature_cols].to_numpy()

    _, indices = NN.kneighbors(state_features)

    # kneighbors returns positions within the fitted data, so select with .iloc
    recommended_list = list(df['id_num'].iloc[indices[0]])

    return [
        df[df['id_num'] == rec_id].to_dict('records')
        for rec_id in recommended_list
    ]

In [None]:
# NOTE: the argument is matched against `id_num`. 18127 is the id_num of
# Garden City South, New York (see the output below); Newark, New Jersey
# has id_num 17089 and row label 18127.
results = base_model(18127)

In [None]:
results

[[{'city_state': 'Garden City South, New York',
   'crime_rate': 22.9,
   'id_num': 18127,
   'population': 4442.0,
   'rental_rate': 1361.25,
   'walk_score': 48.41}],
 [{'city_state': 'Dansville, New York',
   'crime_rate': 22.9,
   'id_num': 17995,
   'population': 4427.0,
   'rental_rate': 1361.25,
   'walk_score': 48.41}],
 [{'city_state': 'Manlius, New York',
   'crime_rate': 22.9,
   'id_num': 18362,
   'population': 4463.0,
   'rental_rate': 1361.25,
   'walk_score': 48.41}],
 [{'city_state': 'Buhl, Idaho',
   'crime_rate': 22.1,
   'id_num': 6809,
   'population': 4446.0,
   'rental_rate': 1343.1,
   'walk_score': 33.77}],
 [{'city_state': 'Whiteville, Tennessee',
   'crime_rate': 7.3,
   'id_num': 25360,
   'population': 4449.0,
   'rental_rate': 1376.53,
   'walk_score': 21.78}],
 [{'city_state': 'Selmer, Tennessee',
   'crime_rate': 42.5,
   'id_num': 25290,
   'population': 4426.0,
   'rental_rate': 1376.53,
   'walk_score': 21.78}],
 [{'city_state': 'Surfside Beach, South

## Recommendation Model with user defined features

### Pickled Model and dataset

In [None]:
import pickle

In [None]:
# Pickle the states dataset so it can be reloaded next to the saved model
with open('states_dataset.pkl', 'wb') as dataset_file:
    pickle.dump(df, dataset_file)

In [None]:
# save the model to disk
filename = 'recommendation_model.sav'
# a context manager guarantees the handle is flushed and closed, even on error
with open(filename, 'wb') as model_file:
    pickle.dump(NN, model_file)

In [None]:
# Reload the fitted model and the dataset it was fit on.
# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook itself produced.
# NOTE(review): the '/content/' prefix presumably matches the notebook's
# working directory where the files were written -- confirm when running elsewhere.
filename = '/content/recommendation_model.sav'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
states_pkl = pd.read_pickle("/content/states_dataset.pkl")


def suggest_state_ids(state_id: int):
    """Recommend cities similar to the one whose ``id_num`` is *state_id*.

    Uses the module-level ``loaded_model`` (the unpickled nearest-neighbour
    model) and ``states_pkl`` (the unpickled dataset).

    Args:
        state_id: value to match against the ``id_num`` column of
            ``states_pkl``. This is not the DataFrame row label; e.g. Newark,
            New Jersey is id_num 17089, not 18127.

    Returns:
        A list with one entry per neighbour, nearest first. Each entry is
        itself a list of record dicts for the row(s) carrying that ``id_num``.

    Raises:
        ValueError: if no row has ``id_num == state_id``.
    """
    # row label(s) of the requested city
    state_index = states_pkl.index[states_pkl['id_num'] == state_id]
    if len(state_index) == 0:
        # fail with a readable message instead of sklearn's empty-array error
        raise ValueError(f"no city with id_num == {state_id!r}")

    # feature columns are everything after 'city_state' and 'id_num';
    # `state_index` holds labels, so select with .loc
    feature_cols = states_pkl.columns[2:]
    state_features = states_pkl.loc[state_index, feature_cols].to_numpy()

    _, indices = loaded_model.kneighbors(state_features)

    # kneighbors returns positions within the fitted data, so select with .iloc
    # (assumes `states_pkl` has the same row order the model was fit on)
    recommended_list = list(states_pkl['id_num'].iloc[indices[0]])

    return [
        states_pkl[states_pkl['id_num'] == rec_id].to_dict('records')
        for rec_id in recommended_list
    ]

In [None]:
# NOTE: the argument is matched against `id_num`. 18127 is the id_num of
# Garden City South, New York (see the output below); Newark, New Jersey
# has id_num 17089.
results = suggest_state_ids(18127)
results

[[{'city_state': 'Garden City South, New York',
   'crime_rate': 22.9,
   'id_num': 18127,
   'population': 4442.0,
   'rental_rate': 1361.25,
   'walk_score': 48.41}],
 [{'city_state': 'Dansville, New York',
   'crime_rate': 22.9,
   'id_num': 17995,
   'population': 4427.0,
   'rental_rate': 1361.25,
   'walk_score': 48.41}],
 [{'city_state': 'Manlius, New York',
   'crime_rate': 22.9,
   'id_num': 18362,
   'population': 4463.0,
   'rental_rate': 1361.25,
   'walk_score': 48.41}],
 [{'city_state': 'Buhl, Idaho',
   'crime_rate': 22.1,
   'id_num': 6809,
   'population': 4446.0,
   'rental_rate': 1343.1,
   'walk_score': 33.77}],
 [{'city_state': 'Whiteville, Tennessee',
   'crime_rate': 7.3,
   'id_num': 25360,
   'population': 4449.0,
   'rental_rate': 1376.53,
   'walk_score': 21.78}],
 [{'city_state': 'Selmer, Tennessee',
   'crime_rate': 42.5,
   'id_num': 25290,
   'population': 4426.0,
   'rental_rate': 1376.53,
   'walk_score': 21.78}],
 [{'city_state': 'Surfside Beach, South

### Modify the base model to use the 4 state features instead of the state id

In [None]:

def suggest_state_ids2(population:float, crime_rate:float, rental_rate:float, walk_score:float):
    """Recommend cities closest to a set of user-defined preferences.

    Uses the module-level ``loaded_model`` (the unpickled nearest-neighbour
    model) and ``states_pkl`` (the unpickled dataset).

    Args:
        population: desired population size.
        crime_rate: desired crime rate.
        rental_rate: desired rental rate.
        walk_score: desired walk score.

    Returns:
        A list with one entry per neighbour, nearest first. Each entry is
        itself a list of record dicts for the row(s) carrying that ``id_num``.
    """
    # One query row, in the column order the model was fit on
    # (population, crime_rate, rental_rate, walk_score). No dummy DataFrame
    # or placeholder id is needed just to build this row.
    # NOTE(review): the features are not scaled, so `population` presumably
    # dominates the distance -- confirm whether that is intended.
    state_features = [[
        float(population),
        float(crime_rate),
        float(rental_rate),
        float(walk_score),
    ]]

    _, indices = loaded_model.kneighbors(state_features)

    # kneighbors returns positions within the fitted data, so select with .iloc
    # (assumes `states_pkl` has the same row order the model was fit on)
    recommended_list = list(states_pkl['id_num'].iloc[indices[0]])

    return [
        states_pkl[states_pkl['id_num'] == rec_id].to_dict('records')
        for rec_id in recommended_list
    ]

In [None]:
# Example query; arguments are (population, crime_rate, rental_rate, walk_score)
a = suggest_state_ids2(10000, 35, 1500, 40)
a

[[{'city_state': 'Fairmont, Minnesota',
   'crime_rate': 18.49,
   'id_num': 14576,
   'population': 10016.0,
   'rental_rate': 1482.9,
   'walk_score': 28.68}],
 [{'city_state': 'Plymouth, Indiana',
   'crime_rate': 22.4,
   'id_num': 8859,
   'population': 10012.0,
   'rental_rate': 1466.89,
   'walk_score': 33.0}],
 [{'city_state': 'Havre, Montana',
   'crime_rate': 36.1,
   'id_num': 15746,
   'population': 10021.0,
   'rental_rate': 1466.89,
   'walk_score': 44.0}],
 [{'city_state': 'Pájaros comunidad, Puerto Rico',
   'crime_rate': 27.71,
   'id_num': 29354,
   'population': 9978.0,
   'rental_rate': 1466.89,
   'walk_score': 35.57}],
 [{'city_state': 'Lakes, Alaska',
   'crime_rate': 35.82,
   'id_num': 1732,
   'population': 9992.0,
   'rental_rate': 1466.89,
   'walk_score': 17.5}],
 [{'city_state': 'Elgin, Texas',
   'crime_rate': 21.83,
   'id_num': 25812,
   'population': 10037.0,
   'rental_rate': 1481.18,
   'walk_score': 30.68}],
 [{'city_state': 'Bonham, Texas',
   'cri