In [317]:
import pandas as pd

# Simple Content based recommender

In [321]:
data = pd.read_csv("geoplaces2.csv", sep=";")
data.head()

Unnamed: 0,placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,...,alcohol,smoking_area,dress_code,accessibility,price,url,ambience,franchise,area,other_services
0,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,no_alcohol,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
1,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,no_alcohol,none,informal,completely,low,?,familiar,f,open,none
2,135106,22.149709,-100.976093,0101000020957F0000649D6F21634858C119AE9BF528A3...,El Rincón de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,...,wine_beer,bar,informal,partially,medium,?,familiar,f,open,none
3,132667,23.752697,-99.163359,0101000020957F00005D67BCDDED8157C1222A2DC8D84D...,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,...,no_alcohol,none,informal,completely,low,?,familiar,t,closed,none
4,132613,23.752903,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E...,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,...,no_alcohol,permitted,informal,completely,medium,?,familiar,t,closed,none


In [322]:
# Remove unwanted features
data = data[["placeID", "alcohol", "smoking_area", "dress_code", "price", "ambience"]]
data

Unnamed: 0,placeID,alcohol,smoking_area,dress_code,price,ambience
0,134999,no_alcohol,none,informal,medium,familiar
1,132825,no_alcohol,none,informal,low,familiar
2,135106,wine_beer,bar,informal,medium,familiar
3,132667,no_alcohol,none,informal,low,familiar
4,132613,no_alcohol,permitted,informal,medium,familiar
...,...,...,...,...,...,...
125,132866,no_alcohol,none,informal,medium,familiar
126,135072,no_alcohol,none,informal,medium,familiar
127,135109,wine_beer,none,informal,medium,quiet
128,135019,no_alcohol,none,informal,low,familiar


Looking at the data, I believe the different values can be well ordered in the following way:
### Smoking
none < area < bar < permitted
### Dress code
casual < informal < formal
### Alcohol
no < limited < full bar
# Price
low < medium < high

Ambience will also be label encoded. Ambience should be one hot encoded, since there is no real order in ambiances but there are only 2 so it doesn't matter. Difference will be either 0 or 1 in any case.

In [323]:
from sklearn.preprocessing import LabelEncoder

# One hot encoding for smoking area
data.join(pd.get_dummies(data["smoking_area"], prefix='smoking_area', columns=["no_smoking"]))

# Label encoding for alcohol
alcoholEncoder = LabelEncoder()
alcoholEncoder.fit(["no_alcohol", "wine_beer", "full_bar"])
data["alcohol"] = alcoholEncoder.transform(data["alcohol"])

# Label encoding for smoking area
smokingEncoder = LabelEncoder()
smokingEncoder.fit(["none", "area", "bar", "permitted"])
data["smoking_area"] = smokingEncoder.transform(data["smoking_area"])

# Label encoding for dress code
dressEncoder = LabelEncoder()
dressEncoder.fit(["casual", "informal", "formal"])
data["dress_code"] = dressEncoder.transform(data["dress_code"])

# Label encoding for price
priceEncoder = LabelEncoder()
priceEncoder.fit(["low", "medium", "high"])
data["price"] = priceEncoder.transform(data["price"])

# Label encoding for ambience
ambienceEncoder = LabelEncoder()
ambienceEncoder.fit(["quiet", "familiar"])
data["ambience"] = ambienceEncoder.transform(data["ambience"])

data

Unnamed: 0,placeID,alcohol,smoking_area,dress_code,price,ambience
0,134999,1,2,2,2,0
1,132825,1,2,2,1,0
2,135106,2,1,2,2,0
3,132667,1,2,2,1,0
4,132613,1,3,2,2,0
...,...,...,...,...,...,...
125,132866,1,2,2,2,0
126,135072,1,2,2,2,0
127,135109,2,2,2,2,1
128,135019,1,2,2,1,0


In [324]:
def find_similar_places(df, place_id, max_error):
    matches = []
    # Get first row with matching placeID
    target_row = df[df['placeID'] == place_id].iloc[0]
    for index, row in df.iterrows():

        # Assert that everything is the same length
        if len(row) != len(target_row):
            return []

        # Skip self
        if row["placeID"] == place_id:
            continue

        sigma = 0
        # Add values
        for idx, dp in enumerate(target_row):
            # Skip placeID
            if idx == 0:
                continue

            error = abs(target_row[idx] - row[idx])
            sigma += error

        # Append rows that do not differ
        if sigma < max_error:
            matches.append(row)

    return matches

In [325]:
placeId = 134999
y = find_similar_places(data, placeId, 1)
print("Found", len(y), "similar places")

Found 34 similar places


In [326]:
print("If you liked", placeId, "you may like one of the following places:")
print('\n'.join(str(x["placeID"]) for x in y))

If you liked 134999 you may like one of the following places:
135082
135070
135086
135042
135025
132954
135046
132869
135063
135000
135034
135060
135081
132845
135011
135016
135044
132717
132861
132706
132872
132733
132755
135043
135049
135075
135062
135058
132754
132584
135085
132626
132866
135072
