In [2]:
import nltk, gensim, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from scipy import stats
from gensim import parsing

# Apply seaborn's 'talk' plotting context to figures drawn later in the notebook
sns.set_context('talk')

#### Before building an unsupervised machine learning model on the full feature set, we can already generate content-based recommendations from the vectorized text of the 'BODY' column alone.

In [8]:
# Load the wrangled craigslist listings and keep a reproducible 10,000-row sample
# (fixed random_state). reset_index(drop=True) renumbers the sampled rows from 0 so
# that index labels coincide with row positions.
# Forward slashes keep the relative path valid on Windows, macOS and Linux alike;
# the previous '..\\data\\...' form only resolved on Windows.
df = (
    pd.read_csv('../data/wrangled_data.csv')
    .sample(10000, random_state=33)
    .reset_index(drop=True)
)

In [9]:
# Preview the first rows of the sample to check that the columns loaded as expected
df.head()

Unnamed: 0,LISTING_ID,BODY,PRICE,AREA_SQFT,ALLOWS_CATS,ALLOWS_DOGS
0,7243962319,Move-In Special! No Security Deposit with enro...,1800,930.0,1.0,1.0
1,7184667893,"campus, or fun places to shop and dine. Locate...",1700,1259.0,1.0,1.0
2,7216782594,Luxurious Apartment Building Im talking Ferrar...,2050,850.0,1.0,1.0
3,7102358949,SPECIAL! $500 security deposit with your good ...,1800,930.0,0.0,0.0
4,7093981324,PROPERTY INFO ID: 176506860Rent: 2849 / MonthB...,2849,930.0,1.0,1.0


In [10]:
# Lookup table: LISTING_ID -> index label of that listing in df (labels equal row
# positions because df was re-indexed from 0 after sampling).
# NOTE(review): assumes LISTING_ID values are unique within the sample; a duplicated
# ID would make indices[id] return a Series instead of a single position - TODO confirm.
indices = pd.Series(df.index, index=df['LISTING_ID'])

In [11]:
def get_recommendations(id, matrix, indices, number, data=None):
    """Return the `number` listings most similar to the listing `id`.

    Parameters
    ----------
    id : hashable
        Listing identifier; must be a key of `indices`.
    matrix : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of
        listing ``i`` to every listing, in the same row order as `data`.
    indices : mapping or pandas.Series
        Maps a listing identifier to its row position in `matrix` / `data`.
    number : int
        Maximum number of recommendations to return.
    data : pandas.DataFrame, optional
        Frame the recommendations are taken from. Defaults to the
        notebook-level ``df`` so existing four-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Up to `number` rows of `data`, ordered from most to least similar.
        The queried listing itself is never included.

    Raises
    ------
    KeyError
        If `id` is not present in `indices`.
    """
    # Fall back to the notebook's global frame when no frame is passed in.
    listings = df if data is None else data

    index = indices[id]

    # Drop the query row by position *before* ranking. Relying on it being the
    # top-ranked entry is unsafe: an exact duplicate listing also scores 1.0,
    # and because the sort is stable a duplicate with a lower row position
    # would rank first, causing the query itself to be returned.
    sim_scores = [
        (position, score)
        for position, score in enumerate(matrix[index])
        if position != index
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    listing_indices = [position for position, _ in sim_scores[:number]]

    return listings.iloc[listing_indices]

In [12]:
# Run gensim's default preprocessing filters over every listing body, which yields
# one list of tokens per document, then glue each token list back into a single
# space-separated string so the scikit-learn vectorizer can consume it.
preprocessed_text = [
    ' '.join(tokens)
    for tokens in parsing.preprocess_documents(df.BODY)
]

In [13]:
# Build TF-IDF features from the preprocessed listing text:
#   stop_words='english' - drop scikit-learn's built-in English stop words
#   min_df=2             - ignore terms that appear in fewer than 2 documents
#   max_df=0.7           - ignore terms that appear in more than 70% of documents
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', min_df = 2, max_df = 0.7)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_text)
# Shape is (number of listings, number of retained terms)
tfidf_matrix.shape

(10000, 7179)

In [14]:
# Pairwise similarity between all listings. TfidfVectorizer L2-normalises each row by
# default (no `norm` override is passed above), so the plain dot product computed by
# linear_kernel is the cosine similarity.
# NOTE(review): the result is a dense n_listings x n_listings array (10000 x 10000
# here), so memory grows quadratically with the sample size.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Peek at the first five rows only rather than displaying the whole matrix
cosine_sim[:5]

array([[1.        , 0.41111905, 0.05760425, ..., 0.07819667, 0.02461539,
        0.08991901],
       [0.41111905, 1.        , 0.06795537, ..., 0.069312  , 0.04847409,
        0.15265568],
       [0.05760425, 0.06795537, 1.        , ..., 0.05906828, 0.05608127,
        0.05188825],
       [0.42124283, 0.9781539 , 0.08144368, ..., 0.07986837, 0.05188047,
        0.15536391],
       [0.07892397, 0.09687006, 0.06389695, ..., 0.151926  , 0.09125714,
        0.09710242]])

In [22]:
# Draw a few random listings to pick a LISTING_ID to query.
# NOTE(review): no random_state is passed, so the rows shown change on every run.
df.sample(5)

Unnamed: 0,LISTING_ID,BODY,PRICE,AREA_SQFT,ALLOWS_CATS,ALLOWS_DOGS
6786,7137682740,"PROPERTY INFO ID: 159878111Rent: $2,168 / Mont...",2168,930.0,1.0,1.0
5709,7139944097,"PROPERTY INFO ID: 180181693Rent: $5,509 / Mont...",5509,930.0,1.0,1.0
8562,7132823884,"Location: Northampton St., Boston (South End) ...",3995,930.0,1.0,0.0
7846,7202598332,Looking for someone to take my spot in this HU...,810,930.0,1.0,0.0
7006,7154984615,"PROPERTY INFO ID: 226751235Rent: $1,775 / Mont...",1775,930.0,1.0,0.0


In [23]:
# Show the 10 listings most similar to listing 7202598332.
# The DataFrame is left as the cell's last expression (not wrapped in print) so that
# Jupyter renders it as a single rich table instead of a wrapped plain-text dump.
get_recommendations(7202598332, cosine_sim, indices, 10)

      LISTING_ID                                               BODY  PRICE  \
7407  7177319091  Huge (1150 sq ft + attic/basement), updated 3 ...   1200   
8452  7151640680  Huge room available in Somerville, right next ...   1700   
8077  7151181300  Brighton Center! Newly renovated 4 bedroom 2 b...   3600   
6546  7214324622  Gorgeous 2 bedroom, 1 bath apartment in Teele ...   2800   
4307  7196534791  3BR share is a great cheap option if you're lo...   1700   
1921  7155070264  Large sunny room in the heart of Coolidge corn...    920   
3474  7204187350  Spacious 3 bedroom townhouse style apartment f...   1700   
9     7210842080  Gorgeous 2 floor unit offers 4 bedrooms and 2 ...    925   
1027  7123418152  $2,650.00 per month. Gorgeous 3 bedroom, 2 bat...   2650   
7217  7103316328  Beautiful duplex 2,200SF apartment with 5 bedr...   4550   

      AREA_SQFT  ALLOWS_CATS  ALLOWS_DOGS  
7407     1150.0          1.0          1.0  
8452      930.0          1.0          1.0  
8077     