In [1]:
import pandas as pd

from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC

In [2]:
# Load the reviews CSV; the file's first column is used as the row index.
reviewframe = pd.read_csv(
    './data/all_reviews_no_dupes.csv',
    index_col=[0],
)

In [3]:
# Collapse star ratings into a binary label: 4-5 stars -> 1, 1-3 stars -> -1.
# Any value that is not one of the keys 1..5 becomes NaN under Series.map.
reviewframe['recommend'] = reviewframe['stars'].map(
    {stars: (1 if stars >= 4 else -1) for stars in range(1, 6)}
)

In [4]:
# Keep only the two identifier columns and the binary target.
workframe = reviewframe.loc[:, ['user_id', 'business_id', 'recommend']]

In [5]:
# Discard every row that has a missing value in any remaining column.
workframe = workframe.dropna(axis='index')

In [6]:
# 70/30 random split of the rows in workframe; the fixed seed keeps the
# partition identical from run to run.
train, test = train_test_split(
    workframe,
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

In [7]:
# Features are the (user_id, business_id) pair; the target is the binary label.
X_train, y_train = train[['user_id', 'business_id']], train['recommend']
X_test, y_test = test[['user_id', 'business_id']], test['recommend']

In [8]:
# The category lists are fixed up front from every user/business id present in
# workframe (train and test together), so transforming the test split never
# meets an unknown category.
# The output is left at the encoder's default, a SciPy sparse matrix. The
# explicit `sparse=True` flag was dropped: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, where passing it
# raises TypeError. Relying on the default works on every version.
encoder = OneHotEncoder(
    categories=[
        workframe['user_id'].unique(),
        workframe['business_id'].unique(),
    ],
)

In [9]:
# Fit the encoder on the training features only, then one-hot encode both
# splits with that same fitted encoder.
X_train_encoded = encoder.fit(X_train).transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [10]:
# LinearSVC has no predict_proba of its own; wrapping it in
# CalibratedClassifierCV adds one.
# random_state pins the data shuffling LinearSVC performs during dual
# coordinate descent, so repeated runs produce the same fitted model and the
# same reported score (same seed as the train/test split).
svm = LinearSVC(random_state=42)
# The estimator is passed positionally because its keyword name differs between
# scikit-learn versions (`base_estimator` was renamed to `estimator` in 1.2).
model = CalibratedClassifierCV(svm)

In [11]:
# Train the calibrated SVM on the encoded training split. fit() returns the
# estimator, so the cell displays its repr.
model.fit(X=X_train_encoded, y=y_train)

CalibratedClassifierCV(base_estimator=LinearSVC())

In [12]:
# Class-probability estimates for the test split: one row per sample, one
# column per class, in the order given by model.classes_.
y_pred = model.predict_proba(X=X_test_encoded)

In [13]:
from sklearn.metrics import roc_auc_score

In [14]:
# ROC AUC on the held-out split, scored with the second probability column
# (the larger class label, i.e. recommend == 1).
print(roc_auc_score(y_true=y_test, y_score=y_pred[:, 1]))

0.7609084510211735
