In [1]:
import pandas as pd
import xgboost as xgb

In [2]:
users = pd.read_csv('../data/Users.csv')
books = pd.read_csv('../data/Books.csv')
ratings = pd.read_csv('../data/Ratings.csv')

  books = pd.read_csv('../data/Books.csv')


In [3]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


Let's slice our data to 5k users for demo purposes.

In [6]:
users = users.iloc[5000:10_000]
users.head()

Unnamed: 0,User-ID,Location,Age
5000,5001,"inverness, california, usa",
5001,5002,"milano, milano, italy",
5002,5003,"snohomish, washington, usa",
5003,5004,"valència, país valencià, spain",23.0
5004,5005,"olesa de montserrat, barcelona, spain",17.0


In [7]:
users.isna().sum()

User-ID        0
Location       0
Age         1794
dtype: int64

Let's fill missing values with `0` and convert `user_location` to `user_location_country` by taking only country name from `user_location` string.

In [8]:
users["Age"].fillna(users["Age"].mean(), inplace=True)

In [9]:
users["Location"] = users["Location"].apply(lambda x: x.split(", ")[-1])
users.head()

Unnamed: 0,User-ID,Location,Age
5000,5001,usa,32.034934
5001,5002,italy,32.034934
5002,5003,usa,32.034934
5003,5004,spain,23.0
5004,5005,spain,17.0


In [10]:
loc_counts = users.Location.value_counts()
loc_counts

usa                2298
united kingdom      369
spain               358
canada              328
italy               256
                   ... 
öð¹ú                  1
kazakhstan            1
the philippines       1
cyprus                1
trinidad              1
Name: Location, Length: 151, dtype: int64

Let's save users with only top-5 countries to `data/samples.csv`.

In [11]:
users["Location"] = users["Location"].apply(lambda x: "other" if loc_counts[x] < 256 else x)
users = users[users["Location"] != "other"]
users.head()

Unnamed: 0,User-ID,Location,Age
5000,5001,usa,32.034934
5001,5002,italy,32.034934
5002,5003,usa,32.034934
5003,5004,spain,23.0
5004,5005,spain,17.0


Now we can apply one-hot encoding to `Location` column. We will use `pandas.get_dummies` for that.

In [12]:
users = pd.get_dummies(users, columns=["Location"])
users.head()

Unnamed: 0,User-ID,Age,Location_canada,Location_italy,Location_spain,Location_united kingdom,Location_usa
5000,5001,32.034934,0,0,0,0,1
5001,5002,32.034934,0,1,0,0,0
5002,5003,32.034934,0,0,0,0,1
5003,5004,23.0,0,0,1,0,0
5004,5005,17.0,0,0,1,0,0


In [13]:
users.shape

(3609, 7)

Save rating only for present users

In [14]:
ratings = ratings[ratings["User-ID"].isin(users["User-ID"])]
ratings.shape

(14758, 3)

Let's move to the books dataset.
For demo purposes we will take only 5k books and 1 feature `Year-Of-Publication`.

In [15]:
train_df = pd.merge(ratings, users, on="User-ID")
train_df = pd.merge(books, train_df, on="ISBN")
train_df = train_df[["Year-Of-Publication", "Book-Rating", "Age", "Location_canada", "Location_italy", "Location_spain", "Location_united kingdom", "Location_usa"]]
train_df.columns = ["book_publication_year", "rating", "user_age", "user_in_canada", "user_in_italy", "user_in_spain", "user_in_united_kingdom", "user_in_usa"]
train_df["book_publication_year"] = train_df["book_publication_year"].astype(int)
train_df.head()

Unnamed: 0,book_publication_year,rating,user_age,user_in_canada,user_in_italy,user_in_spain,user_in_united_kingdom,user_in_usa
0,1999,0,31.0,0,0,0,0,1
1,1999,9,49.0,0,0,0,0,1
2,1999,8,53.0,0,0,0,0,1
3,1999,5,32.034934,0,0,0,0,1
4,1999,0,32.034934,0,0,0,0,1


In [16]:
train_df.shape

(13589, 8)

Usually for target people we take only those books that have rating >= 7. So let's do the same.

In [17]:
train_df["liked"] = (train_df["rating"] >= 7).astype(int)
train_df = train_df.drop("rating", axis=1)
train_df.head()

Unnamed: 0,book_publication_year,user_age,user_in_canada,user_in_italy,user_in_spain,user_in_united_kingdom,user_in_usa,liked
0,1999,31.0,0,0,0,0,1,0
1,1999,49.0,0,0,0,0,1,1
2,1999,53.0,0,0,0,0,1,1
3,1999,32.034934,0,0,0,0,1,0
4,1999,32.034934,0,0,0,0,1,0


In [18]:
train_df.liked.value_counts()

0    9264
1    4325
Name: liked, dtype: int64

Applying min-max normalization. It's required for quantization.

In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_df[["book_publication_year", "user_age"]] = scaler.fit_transform(train_df[["book_publication_year", "user_age"]])

Now let's split our data into train and test sets

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop("liked", axis=1), 
    train_df["liked"], 
    stratify=train_df["liked"],
    test_size=0.2,
    random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

Okay... let's create our model

In [21]:
model = xgb.XGBRegressor(
    n_estimators=10,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10)

[0]	validation_0-rmse:0.49271
[1]	validation_0-rmse:0.48659
[2]	validation_0-rmse:0.48132
[3]	validation_0-rmse:0.47704
[4]	validation_0-rmse:0.47371
[5]	validation_0-rmse:0.47118
[6]	validation_0-rmse:0.46892
[7]	validation_0-rmse:0.46721
[8]	validation_0-rmse:0.46591
[9]	validation_0-rmse:0.46480




Let's see how our model performs on test set

In [22]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
y_pred = (y_pred >= 0.5).astype(int)
accuracy_score(y_test, y_pred)

0.6762325239146432

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.95      0.80      1853
           1       0.45      0.08      0.14       865

    accuracy                           0.68      2718
   macro avg       0.57      0.52      0.47      2718
weighted avg       0.61      0.68      0.59      2718



The model is poor, but it's not the point of this project.

Okay, now we can save our model and move to transpilation to Aleo Smart Contracts.
Comeback to [README.md](./README.md) and follow instructions there.

In [24]:
import pickle
pickle.dump(model, open("../artifacts/model.pkl", "wb"))

Also let's save test data for later use

In [25]:
test = X_test.sample(4)
test["liked"] = y_test[test.index]
test.to_csv("../artifacts/test.csv", index=False)