<a href="https://colab.research.google.com/github/Elbereth-Elentari/Book_recommender/blob/master/baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [2]:
def read_input(jl_file):
    """Load the labelled part of a JSON-lines catalogue.

    Parameters
    ----------
    jl_file : str or path-like
        JSON-lines file with one record per line. Each record is expected to
        provide an ``interesting`` field and a ``tokens`` field holding a
        sequence of strings.

    Returns
    -------
    pandas.DataFrame
        Only the rows whose ``interesting`` field is non-empty, with

        * ``interesting`` recoded ``'yes'`` -> 1 and ``'no'`` -> 0 (any other
          value is passed through unchanged), and
        * ``tokens`` collapsed from a sequence into one space-separated string.
    """
    label_codes = {'yes': 1, 'no': 0}

    # Missing values become '' so unlabelled rows can be filtered with a
    # plain string comparison below.
    catalogue_df = pd.read_json(jl_file, lines=True, orient='records').fillna('')

    # Take an explicit copy: the column assignments below must not act on a
    # view of the unfiltered frame (avoids SettingWithCopyWarning).
    catalogue_df = catalogue_df[catalogue_df['interesting'] != ''].copy()

    # Element-wise mapping yields an integer column when every label is
    # 'yes'/'no', without depending on DataFrame.replace downcasting.
    catalogue_df['interesting'] = catalogue_df['interesting'].map(
        lambda label: label_codes.get(label, label)
    )

    # Series.map also behaves for an empty frame, where a row-wise
    # DataFrame.apply would hand back a DataFrame instead of a Series.
    catalogue_df['tokens'] = catalogue_df['tokens'].map(' '.join)
    return catalogue_df

In [3]:
def create_datasets(catalogue_df, numeric_repeats=51):
    """Turn a catalogue frame into a dense feature matrix and a label vector.

    Parameters
    ----------
    catalogue_df : pandas.DataFrame
        Must provide the columns ``year``, ``pages``, ``tokens`` (one string
        per row) and ``interesting``.
    numeric_repeats : int, optional
        How many copies of the scaled ``[year, pages]`` pair are appended
        after the text features. The default of 51 reproduces the original
        layout (one copy plus fifty more).

    Returns
    -------
    X_final : numpy.ndarray
        Row-normalised term-frequency columns (``use_idf=False``) followed by
        ``numeric_repeats`` copies of the min-max scaled numeric columns.
    y : pandas.Series
        The ``interesting`` column of ``catalogue_df``.
    """
    # NOTE(review): the scaler and the vectorizer are fitted on every row
    # passed in, so when the result is fed to cross-validation the held-out
    # folds have already influenced the preprocessing.
    y = catalogue_df['interesting']

    X_scaled = MinMaxScaler().fit_transform(catalogue_df[['year', 'pages']])

    X_counts = CountVectorizer().fit_transform(catalogue_df['tokens'])
    X_text = TfidfTransformer(use_idf=False).fit_transform(X_counts).toarray()

    # Build the repeated numeric columns in one go instead of re-copying the
    # whole matrix with hstack on every loop iteration.
    X_numeric = np.tile(X_scaled, (1, numeric_repeats))
    X_final = np.hstack((X_text, X_numeric))

    return X_final, y

In [10]:
def calculate_baselines(catalogue_path='/content/drive/My Drive/Library_catalogue_preprocessed.jl',
                        languages=('en', 'pl'),
                        cross_val_folds=5):
    """Cross-validate a fixed set of baseline classifiers per language.

    Parameters
    ----------
    catalogue_path : str, optional
        JSON-lines catalogue handed to ``read_input``. Defaults to the
        Google Drive location the notebook was originally run against.
    languages : iterable of str, optional
        Values of the ``language`` column to evaluate separately; each one
        becomes a column of the result.
    cross_val_folds : int, optional
        Number of folds passed to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        Mean recall per model (rows) and language (columns), stored as floats.
    """
    languages = list(languages)

    models = {'KNN': KNeighborsClassifier(n_neighbors=14),
              'logistic regression': LogisticRegression(max_iter=4000),
              'SVC linear': SVC(kernel='linear', C=1),
              'SVC rbf': SVC(kernel='rbf', gamma=1),
              'Naive Bayes multinomial': MultinomialNB(alpha=0.1),
              'Naive Bayes Gaussian': GaussianNB()}

    catalogue = read_input(catalogue_path)

    # Collect the scores first and build the frame once, so the columns end
    # up as float64 rather than object cells filled one at a time.
    recall_by_language = {}
    for language in languages:
        catalogue_lang = catalogue[catalogue['language'] == language]
        X, y = create_datasets(catalogue_lang)

        recall_by_language[language] = {
            name: np.mean(cross_val_score(estimator, X, y,
                                          cv=cross_val_folds, scoring='recall'))
            for name, estimator in models.items()
        }

    baselines_df = pd.DataFrame(recall_by_language,
                                index=list(models),
                                columns=languages,
                                dtype=float)

    print(f'Mean recall scores for {cross_val_folds}-fold cross-validation')
    return baselines_df

# Last expression of the cell: the returned DataFrame of mean recall scores
# is rendered as the cell output.
calculate_baselines()

Mean recall scores for 5-fold cross-validation


Unnamed: 0,en,pl
KNN,0.344444,0.0650718
logistic regression,0.232749,0.0783322
SVC linear,0.385965,0.279255
SVC rbf,0.288889,0.073069
Naive Bayes multinomial,0.526316,0.550684
Naive Bayes Gaussian,0.566082,0.688927
