<a href="https://colab.research.google.com/github/Elbereth-Elentari/Book_recommender/blob/master/baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
def read_input(jl_file):
    """Load the labelled part of a JSON-lines catalogue.

    Parameters
    ----------
    jl_file : str, path or file-like
        JSON-lines source handed straight to ``pd.read_json(lines=True)``.

    Returns
    -------
    pandas.DataFrame
        Only the rows whose ``interesting`` field is non-empty, with
        ``interesting`` recoded ('yes' -> 1, 'no' -> 0, anything else left
        untouched) and ``tokens`` collapsed into one space-separated string.
        Missing values in every column are replaced by ``''``.
    """
    catalogue_df = pd.read_json(jl_file, lines=True, orient='records').fillna('')

    # Work on an explicit copy: assigning into a boolean-filtered slice is the
    # pattern that triggers pandas' SettingWithCopyWarning.
    labelled_df = catalogue_df[catalogue_df['interesting'] != ''].copy()

    # Map the labels explicitly instead of relying on DataFrame.replace to
    # downcast the object column; when every label is 'yes'/'no' the result is
    # inferred as an integer column.
    label_codes = {'yes': 1, 'no': 0}
    labelled_df['interesting'] = labelled_df['interesting'].map(
        lambda label: label_codes.get(label, label))

    # Column-wise join (no row-wise apply), which also works on an empty frame.
    labelled_df['tokens'] = labelled_df['tokens'].map(' '.join)
    return labelled_df

In [3]:
def create_datasets(catalogue_df, numeric_repeats=51):
    """Build a dense feature matrix and the label vector from a catalogue frame.

    Parameters
    ----------
    catalogue_df : pandas.DataFrame
        Must provide the columns ``year``, ``pages``, ``tokens`` and
        ``interesting``; ``tokens`` is fed to ``CountVectorizer`` as text.
    numeric_repeats : int, default 51
        How many copies of the scaled ``year``/``pages`` block are appended
        after the token features. The default reproduces the original layout
        (one copy plus fifty more). Presumably the repetition is meant to give
        the two numeric columns more weight next to the token columns --
        TODO confirm the intent.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        Token term-frequency columns followed by ``numeric_repeats`` copies of
        the min-max scaled numeric columns, and the ``interesting`` labels.

    Notes
    -----
    The scaler and the vectorizer are fitted on every row passed in, so any
    train/test or cross-validation split done afterwards has already seen
    statistics from the held-out rows.
    """
    numeric_columns = ['year', 'pages']
    y = catalogue_df['interesting']

    X_scaled = MinMaxScaler().fit_transform(catalogue_df[numeric_columns])

    X_counts = CountVectorizer().fit_transform(catalogue_df['tokens'])
    # use_idf=False: term frequencies only, no inverse-document-frequency weighting.
    X_tokens = TfidfTransformer(use_idf=False).fit_transform(X_counts).toarray()

    # One concatenation instead of re-copying the growing dense matrix on each
    # of fifty loop iterations; the resulting column order is the same.
    X_final = np.hstack([X_tokens] + [X_scaled] * numeric_repeats)
    return X_final, y

In [15]:
def calculate_baselines(catalogue_path='/content/drive/My Drive/Library_catalogue_preprocessed.jl'):
    """Score several baseline classifiers on recall for the 'en' and 'pl' subsets.

    Parameters
    ----------
    catalogue_path : str
        JSON-lines catalogue passed to ``read_input``. Defaults to the Google
        Drive location the notebook was written against.

    Returns
    -------
    pandas.DataFrame
        One row per model name with columns ``param_name``, ``en``,
        ``param_en``, ``pl`` and ``param_pl``. ``en``/``pl`` hold the recall;
        the ``param_*`` columns are only filled for models that declare
        ``grid_values``.

    Notes
    -----
    Models without ``grid_values`` report the mean recall over
    ``cross_val_folds`` cross-validation folds. Models with ``grid_values``
    are tuned by grid search on a training split and report recall on the
    held-out split, so that figure is not a cross-validation mean.
    """
    cross_val_folds = 5
    baselines_df = pd.DataFrame(columns=['param_name', 'en', 'param_en', 'pl', 'param_pl'])

    models = {'KNN':
              {'classifier':KNeighborsClassifier(),
               'grid_values':{'n_neighbors':[3,4,5,6,7]}},
              'logistic regression':
              {'classifier':LogisticRegression(max_iter=4000)},
              'SVC linear':
              {'classifier':SVC(kernel='linear', C=1)},
              'SVC rbf':
              {'classifier':SVC(kernel='rbf', gamma=1)},
              'Naive Bayes multinomial':
              {'classifier':MultinomialNB(alpha=0.1)},
              'Naive Bayes Gaussian':
              {'classifier':GaussianNB()}}

    catalogue = read_input(catalogue_path)

    for language in ['en', 'pl']:
        catalogue_lang = catalogue[catalogue['language'] == language]
        X, y = create_datasets(catalogue_lang)

        for model_name, spec in models.items():
            classifier = spec['classifier']
            grid_values = spec.get('grid_values')

            if grid_values is None:
                fold_scores = cross_val_score(classifier, X, y, cv=cross_val_folds, scoring='recall')
                baselines_df.loc[model_name, language] = fold_scores.mean()
                continue

            # Tune with the same fold count as the untuned models rather than
            # GridSearchCV's own default.
            grid_search = GridSearchCV(classifier, param_grid=grid_values,
                                       scoring='recall', cv=cross_val_folds)
            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
            grid_search.fit(X_train, y_train)

            # Hold-out recall of the refitted best estimator.
            baselines_df.loc[model_name, language] = recall_score(y_test, grid_search.predict(X_test))

            # Only the first grid parameter is recorded in the summary table.
            param_name = next(iter(grid_values))
            baselines_df.loc[model_name, 'param_name'] = param_name
            baselines_df.loc[model_name, f'param_{language}'] = grid_search.best_params_[param_name]

    print(f'Mean recall scores for {cross_val_folds}-fold cross-validation')
    return baselines_df

# Run the comparison; as the cell's last expression, the returned DataFrame is rendered as the table below.
calculate_baselines()

Mean recall scores for 5-fold cross-validation


Unnamed: 0,param_name,en,param_en,pl,param_pl
KNN,n_neighbors,0.526316,3.0,0.347368,3.0
logistic regression,,0.232749,,0.0783322,
SVC linear,,0.385965,,0.279255,
SVC rbf,,0.288889,,0.073069,
Naive Bayes multinomial,,0.526316,,0.550684,
Naive Bayes Gaussian,,0.566082,,0.688927,
