<a href="https://colab.research.google.com/github/Elbereth-Elentari/Book_recommender/blob/master/baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import recall_score

from IPython.display import clear_output

In [2]:
def read_input(jl_file):
    """Load the labelled part of a preprocessed JSON-lines catalogue.

    Parameters
    ----------
    jl_file : str, path object or file-like
        JSON-lines source passed unchanged to ``pandas.read_json``. The
        records are expected to provide an ``interesting`` column and a
        ``tokens`` column whose entries are iterables of strings.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``interesting`` value is not empty, with
        ``'yes'``/``'no'`` encoded as ``1``/``0`` (any other label is left
        untouched) and each ``tokens`` entry joined into one
        space-separated string. Missing values in every column are
        replaced by ``''`` before filtering.
    """
    catalogue_df = pd.read_json(jl_file, lines=True, orient='records').fillna('')
    # Copy the filtered rows so the column assignments below act on an
    # independent frame rather than on a slice of the unfiltered one.
    catalogue_df = catalogue_df[catalogue_df['interesting'] != ''].copy()

    def encode_label(value):
        # Only the two known labels are translated; anything else passes through.
        if value == 'yes':
            return 1
        if value == 'no':
            return 0
        return value

    # Element-wise mapping yields an integer column when every label is
    # 'yes'/'no', without depending on implicit downcasting in `replace`.
    catalogue_df['interesting'] = catalogue_df['interesting'].map(encode_label)
    # Column-wise map instead of a row-wise apply: same result per row, and it
    # still returns a Series when no rows are left after filtering.
    catalogue_df['tokens'] = catalogue_df['tokens'].map(' '.join)
    return catalogue_df

In [3]:
def create_datasets(catalogue_df):
    """Split the catalogue and turn it into dense feature matrices.

    The frame is split with ``train_test_split(random_state=0)``. The
    ``year`` and ``pages`` columns are min-max scaled and the ``tokens``
    column is converted to normalised term counts (``use_idf=False``); every
    transformer is fitted on the training part only and then applied to the
    test part.

    Parameters
    ----------
    catalogue_df : pandas.DataFrame
        Must contain the columns ``year``, ``pages``, ``tokens`` and
        ``interesting``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where each ``X`` is a dense
        array with the text features first and the two scaled numeric
        columns last, and each ``y`` is the matching ``interesting`` slice.
    """
    numeric_columns = ['year', 'pages']
    features = catalogue_df[numeric_columns + ['tokens']]
    labels = catalogue_df['interesting']
    X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0)

    # Numeric part: the scaling range is learned from the training rows only.
    scaler = MinMaxScaler()
    train_numeric = scaler.fit_transform(X_train[numeric_columns])
    test_numeric = scaler.transform(X_test[numeric_columns])

    # Text part: the vocabulary also comes from the training rows only.
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(X_train['tokens'])
    test_counts = vectorizer.transform(X_test['tokens'])
    normalizer = TfidfTransformer(use_idf=False)
    train_text = normalizer.fit_transform(train_counts).toarray()
    test_text = normalizer.transform(test_counts).toarray()

    # Text columns come first, numeric columns are appended on the right.
    train_matrix = np.hstack((train_text, train_numeric))
    test_matrix = np.hstack((test_text, test_numeric))
    return train_matrix, test_matrix, y_train, y_test

In [46]:
def create_model_dictionary(random_state=0):
    """Build the classifiers to benchmark together with their search grids.

    Parameters
    ----------
    random_state : int or None, default 0
        Seed handed to the estimators built here that accept one (decision
        tree, random forest, gradient boosting, multi-layer perceptron) so
        that repeated runs are comparable. Pass ``None`` to leave them
        unseeded.

    Returns
    -------
    dict
        Maps a display name to ``{'classifier': estimator,
        'grid_values': {param_name: candidate_values}}``. Every grid holds
        a single hyper-parameter.
    """
    models = {
        'KNN': {
            'classifier': KNeighborsClassifier(),
            'grid_values': {'n_neighbors': [2, 3, 4]},
        },
        'logistic regression': {
            'classifier': LogisticRegression(max_iter=1000),
            'grid_values': {'C': [12, 13, 663, 665, 666]},
        },
        'SVC linear': {
            'classifier': SVC(kernel='linear'),
            'grid_values': {'C': [1.93, 3.473]},
        },
        'SVC rbf': {
            'classifier': SVC(kernel='rbf'),
            'grid_values': {'C': [3.26, 6.93]},
        },
        'Naive Bayes multinomial': {
            'classifier': MultinomialNB(),
            'grid_values': {'alpha': [0.0162, 0.0228]},
        },
        'Naive Bayes Gaussian': {
            'classifier': GaussianNB(),
            'grid_values': {'var_smoothing': [0.0343, 0.141]},
        },
        # The spelling of this key is kept exactly as it was: the keys are
        # part of the returned mapping and callers may look them up by name.
        'Decition Tree': {
            'classifier': DecisionTreeClassifier(random_state=random_state),
            'grid_values': {'max_depth': [26, 27, 28, 29, 30]},
        },
        'Random Forest': {
            'classifier': RandomForestClassifier(random_state=random_state),
            'grid_values': {'max_features': [2, 3, 4, 5]},
        },
        'Gradient Boosted Decision Trees': {
            'classifier': GradientBoostingClassifier(n_estimators=29, random_state=random_state),
            'grid_values': {'max_depth': [5, 10, 15, 20]},
        },
        'Multi-Layer Perceptron': {
            'classifier': MLPClassifier(solver='lbfgs', random_state=random_state),
            'grid_values': {'hidden_layer_sizes': [[25, 10, 5], [25, 5], [30, 5], [25, 10], [30, 15, 5]]},
        },
    }
    return models

In [47]:
def calculate_baselines(catalogue_path='/content/drive/My Drive/Library_catalogue_preprocessed.jl',
                        languages=('en', 'pl')):
    """Grid-search every baseline model per language and tabulate recall.

    Parameters
    ----------
    catalogue_path : str, default is the original Google Drive location
        JSON-lines catalogue passed to ``read_input``. The result must have
        a ``language`` column.
    languages : iterable of str, default ``('en', 'pl')``
        Values of the ``language`` column to evaluate, one pair of result
        columns per entry.

    Returns
    -------
    pandas.DataFrame
        One row per model name. ``param_name`` is the first key of the
        model's grid; for each language ``<language>`` holds the recall
        computed on the test split and ``param_<language>`` the selected
        value of that hyper-parameter.
    """
    result_columns = ['param_name']
    for language in languages:
        result_columns += [language, f'param_{language}']
    baselines_df = pd.DataFrame(columns=result_columns)

    models = create_model_dictionary()
    catalogue = read_input(catalogue_path)

    for language in languages:
        print(f'Processing {language}')
        lang_catalogue = catalogue[catalogue['language'] == language]
        X_train, X_test, y_train, y_test = create_datasets(lang_catalogue)

        for model, spec in models.items():
            print(f'Training {model}')
            grid_values = spec['grid_values']
            # Only the first hyper-parameter of each grid is reported.
            param_name = next(iter(grid_values))
            grid_search = GridSearchCV(spec['classifier'], param_grid=grid_values, scoring='recall')
            grid_search.fit(X_train, y_train)
            # The stored score is recall on the test split, not the
            # cross-validation score used to pick the parameter.
            recall = recall_score(y_test, grid_search.predict(X_test))
            baselines_df.loc[model, language] = recall
            baselines_df.loc[model, 'param_name'] = param_name
            baselines_df.loc[model, f'param_{language}'] = grid_search.best_params_[param_name]

    # Drop the progress messages so only the heading and the table remain.
    clear_output()
    print('Mean recall scores')
    return baselines_df

# Run the benchmark; the returned DataFrame is this cell's displayed result.
calculate_baselines()

Mean recall scores


Unnamed: 0,param_name,en,param_en,pl,param_pl
KNN,n_neighbors,0.526316,3,0.410526,3
logistic regression,C,0.421053,12,0.505263,663
SVC linear,C,0.526316,1.93,0.526316,3.473
SVC rbf,C,0.421053,3.26,0.568421,6.93
Naive Bayes multinomial,alpha,0.473684,0.0228,0.431579,0.0162
Naive Bayes Gaussian,var_smoothing,0.894737,0.141,0.8,0.0343
Decition Tree,max_depth,0.578947,28,0.557895,29
Random Forest,max_features,0.842105,2,0.357895,3
Gradient Boosted Decision Trees,max_depth,0.526316,50,0.4,100
Multi-Layer Perceptron,hidden_layer_sizes,0.0,"[25, 10, 5]",0.0736842,"[30, 5]"
