<a href="https://colab.research.google.com/github/Elbereth-Elentari/Book_recommender/blob/master/baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score

In [5]:
def read_input(jl_file):
    """Load the labelled rows of a preprocessed catalogue stored as JSON Lines.

    Parameters
    ----------
    jl_file : str or path-like
        Location of the JSON Lines file (one record per line). The records
        must provide at least the ``interesting`` and ``tokens`` fields.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``interesting`` field is not empty, with
        ``interesting`` recoded (``'yes'`` -> 1, ``'no'`` -> 0, any other
        value left untouched) and ``tokens`` joined into a single
        space-separated string.
    """
    label_codes = {'yes': 1, 'no': 0}

    # Missing values become '' so that unlabelled rows can be filtered out below.
    catalogue_df = pd.read_json(jl_file, lines=True, orient='records').fillna('')

    # Work on an explicit copy: the column assignments below must not write
    # into a view of the unfiltered frame.
    catalogue_df = catalogue_df[catalogue_df['interesting'] != ''].copy()

    catalogue_df['interesting'] = catalogue_df['interesting'].map(
        lambda label: label_codes.get(label, label)
    )
    catalogue_df['tokens'] = catalogue_df['tokens'].map(' '.join)
    return catalogue_df

In [23]:
def create_datasets(catalogue_df, numeric_copies=51):
    """Split the catalogue and build dense train/test feature matrices.

    The frame is split with ``train_test_split(random_state=0)``. Every
    transformer (``MinMaxScaler`` for ``year``/``pages``, ``CountVectorizer``
    and ``TfidfTransformer(use_idf=False)`` for ``tokens``) is fitted on the
    training part only and then applied to the test part.

    Parameters
    ----------
    catalogue_df : pandas.DataFrame
        Must contain the columns ``year``, ``pages``, ``tokens`` and
        ``interesting``.
    numeric_copies : int, default 51
        How many times the scaled ``year``/``pages`` columns are appended
        after the token columns. The default reproduces the previous
        behaviour (one initial copy plus fifty more).
        NOTE(review): the duplication presumably boosts the weight of the two
        numeric features against the many token columns - confirm.

    Returns
    -------
    tuple
        ``(X_final, X_final_test, y_train, y_test)`` where the two matrices
        are dense numpy arrays laid out as token columns followed by
        ``numeric_copies`` repetitions of the scaled numeric columns.

    Raises
    ------
    ValueError
        If ``numeric_copies`` is negative.
    """
    if numeric_copies < 0:
        raise ValueError('numeric_copies must be non-negative')

    numeric_columns = ['year', 'pages']

    X = catalogue_df[['year', 'pages', 'tokens']]
    y = catalogue_df['interesting']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Numeric features: scaling range is learned from the training rows only.
    scaler = MinMaxScaler().fit(X_train[numeric_columns])
    X_train_scaled = scaler.transform(X_train[numeric_columns])
    X_test_scaled = scaler.transform(X_test[numeric_columns])

    # Text features: vocabulary is learned from the training rows only.
    cvect = CountVectorizer()
    X_train_counts = cvect.fit_transform(X_train['tokens'])
    X_test_counts = cvect.transform(X_test['tokens'])

    tfidf = TfidfTransformer(use_idf=False)
    X_train_array = tfidf.fit_transform(X_train_counts).toarray()
    X_test_array = tfidf.transform(X_test_counts).toarray()

    # A single tile + hstack yields the same column layout as appending the
    # scaled block in a loop, without re-copying the growing matrix each time.
    X_final = np.hstack(
        (X_train_array, np.tile(X_train_scaled, (1, numeric_copies)))
    )
    X_final_test = np.hstack(
        (X_test_array, np.tile(X_test_scaled, (1, numeric_copies)))
    )
    return X_final, X_final_test, y_train, y_test

In [40]:
def knn(X_train, X_test, y_train, y_test):
    """Pick the best k-nearest-neighbours model by recall and return its F1 score.

    Fits a ``KNeighborsClassifier`` for every k from 1 to 14, keeps the one
    with the highest recall on ``(X_test, y_test)`` (the smallest k wins
    ties), prints ``'KNN'`` followed by that model's confusion matrix, and
    returns its F1 score on the same data.

    NOTE(review): k is selected on the same data the score is reported on,
    so the returned F1 is optimistically biased; consider a separate
    validation split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Binary labels matching the feature matrices.

    Returns
    -------
    float
        F1 score of the selected model on ``(X_test, y_test)``.
    """
    best_model = None
    # Start below any achievable recall so that k=1 is kept even when every
    # candidate scores 0; otherwise no valid k would ever be selected.
    best_recall = -1.0

    for k in range(1, 15):
        candidate = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
        recall = recall_score(y_test, candidate.predict(X_test))
        if recall > best_recall:
            best_recall = recall
            best_model = candidate

    # Reuse the already fitted winner and predict once for both reports.
    predictions = best_model.predict(X_test)
    print('KNN')
    print(confusion_matrix(y_test, predictions))
    return f1_score(y_test, predictions)

In [15]:
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a default ``LogisticRegression`` and return its accuracy on the test data.

    Uses ``accuracy_score`` from ``sklearn.metrics``, which must be imported
    in the notebook's import cell for this to run on a fresh kernel.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Labels matching the feature matrices.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test, y_test)``.
    """
    classifier = LogisticRegression().fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [21]:
def SVC_linear(X_train, X_test, y_train, y_test):
    """Fit an ``SVC`` with a linear kernel (``C=1``) and return its test accuracy.

    Uses ``accuracy_score`` from ``sklearn.metrics``, which must be imported
    in the notebook's import cell for this to run on a fresh kernel.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Labels matching the feature matrices.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test, y_test)``.
    """
    classifier = SVC(kernel='linear', C=1).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [22]:
def SVC_rbf(X_train, X_test, y_train, y_test):
    """Fit an ``SVC`` with an RBF kernel (``gamma=1``) and return its test accuracy.

    Uses ``accuracy_score`` from ``sklearn.metrics``, which must be imported
    in the notebook's import cell for this to run on a fresh kernel.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Labels matching the feature matrices.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test, y_test)``.
    """
    classifier = SVC(kernel='rbf', gamma=1).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [25]:
def Naive_Bayes_multinomial(X_train, X_test, y_train, y_test):
    """Fit a ``MultinomialNB`` (``alpha=0.1``) and return its test accuracy.

    Uses ``accuracy_score`` from ``sklearn.metrics``, which must be imported
    in the notebook's import cell for this to run on a fresh kernel.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Labels matching the feature matrices.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test, y_test)``.
    """
    classifier = MultinomialNB(alpha=0.1).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [27]:
def Naive_Bayes_Gaussian(X_train, X_test, y_train, y_test):
    """Fit a default ``GaussianNB`` and return its accuracy on the test data.

    Uses ``accuracy_score`` from ``sklearn.metrics``, which must be imported
    in the notebook's import cell for this to run on a fresh kernel.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Labels matching the feature matrices.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test, y_test)``.
    """
    classifier = GaussianNB().fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [41]:
def calculate_baselines():
    """Score every baseline classifier separately on the 'en' and 'pl' records.

    Reads the catalogue through ``read_input``, and for each language prints
    the language code, builds the train/test matrices with
    ``create_datasets`` and calls each model function on them.

    Note that the scores are not all the same metric: the ``KNN`` entry is
    the F1 score returned by ``knn``, while the other model functions return
    accuracy.

    Returns
    -------
    pandas.DataFrame
        One row per model name and one column per language, holding the
        score returned by the corresponding model function.
    """
    classifiers = {
        'KNN': knn,
        'logistic regression': logistic_regression,
        'SVC linear': SVC_linear,
        'SVC rbf': SVC_rbf,
        'Naive Bayes multinomial': Naive_Bayes_multinomial,
        'Naive Bayes Gaussian': Naive_Bayes_Gaussian,
    }
    scores = pd.DataFrame(columns=['en', 'pl'])
    library = read_input('/content/drive/My Drive/Library_catalogue_preprocessed.jl')

    for lang_code in ['en', 'pl']:
        print(lang_code)
        is_lang = library['language'] == lang_code
        splits = create_datasets(library[is_lang])
        for label, fit_and_score in classifiers.items():
            # Assigning through .loc adds the model's row on first use.
            scores.loc[label, lang_code] = fit_and_score(*splits)

    return scores

# Run all baselines for both languages; the returned score table is the cell's displayed output.
calculate_baselines()

en
KNN
[[62 10]
 [ 8 11]]
pl
KNN
[[231  39]
 [ 50  45]]


Unnamed: 0,en,pl
KNN,0.55,0.502793
logistic regression,0.846154,0.772603
SVC linear,0.857143,0.808219
SVC rbf,0.879121,0.764384
Naive Bayes multinomial,0.824176,0.791781
Naive Bayes Gaussian,0.78022,0.745205
