<a href="https://colab.research.google.com/github/Elbereth-Elentari/Book_recommender/blob/master/AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import f1_score

from IPython.display import clear_output

In [2]:
def join_tokens(row):
    return ' '.join(row['tokens'])

In [3]:
def read_input(jl_file):
    catalogue_df = pd.read_json(jl_file, lines=True, orient='records').fillna('')
    catalogue_df = catalogue_df[catalogue_df['interesting'] != '']
    catalogue_df.replace({'interesting':'yes'}, 1, inplace=True)
    catalogue_df.replace({'interesting':'no'}, 0, inplace=True)
    catalogue_df['tokens'] = catalogue_df.apply(join_tokens, axis=1)
    return catalogue_df

In [4]:
def create_datasets(catalogue_df):
    X = catalogue_df[['year', 'pages', 'tokens']]
    y = catalogue_df['interesting']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    scaler = MinMaxScaler().fit(X_train[['year', 'pages']])
    X_train_scaled = scaler.transform(X_train[['year', 'pages']])
    X_test_scaled = scaler.transform(X_test[['year', 'pages']])

    cvect = CountVectorizer()
    X_train_counts = cvect.fit_transform(X_train['tokens'])
    X_test_counts = cvect.transform(X_test['tokens'])
    tfidf = TfidfTransformer(use_idf=False)
    X_train_transformed = tfidf.fit_transform(X_train_counts)
    X_test_transformed = tfidf.transform(X_test_counts)

    X_train_array = X_train_transformed.toarray()
    X_test_array = X_test_transformed.toarray()
    X_train_final = np.hstack((X_train_array, X_train_scaled))
    X_test_final = np.hstack((X_test_array, X_test_scaled))

    return X_train_final, X_test_final, y_train, y_test

In [18]:
def create_model_dictionary():
    models = {
              'KNN':
              {'classifier':KNeighborsClassifier(),
               'grid_values':{'n_neighbors':[3, 5]}},
              'logistic regression':
              {'classifier':LogisticRegression(max_iter=1000),
               'grid_values':{'C':[80, 100, 150, 600, 700, 800]}},
              'SVC linear':
              {'classifier':SVC(kernel='linear'),
               'grid_values':{'C':[2.5, 3, 3.5, 5.5, 6, 6.5]}},
              'SVC rbf':
              {'classifier':SVC(kernel='rbf'),
               'grid_values':{'C':[3.5, 4, 4.5, 6.5, 7, 7.5]}},
              'Naive Bayes multinomial':
              {'classifier':MultinomialNB(),
               'grid_values':{'alpha':[0.0001, 0.0005, 0.001]}},
              'Naive Bayes Gaussian':
              {'classifier':GaussianNB(),
               'grid_values':{'var_smoothing':[0.06, 0.07, 0.8, 0.15, 0.2, 0.4]}},
              'Decition Tree':
              {'classifier':DecisionTreeClassifier(),
               'grid_values':{'max_depth':[60, 70, 80, 400, 500, 600]}},
              'Random Forest':
              {'classifier':RandomForestClassifier(),
               'grid_values':{'max_features':[2, 3, 4]}},
              'Gradient Boosted Decision Trees':
              {'classifier':GradientBoostingClassifier(),
               'grid_values':{'n_estimators':[19, 20, 21, 26, 30, 32]}},
              'Multi-Layer Perceptron':
              {'classifier':MLPClassifier(solver='lbfgs'),
               'grid_values':{'hidden_layer_sizes':[[25,10,5],[25,5],[30,5],[25,10],[30,15,5],[100,5]]}}
              }
    return models

In [19]:
def calculate_f1s():
    models_df = pd.DataFrame(columns=['param_name', 'en', 'param_en', 'pl', 'param_pl'])
    models = create_model_dictionary()
    catalogue = read_input('/content/drive/My Drive/Library_catalogue_preprocessed.jl')

    for language in ['en', 'pl']:
        print(f'Processing {language}')
        lang_catalogue = catalogue[catalogue['language'] == language]
        X_train, X_test, y_train, y_test = create_datasets(lang_catalogue)

        for model in models:
            print(f'Training {model}')
            grid_values = models[model]['grid_values']
            grid_search = GridSearchCV(models[model]['classifier'], param_grid=grid_values, scoring='f1')
            grid_search.fit(X_train, y_train)
            f1 = f1_score(y_test, grid_search.predict(X_test))
            models_df.loc[model, language] = f1
            param_name = list(models[model]['grid_values'].keys())[0]
            models_df.loc[model, 'param_name'] = param_name
            models_df.loc[model, f'param_{language}'] = grid_search.best_params_[param_name]

    clear_output()
    print(f'Mean F1 scores')
    return models_df

In [20]:
models_df = calculate_f1s()
models_df

Mean F1 scores


Unnamed: 0,param_name,en,param_en,pl,param_pl
KNN,n_neighbors,0.555556,5,0.509804,3
logistic regression,C,0.625,100,0.581818,700
SVC linear,C,0.588235,2.5,0.609756,3.5
SVC rbf,C,0.571429,3.5,0.650602,7
Naive Bayes multinomial,alpha,0.5,0.0001,0.521739,0.0001
Naive Bayes Gaussian,var_smoothing,0.52381,0.8,0.650485,0.2
Decition Tree,max_depth,0.540541,70,0.444444,80
Random Forest,max_features,0.681818,3,0.5,2
Gradient Boosted Decision Trees,n_estimators,0.645161,20,0.137255,30
Multi-Layer Perceptron,hidden_layer_sizes,0.0,"[100, 5]",0.532609,"[25, 5]"


In [None]:
def create_unlabelled_dataset(catalogue_df):

    X_labelled = catalogue_df[catalogue_df['interesting']!='']
    y = X_labelled['interesting']
    X_unlabelled = catalogue_df[catalogue_df['interesting']=='']

    scaler = MinMaxScaler()
    X_labelled_scaled = scaler.fit_transform(X_labelled[['year', 'pages']])
    X_unlabelled_scaled = scaler.transform(X_unlabelled[['year', 'pages']])

    hvect = HashingVectorizer(lowercase=False, n_features=1000)
    X_labelled_tfidf = hvect.fit_transform(X_labelled['tokens'])
    X_unlabelled_tfidf = hvect.transform(X_unlabelled['tokens'])

    tfidf = TfidfTransformer()
    X_labelled_tfidf = tfidf.fit_transform(X_labelled_tfidf)
    X_unlabelled_tfidf = tfidf.transform(X_unlabelled_tfidf)

    X_labelled_tfidf = X_labelled_tfidf.toarray()
    X_labelled = np.hstack((X_labelled_scaled, X_labelled_tfidf))
    X_unlabelled_tfidf = X_unlabelled_tfidf.toarray()
    X_unlabelled = np.hstack((X_unlabelled_scaled, X_unlabelled_tfidf))

    return X_labelled, y, X_unlabelled

In [None]:
jl_file = '/content/drive/My Drive/Library_catalogue_preprocessed.jl'
catalogue = pd.read_json(jl_file, lines=True, orient='records').fillna('')
catalogue.replace({'interesting':'yes'}, 1, inplace=True)
catalogue.replace({'interesting':'no'}, 0, inplace=True)
catalogue['tokens'] = catalogue.apply(join_tokens, axis=1)
catalogue_with_predictions = pd.DataFrame(columns=catalogue.columns)

for language in ['en', 'pl']:
    print(f'Processing {language}')
    lang_catalogue = catalogue[catalogue['language'] == language]
    X_labelled, y, X_unlabelled = create_unlabelled_dataset(lang_catalogue)

    model = GaussianNB(var_smoothing=models_df.loc['Naive Bayes Gaussian', f'param_{language}'])
    model.fit(X_labelled, y)

    print('Predicting if books are interesting')
    predictions = model.predict(X_unlabelled)
    print(predictions)
#    catalogue_unlabelled['interesting'] = model.predict(X_u)
#    print('Saving results in df')
#    catalogue_with_predictions = catalogue_with_predictions.append(catalogue_unlabelled)

Processing en


ValueError: ignored