In [123]:
import mysql.connector
from mysql.connector import errorcode

TABLES = {}
TABLES['books'] = (
    "CREATE TABLE `books` ("
    "  `bid` int(11) NOT NULL AUTO_INCREMENT,"
    "  `oid` int(11) NOT NULL,"
    "  PRIMARY KEY (`bid`)"
    ")")

TABLES['prints'] = (
    "CREATE TABLE `prints` ("
    "  `isbn` varchar(15) NOT NULL,"
    "  `title` varchar(1024) NOT NULL,"
    "  `authors` varchar(1024) NOT NULL,"
    "  `lexems` varchar(1024) NOT NULL,"
    "  `bookspine` varchar(127) NOT NULL,"
    "  `lang` varchar(127) NOT NULL,"
    "  `genre` varchar(50) NOT NULL,"
    "  `image` varchar(127) NOT NULL,"
    "  `bid` int(11),"
    "  `oid` int(11),"
    "  `description` varchar(1024) NOT NULL,"    
    "  `modified` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"    
    "  PRIMARY KEY (`isbn`)"
    ")")

TABLES['tags'] = (
    "CREATE TABLE `tags` ("
    "  `isbn` varchar(15) NOT NULL,"
    "  `tag` varchar(50) NOT NULL,"
    "  PRIMARY KEY (`tag`),"
    "  CONSTRAINT `tags_fk` FOREIGN KEY (`isbn`)"
    "     REFERENCES `prints` (`isbn`) ON DELETE CASCADE"
    ")")

TABLES['bookspines'] = (
    "CREATE TABLE `bookspines` ("
    "  `sid` int(11) NOT NULL AUTO_INCREMENT,"
    "  `bid` int(11) NOT NULL,"
    "  `usid` varchar(31) NOT NULL,"
    "  `imid` varchar(31) NOT NULL,"
    "  `box` varchar(127) NOT NULL,"
    "  `text` varchar(127) NOT NULL,"
    "  PRIMARY KEY (`sid`),"
    "  CONSTRAINT `bookspine_fk` FOREIGN KEY (`bid`)"
    "     REFERENCES `books` (`bid`) ON DELETE CASCADE"
    ")")

TABLES['unknowns'] = (
    "CREATE TABLE `unknowns` ("
    "  `uid` int(11) NOT NULL AUTO_INCREMENT,"
    "  `usid` varchar(31) NOT NULL,"
    "  `imid` varchar(31) NOT NULL,"
    "  `box` varchar(127) NOT NULL,"
    "  `text` varchar(127) NOT NULL,"
    "  `bid` int(11) NOT NULL,"
    "  PRIMARY KEY (`uid`)"
    ")")

cnx = mysql.connector.connect(user='root', password='root',
                              host='35.223.45.184',
                              database='biblosphere',
                              use_pure=False)

cursor = cnx.cursor()

for table_name in TABLES:
    table_description = TABLES[table_name]
    try:
        print("Creating table {}: ".format(table_name), end='')
        cursor.execute(table_description)
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
            print("already exists.")
        else:
            print(err.msg)
    else:
        print("OK")

cursor.close()
cnx.close()

Creating table books: OK
Creating table prints: OK
Creating table bookspines: OK
Creating table unknowns: OK


In [122]:
cnx = mysql.connector.connect(user='biblosphere', password='biblosphere',
                              host='127.0.0.1',
                              database='biblosphere',
                              use_pure=False)

cursor = cnx.cursor()

cursor.execute("DROP TABLE PRINTS") 
print("Table PRINTS dropped... ")

cursor.execute("DROP TABLE BOOKSPINES") 
print("Table BOOKSPINES dropped... ")

cursor.execute("DROP TABLE BOOKS") 
print("Table BOOKS dropped... ")

cursor.execute("DROP TABLE UNKNOWNS") 
print("Table UNKNOWNS dropped... ")

cursor.close()
cnx.close()



Table PRINTS dropped... 
Table BOOKSPINES dropped... 
Table BOOKS dropped... 
Table UNKNOWNS dropped... 


In [157]:
def lexems(s, full = False):
    if type(s) is str:
        return [w for w in re.sub('[\;\(\)\"\,\/\&\!\?\:\.\-]|#[0-9]*',' ',s.lower()).split() if full or w not in exclude and len(w) > 1]
    elif type(s) is set:
        return [w.lower() for w in s if full or w not in exclude and len(w) > 1]


class Book:
    def __init__(self, isbn, title, authors):
        self.isbn = isbn
        self.title = title
        self.authors = authors
        self.keys = lexems(title + ' ' + authors, full=True)

query = "SELECT isbn, title, authors, MATCH (lexems) AGAINST (? IN BOOLEAN MODE) AS score FROM prints WHERE MATCH (lexems) AGAINST (? IN BOOLEAN MODE) ORDER BY score DESC limit 1"
#query = "SELECT isbn, title, authors, MATCH (lexems) AGAINST ('good to great collins kuku' IN BOOLEAN MODE) AS score FROM prints WHERE MATCH (lexems) AGAINST ('good to great collins kuku' IN BOOLEAN MODE) ORDER BY score DESC limit 1"


def find_book(cursor, words, trace=False):
    # Only match by two words or more
    if len(words) < 1:
        return None, set(), words

    str_words = ' '.join(words)
    cursor.execute(query, (str_words, str_words,))

    results = cursor.fetchall()
    if trace:
        print(results)
    
    
    # Nothing found
    if len(results) == 0:
        return None, set(), words
    
    # Get book information
    isbn, title, authors, score = results[0]
    
    # Get all words from the title and author
    book = Book(isbn, title, authors)
    
    matched_words = set([w for w in words if w in book.keys])
    unmatched_words = words - matched_words

    # Only match by two words or more
    if len(matched_words) < 1:
        return None, set(), words

    if trace:
        print('Book found (%.2f/%d)' % (score, len(matched_words)), title)

    return book, matched_words, unmatched_words
    
    
cnx = mysql.connector.connect(user='biblosphere', password='biblosphere',
                              host='127.0.0.1',
                              database='biblosphere',
                              use_pure=False)

cursor = cnx.cursor(prepared=True)

book, m, u = find_book(cursor, set('rethinking'.split()), trace=False)

if book is not None:
    print(book.authors, book.title, m, u)
else:
    print('Book not found', m, u)

cursor.close()
cnx.close()

Matthew Alan Cahn Rethinking California: Politics and Policy in the Golden State {'rethinking'} {'all'}


In [121]:
import pandas as pd
import numpy as np
import re
import ast

lang_map = {'Русский (Субтитры)': 'rus',
            'ger': 'deu',
            'pol': 'pol',
            'ara': 'ara',
            'Английский': 'eng',
            'Датский': 'dan',
            'Греческий': 'ell',
            'Тибетский': 'bod',
            'Португальский': 'por',
            'tur': 'tur',
            'Иврит': 'heb',
            'vie': 'vie',
            'por': 'por',
            'Китайский': 'zho',
            'Эвенский': 'eve',
            'Турецкий': 'tur',
            'Древнерусский': 'orv',
            'Вьетнамский': 'vie',
            'Пали': 'pli',
            'Молдавский': 'ron',
            'Корякский': 'kpy',
            'Славянский': 'orv',
            'Финский': 'fin',
            'Татарский': 'tat',
            'Словацкий': 'slk',
            'fil': 'fil',
            'mul': 'mul', 
            'Японский': 'jpn', 
            'Баскский': 'eus', 
            'Хорватский': 'hrv', 
            'Литовский': 'lit', 
            'Французский': 'fra' , 
            'Древнегреческий': 'grc', 
            'Нидерландский': 'nld', 
            'Чувашский': 'chv', 
            'nor': 'nor', 
            'Шведский': 'swe', 
            'dan': 'dan', 
            'Испанский': 'spa', 
            'en-US': 'eng', 
            'eng': 'eng', 
            'en-GB': 'eng', 
            'Хинди': 'hin', 
            'Сербский': 'srp', 
            'nl': 'nld', 
            'Старорусский': 'orv', 
            'jpn': 'jpn', 
            'Словенский': 'slv', 
            'Арабский': 'ara', 
            'Болгарский': 'bul', 
            'Венгерский': 'hun', 
            'rum': 'rum', 
            'ita': 'ita', 
            'Таджикский': 'tgk', 
            'Казахский': 'kaz', 
            'Итальянский': 'ita', 
            'Монгольский': 'mgt', 
            'Польский': 'pol', 
            'Санскрит': 'san', 
            'Латинский': 'lat', 
            'fre': 'fra', 
            'swe': 'swe', 
            'Корейский': 'kor', 
            'Древнееврейский': 'hbo', 
            'ind': 'ind', 
            'Голландский': 'nld', 
            'Белорусский': 'bel', 
            'Немецкий': 'deu', 
            'Румынский': 'ron', 
            'Старославянский': 'chu', 
            'Армянский': 'hye', 
            'Чешский': 'ces', 
            'per': 'fas', 
            'Азербайджанский': 'aze', 
            'spa': 'spa', 
            'Русский': 'rus', 
            'rus': 'rus', 
            'Церковнославянский': 'chu', 
            'Эстонский': 'est', 
            'en-CA': 'eng', 
            'Украинский': 'ukr', 
            'Латышский': 'lav', 
            'Арамейский': 'arc',
            'Язык Не Указан': '',
            'Sound Effects Only Track': '',
            '': ''}


def map_lang(s):
    langs = [lang_map[x.strip()] for x in s.split(',')]
    langs = [l for l in langs if l != '']
    langs = ','.join(langs)
    return langs

def parse_list(s):
    try:
        l = ast.literal_eval(s)
        l = list(set([str(i).strip() for i in l]))
        return l
    except ValueError:
        print("Oops! Wrong string", s)
        return ''

def isbn_check(isbn):
    if type(isbn) == float:
        isbn = '{:.0f}'.format(isbn)
    
    if len(isbn) == 10 and re.match(r'^[0-9]*$', isbn) is not None:
        isbn = '978' + isbn
        
    if len(isbn) == 13 and re.match(r'^[0-9]*$', isbn) is not None:
        check = 0
        for i, ch in enumerate(isbn[0:12]):
            check += int(ch) * (1 + i % 2 * 2)

        check = check % 10
        if check > 0:
            check = 10 - check

        return isbn[0:12]+str(check)
    else:
        return ''

    
def build_keys(s):
    s = [w for w in set(re.sub('[\;\(\)\"\,\/\&\!\?\:\.\-\*\·\|\+\$\'\«\@\\\]',' ',s.lower()).split()) if len(w) > 2]
    return ' '.join(s)

books_db = pd.DataFrame(columns=('title', 'authors', 'isbn', 'image', 'lang'))

b = pd.read_csv('../recommender/datasets/books-russian.csv', usecols = ['authors', 'title', 'isbn', 'image', 'lang'], \
               names = ['i','title','authors','isbn','image','price','tags','lang'], \
               header=0)

b.fillna(value='', inplace=True)
b.isbn = b.isbn.apply(parse_list)
b = b.explode('isbn')
b.isbn = b.isbn.apply(isbn_check)

b = b[b.isbn != '']

books_db = books_db.append(b, ignore_index=True, sort=True)

# Read goodbook-10k
b = pd.read_csv('../recommender/datasets/books-goodbooks10k.csv', usecols = ['authors', 'title', 'isbn', 'lang', 'image'], \
              names = ['id','book_id','best_book_id','work_id','books_count','isbn10','isbn','authors','original_publication_year','original_title','title','lang','average_rating','ratings_count','work_ratings_count','work_text_reviews_count','ratings_1','ratings_2','ratings_3','ratings_4','ratings_5','image','small_image'], \
               header=0)

b.fillna(value='', inplace=True)

b.isbn = pd.to_numeric(b.isbn, errors='coerce')
pd.options.display.float_format = '{:.0f}'.format
b.isbn = b.isbn.map('{:.0f}'.format)
b.isbn = b.isbn.apply(isbn_check)
b = b[b.isbn != '']

books_db = books_db.append(b, ignore_index=True, sort=True)

# Read bookcrossing
b = pd.read_csv('../recommender/datasets/books-bookcrossing.csv', usecols = ['authors', 'title', 'isbn', 'image'], \
                names = ['isbn', 'title', 'authors', "Year-Of-Publication", "Publisher", "small_image", "image", "large_image"], \
                sep=';', encoding='latin-1', header=0)

b.fillna(value='', inplace=True)
b.isbn = b.isbn.apply(isbn_check)

b['lang'] = ''
b = b[b.isbn != '']

books_db = books_db.append(b, ignore_index=True, sort=True)

books_db = books_db[~books_db.duplicated('isbn')]

# Map language codes
books_db.lang = books_db.lang.apply(map_lang)

books_db['keys'] = books_db.authors + ' ' + books_db.title
books_db['keys'] = books_db['keys'].apply(build_keys)

books_db.to_csv('datasets/isbn.csv')

books_db.head()

Unnamed: 0,authors,image,isbn,lang,title,keys
0,,https://cdn1.ozone.ru/multimedia/c250/10239972...,543209081802,rus,"""-Здравствуйте! Это Я!.."": Стихи",стихи здравствуйте это
1,,https://cdn1.ozone.ru/multimedia/c250/10045153...,9785707000317,,"""... лучше воздержаться""",лучше воздержаться
2,,https://cdn1.ozone.ru/multimedia/c250/10176740...,9785367010756,,"""...Все испытал и все проник""",испытал проник все
3,,https://cdn1.ozone.ru/multimedia/c250/10108160...,9785803201816,rus,"""...Дом твоего детства""",дом детства твоего
4,Чейз Джеймс Хедли,https://cdn1.ozone.ru/multimedia/c250/10047088...,9785726004310,rus,"""...И вы будете редактором отдела"". Двойная сдача",будете сдача редактором отдела чейз хедли джей...


In [120]:
def build_keys(s):
    s = [w for w in set(re.sub('[\;\(\)\"\,\/\&\!\?\:\.\-\*\·\|\+\$\'\«\@\\\]',' ',s.lower()).split()) if len(w) > 2]
    return ' '.join(s)


print(build_keys('Hello\ "world" : hello 1"'))
    

world hello


In [None]:
# Export table to CSV file
TABLE prints
    INTO OUTFILE 'C:\\ProgramData\\MySQL\\MySQL Server 8.0\\Uploads\\prints.txt'
    FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'; ESCAPED BY '\\'
    LINES TERMINATED BY '\n';
    
# Load data fro mCSV file to table
LOAD DATA INFILE 'C:\\ProgramData\\MySQL\\MySQL Server 8.0\\Uploads\\isbn.csv' INTO TABLE prints
    FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY ''
    LINES TERMINATED BY '\r\n'
    IGNORE 1 LINES
    (@dummy, authors, image, isbn, lang, title, lexems)
    SET bookspine = '', genre = '', bid = NULL, oid = NULL;

CREATE FULLTEXT INDEX prints_fulltext ON prints(lexems);

LOAD DATA LOCAL INFILE 'C:\\ProgramData\\MySQL\\MySQL Server 8.0\\Uploads\\prints.txt' INTO TABLE prints
    FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
    
LOAD DATA LOCAL INFILE 'prints.txt' INTO TABLE prints FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'

In [5]:
import mysql.connector
from mysql.connector import errorcode

cnx = mysql.connector.connect(user='root', password='root',
                              host='35.223.45.184',
                              database='biblosphere',
                              use_pure=False)

TABLES = {}
TABLES['tags'] = (
    "CREATE TABLE `tags` ("
    "  `tag` varchar(50) NOT NULL,"
    "  `isbn` varchar(15) NOT NULL,"
    "  PRIMARY KEY (`tag`),"
    "  CONSTRAINT `tags_fk` FOREIGN KEY (`isbn`)"
    "     REFERENCES `prints` (`isbn`) ON DELETE CASCADE"
    ")")

cursor = cnx.cursor()

for table_name in TABLES:
    table_description = TABLES[table_name]
    try:
        print("Creating table {}: ".format(table_name), end='')
        cursor.execute(table_description)
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
            print("already exists.")
        else:
            print(err.msg)
    else:
        print("OK")

cursor.close()
cnx.close()

Creating table tags: OK


In [None]:
CREATE USER 'biblosphere' IDENTIFIED BY 'biblosphere';
GRANT ALL PRIVILEGES ON biblosphere.* TO 'biblosphere';

In [13]:
import mysql.connector
from mysql.connector import errorcode

cnx = mysql.connector.connect(user='root', password='root',
                              host='35.223.45.184',
                              database='biblosphere',
                              use_pure=False)

cnx.autocommit = True
cursor = cnx.cursor()

# Delete Sliozi predkov
#cursor.execute("DELETE from prints WHERE ISBN='9785911600495'")
#cursor.execute("DELETE from prints WHERE ISBN='9785389015234'")
cursor.execute("SELECT * from prints WHERE ISBN='9785389015234'")
#cursor.execute("SELECT * from prints limit 1")
#cursor.execute("ALTER TABLE prints ADD COLUMN `modified` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")    
 

results = cursor.fetchall()
print(results)

cursor.close()
cnx.close()

[]
