In [1]:
import json
from html.parser import HTMLParser
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Every chunk of character data the parser reports is appended to
    ``self.fed``; ``get_data`` joins the chunks into one string.
    """

    def __init__(self):
        # Run the base-class initialiser instead of calling ``reset()`` by
        # hand, so whatever state HTMLParser prepares in ``__init__`` exists.
        # The explicit-class call (rather than ``super()``) keeps this usable
        # on both Python 2 back-ports and Python 3.
        HTMLParser.__init__(self)
        self.strict = False
        # Decode character references (e.g. ``&amp;``) before the text is
        # handed to ``handle_data``.
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text chunks concatenated in arrival order."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    The string is fed to a fresh ``MLStripper`` and the text it collected
    is returned as a single string.
    """
    s = MLStripper()
    s.feed(html)
    # Flush the parser: without close() it holds back trailing text that
    # might still be an unfinished character reference (e.g. input ending
    # in "R&D"), and that text would be silently lost.
    s.close()
    return s.get_data()

def read_json_line(line=None):
    """Parse one line of JSON, blanking out characters the parser rejects.

    The line is parsed with ``json.loads``. When parsing fails, the
    character at the reported error offset is replaced by a space and the
    parse is retried until it succeeds.

    The retries run in a loop rather than by recursion, so a line with
    many bad characters cannot exhaust the recursion limit.

    Parameters:
        line: the JSON text to parse.

    Returns:
        The decoded Python object.

    Raises:
        ValueError: the original decoding error, when the reported offset
            lies outside the line or already points at a space (blanking
            it could not make progress).
    """
    text = line
    chars = None
    while True:
        try:
            return json.loads(text)
        except ValueError as err:
            # Python 3's JSONDecodeError carries the offset as ``pos``;
            # otherwise fall back to the trailing "(char N)" of the message.
            bad_idx = getattr(err, 'pos', None)
            if bad_idx is None:
                tail = str(err).split(' ')[-1].replace(')', '')
                bad_idx = int(tail) if tail.isdigit() else -1
            if chars is None:
                chars = list(text)
            if not 0 <= bad_idx < len(chars) or chars[bad_idx] == ' ':
                raise
            chars[bad_idx] = ' '
            text = ''.join(chars)


In [2]:
import pandas as pd
from tqdm import tqdm

# FIELD      USEFULNESS
# ids        yes
# spider     not at all, only 1 value
# timestamp  probably
# author     yes
# content    yes
# domain     yes
# image_url  probably
# link_tags  probably
# meta_tags  probably
# published  yes
# tags       not at all, 1 empty value
# title      yes
# url        probably



def load_ids(path_to_inp_json_file):
    """Collect the ``_id`` field of every record in a JSON-lines file.

    Each line is parsed with ``read_json_line``; iteration is wrapped in
    ``tqdm`` to show progress.

    Returns a DataFrame with a single column ``id``, one row per line.
    """
    with open(path_to_inp_json_file) as json_lines:
        record_ids = [read_json_line(raw)['_id'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'id': record_ids})

def load_timestamps(path_to_inp_json_file):
    """Collect the ``_timestamp`` field of every record in a JSON-lines file.

    Returns a DataFrame with a single column ``timestamp``, one row per line.
    """
    with open(path_to_inp_json_file) as json_lines:
        stamps = [read_json_line(raw)['_timestamp'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'timestamp': stamps})

def load_authors(path_to_inp_json_file):
    """Collect the ``author`` record of every line in a JSON-lines file.

    The author records are turned into a DataFrame and only the
    ``twitter`` and ``url`` columns are returned (``name`` is left out).
    """
    with open(path_to_inp_json_file) as json_lines:
        author_records = [read_json_line(raw)['author'] for raw in tqdm(json_lines)]

    authors_frame = pd.DataFrame.from_dict(author_records)
    return authors_frame[['twitter', 'url']]

def load_contents(path_to_inp_json_file):
    """Load the ``content`` field of every record as tag-free text.

    Newlines and carriage returns are replaced by spaces before the HTML
    markup is removed with ``strip_tags``.

    Returns a DataFrame built directly from the list of cleaned strings.
    """
    with open(path_to_inp_json_file) as json_lines:
        cleaned_texts = [
            strip_tags(read_json_line(raw)['content'].replace('\n', ' ').replace('\r', ' '))
            for raw in tqdm(json_lines)
        ]
    return pd.DataFrame(cleaned_texts)

def load_domains(path_to_inp_json_file):
    """Collect the ``domain`` field of every record in a JSON-lines file.

    Returns a DataFrame with a single column ``domain``, one row per line.
    """
    with open(path_to_inp_json_file) as json_lines:
        domains = [read_json_line(raw)['domain'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'domain': domains})

def load_image_urls(path_to_inp_json_file):
    """Collect the ``image_url`` field of every record in a JSON-lines file.

    Returns a DataFrame with a single column ``image_url``, one row per line.
    """
    with open(path_to_inp_json_file) as json_lines:
        image_urls = [read_json_line(raw)['image_url'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'image_url': image_urls})

def load_link_tags(path_to_inp_json_file):
    """Collect the ``link_tags`` record of every line in a JSON-lines file.

    The records are turned into a DataFrame and a fixed subset of columns
    is returned; ``icon``, ``mask-icon``, ``publisher`` and ``search`` are
    not part of that subset.
    """
    with open(path_to_inp_json_file) as json_lines:
        link_tag_records = [read_json_line(raw)['link_tags'] for raw in tqdm(json_lines)]

    kept_columns = ['alternate', 'amphtml', 'apple-touch-icon', 'author', 'canonical', 'stylesheet']
    return pd.DataFrame.from_dict(link_tag_records)[kept_columns]

def load_meta_tags(path_to_inp_json_file):
    """Collect the ``meta_tags`` record of every line in a JSON-lines file.

    The records are turned into a DataFrame; the columns listed in
    ``useless_columns`` are excluded from the returned frame.
    """
    with open(path_to_inp_json_file) as json_lines:
        meta_tag_records = [read_json_line(raw)['meta_tags'] for raw in tqdm(json_lines)]

    useless_columns = [
        'al:android:app_name', 'al:android:package', 'al:ios:app_name', 'al:ios:app_store_id',
        'fb:app_id', 'og:type', 'theme-color', 'twitter:app:id:iphone', 'twitter:app:name:iphone',
        'twitter:label1', 'viewport',
    ]
    meta_frame = pd.DataFrame.from_dict(meta_tag_records)
    wanted_columns = meta_frame.columns.difference(useless_columns)
    return meta_frame[wanted_columns]

def load_publisheds(path_to_inp_json_file):
    """Collect the ``$date`` entry of every record's ``published`` field.

    Returns a DataFrame with a single column ``published_$date``, one row
    per line of the file.
    """
    with open(path_to_inp_json_file) as json_lines:
        publish_dates = [read_json_line(raw)['published']['$date'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'published_$date': publish_dates})

def load_titles(path_to_inp_json_file):
    """Collect the ``title`` field of every record in a JSON-lines file.

    Returns a DataFrame with a single column ``title``, one row per line.
    """
    with open(path_to_inp_json_file) as json_lines:
        titles = [read_json_line(raw)['title'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'title': titles})

def load_urls(path_to_inp_json_file):
    """Collect the ``url`` field of every record in a JSON-lines file.

    Returns a DataFrame with a single column ``url``, one row per line.
    """
    with open(path_to_inp_json_file) as json_lines:
        urls = [read_json_line(raw)['url'] for raw in tqdm(json_lines)]
    return pd.DataFrame({'url': urls})


In [3]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import json
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge


from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Every chunk of character data the parser reports is appended to
    ``self.fed``; ``get_data`` joins the chunks into one string.

    NOTE: this repeats the class defined in the first cell; running the
    notebook top to bottom leaves this later definition in effect.
    """

    def __init__(self):
        # Run the base-class initialiser instead of calling ``reset()`` by
        # hand, so whatever state HTMLParser prepares in ``__init__`` exists.
        # The explicit-class call (rather than ``super()``) keeps this usable
        # on both Python 2 back-ports and Python 3.
        HTMLParser.__init__(self)
        self.strict = False
        # Decode character references (e.g. ``&amp;``) before the text is
        # handed to ``handle_data``.
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text chunks concatenated in arrival order."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    The string is fed to a fresh ``MLStripper`` and the text it collected
    is returned as a single string.
    """
    s = MLStripper()
    s.feed(html)
    # Flush the parser: without close() it holds back trailing text that
    # might still be an unfinished character reference (e.g. input ending
    # in "R&D"), and that text would be silently lost.
    s.close()
    return s.get_data()


PATH_TO_DATA = './'

def read_json_line(line=None):
    """Parse one line of JSON, blanking out characters the parser rejects.

    The line is parsed with ``json.loads``. When parsing fails, the
    character at the reported error offset is replaced by a space and the
    parse is retried until it succeeds.

    The retries run in a loop rather than by recursion, so a line with
    many bad characters cannot exhaust the recursion limit.

    Parameters:
        line: the JSON text to parse.

    Returns:
        The decoded Python object.

    Raises:
        ValueError: the original decoding error, when the reported offset
            lies outside the line or already points at a space (blanking
            it could not make progress).
    """
    text = line
    chars = None
    while True:
        try:
            return json.loads(text)
        except ValueError as err:
            # Python 3's JSONDecodeError carries the offset as ``pos``;
            # otherwise fall back to the trailing "(char N)" of the message.
            bad_idx = getattr(err, 'pos', None)
            if bad_idx is None:
                tail = str(err).split(' ')[-1].replace(')', '')
                bad_idx = int(tail) if tail.isdigit() else -1
            if chars is None:
                chars = list(text)
            if not 0 <= bad_idx < len(chars) or chars[bad_idx] == ' ':
                raise
            chars[bad_idx] = ' '
            text = ''.join(chars)

def preprocess(path_to_inp_json_file):
    """Return the tag-free ``content`` text of every record in a JSON-lines file.

    For each line: parse it with ``read_json_line``, take ``content``,
    replace newlines and carriage returns with spaces, and remove the HTML
    markup with ``strip_tags``. Progress is shown through ``tqdm``.

    Returns a list of strings, one per line of the file.
    """
    with open(path_to_inp_json_file) as json_lines:
        return [
            strip_tags(read_json_line(raw)['content'].replace('\n', ' ').replace('\r', ' '))
            for raw in tqdm(json_lines)
        ]

# Extract the tag-free article text of both data sets (test first, then train).
test_raw_content = preprocess(os.path.join(PATH_TO_DATA, 'test.json'),)
train_raw_content = preprocess(os.path.join(PATH_TO_DATA, 'train.json'),)


#cv = CountVectorizer(max_features=100000)
#X_train = cv.fit_transform(train_raw_content)
#X_test = cv.transform(test_raw_content)

# Regression target: the 'log_recommends' column of the training CSV, which is indexed by 'id'.
train_target = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_log1p_recommends.csv'), index_col='id')
y_train = train_target['log_recommends'].values

#ridge = Ridge(random_state=17)


## HOLDOUT
# train_part_size = int(0.7 * train_target.shape[0])
# X_train_part = X_train[:train_part_size, :]
# y_train_part = y_train[:train_part_size]
# X_valid =  X_train[train_part_size:, :]
# y_valid = y_train[train_part_size:]
#
# ridge.fit(X_train_part, y_train_part);
# ridge_pred = ridge.predict(X_valid)
# valid_mae = mean_absolute_error(y_valid, ridge_pred)
# valid_mae, np.expm1(valid_mae)
#
# ridge.fit(X_train, y_train);



#ridge.fit(X_train, y_train);
#ridge_test_pred = ridge.predict(X_test)


def write_submission_file(prediction, filename,
                          path_to_sample=os.path.join(PATH_TO_DATA, 'sample_submission.csv')):
    """Write ``prediction`` to ``filename`` in the sample-submission layout.

    The sample file at ``path_to_sample`` is read with ``id`` as its index,
    its ``log_recommends`` column is set to ``prediction``, and the result
    is saved as CSV.
    """
    sample = pd.read_csv(path_to_sample, index_col='id')
    filled = sample.assign(log_recommends=prediction)
    filled.to_csv(filename)

#write_submission_file(prediction=ridge_test_pred, filename='first_ridge.csv')


34645it [01:44, 333.09it/s]
62313it [02:58, 348.33it/s]


In [4]:
# подключим необходимые библиотеки
from sklearn.metrics import mean_squared_error
import re
from nltk.corpus import stopwords
import pymorphy2
from gensim.models import word2vec
# Morphological analyser; in this file it is only referenced by the
# commented-out lemmatisation step of review_to_wordlist.
morph = pymorphy2.MorphAnalyzer()
import nltk
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Union of the English and Russian stop-word lists, used to filter tokens.
stops = set(stopwords.words("english")) | set(stopwords.words("russian"))

[nltk_data] Downloading package stopwords to /home/danil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def review_to_wordlist(review):
    """Turn raw text into a list of lower-cased words without stop words.

    Steps:
      1. every character that is not a Latin or Russian letter becomes a space;
      2. the text is lower-cased and split on whitespace;
      3. words contained in the module-level ``stops`` set are dropped.

    Returns the remaining words as a list, in their original order.
    """
    # 1) "ё"/"Ё" are listed explicitly: they lie outside the contiguous
    #    а-я / А-Я code-point ranges, so without them words such as "ещё"
    #    would be cut apart at that letter.
    review_text = re.sub("[^а-яА-ЯёЁa-zA-Z]", " ", review)
    # 2)
    words = review_text.lower().split()
    # 3)
    words = [w for w in words if w not in stops]
    # 4) lemmatisation is disabled:
    #words = [morph.parse(w)[0].normal_form for w in words ]
    return words

In [9]:
other_data_train = pd.DataFrame()
other_data_test = pd.DataFrame()

# Fill both frames the same way: load the publication dates, convert them
# with pd.to_datetime, then derive the calendar year and month columns.
for _frame, _json_name in ((other_data_train, 'train.json'),
                           (other_data_test, 'test.json')):
    _frame['published'] = load_publisheds(os.path.join(PATH_TO_DATA, _json_name))
    _frame['published'] = _frame['published'].apply(pd.to_datetime)
    _frame['year'] = _frame['published'].apply(lambda x: x.year)
    _frame['month'] = _frame['published'].apply(lambda x: x.month)

# Do not leave the loop helpers behind in the notebook namespace.
del _frame, _json_name


62313it [00:19, 3205.67it/s]
34645it [00:10, 3277.43it/s]


In [6]:
# Wrap the cleaned training texts in a single-column ('content') DataFrame.
train = pd.DataFrame(train_raw_content,columns=['content'])

In [7]:
%%time
# Tokenise every document in place: 'content' holds word lists afterwards,
# so this line is not safe to run twice on the same frame.
train['content'] = train['content'].apply(review_to_wordlist)

CPU times: user 39.1 s, sys: 1 s, total: 40.1 s
Wall time: 40.1 s


In [11]:
%%time
# Train word vectors on the tokenised training texts.
# NOTE(review): `size`, `index2word` and `syn0` look like pre-4.0 gensim
# names — confirm the installed gensim version before re-running.
model = word2vec.Word2Vec(train['content'], size=300, window=10, workers=4)
# Plain lookup table: word -> embedding vector.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

CPU times: user 22min 7s, sys: 2.77 s, total: 22min 10s
Wall time: 5min 52s


  


In [12]:
# Let's see what the model has learned:
model.wv.most_similar(positive=['open', 'data','science','best'])

[(u'also', 0.543807327747345),
 (u'using', 0.49845319986343384),
 (u'work', 0.49001190066337585),
 (u'use', 0.48959293961524963),
 (u'better', 0.4888975918292999),
 (u'great', 0.486032098531723),
 (u'research', 0.47841358184814453),
 (u'technology', 0.4764072000980377),
 (u'one', 0.4740687608718872),
 (u'matterif', 0.4722328782081604)]

In [13]:
#пропишем класс выполняющий tfidf преобразование.
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class tfidf_vectorizer(object):
    """Embed tokenised documents as IDF-weighted means of word vectors.

    ``word2vec`` is a mapping from word to vector. ``fit`` learns one IDF
    weight per word; ``transform`` averages, for every document, the
    vectors of its known words multiplied by their weights.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # Take the dimensionality from the mapping that was passed in, not
        # from a notebook-level global; an empty mapping yields dimension 0.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = 0 if first_vector is None else len(first_vector)

    def fit(self, X):
        """Learn IDF weights from ``X``, an iterable of token lists.

        Returns ``self`` so that ``fit(...).transform(...)`` can be chained.
        """
        # The documents are already tokenised, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # A word that was not seen during fitting gets the largest IDF
        # observed, i.e. it is treated as at least as rare as any known word.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one averaged, weighted vector per document.

        Words missing from ``word2vec`` are skipped; a document without any
        known word is represented by a zero vector of length ``dim``.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [15]:
# Vectorizer backed by the word -> vector lookup `w2v`.
tfidf = tfidf_vectorizer(w2v)

In [16]:
%%time
# Fit the IDF weights and embed every training document.
# NOTE: this rebinds `train` from the DataFrame to the array returned by
# transform(), so the line cannot be re-run without recreating `train`.
train=tfidf.fit(train['content']).transform(train['content'])

CPU times: user 1h 57min 34s, sys: 1.81 s, total: 1h 57min 36s
Wall time: 1h 57min 36s


In [17]:
def split(train, y, ratio):
    """Cut ``train`` and ``y`` in two at row position ``ratio``.

    Despite its name, ``ratio`` is used directly as a slice index, not as
    a fraction.

    Returns ``(train[:ratio, :], train[ratio:, :], y[:ratio], y[ratio:])``.
    """
    head_rows, tail_rows = train[:ratio, :], train[ratio:, :]
    head_target, tail_target = y[:ratio], y[ratio:]
    return head_rows, tail_rows, head_target, tail_target

In [18]:
# Positional hold-out: the first 40000 rows are used for fitting, the rest for validation.
Xtr, Xval, ytr, yval = split(train, y_train, 40000)
# Display the shapes and target means of both parts.
Xtr.shape,Xval.shape,ytr.mean(),yval.mean()

((40000, 300), (22313, 300), 3.0935355707499999, 2.9762515950342849)

In [19]:
# Fit a ridge regression on the fitting part and print the mean squared
# error on both the fitting and the validation part.
# NOTE(review): the output recorded under this cell is an AttributeError
# about 'median', which nothing here calls — it looks stale; re-run to confirm.
ridge = Ridge(alpha = 1,random_state=7)
ridge.fit(Xtr, ytr)
train_preds = ridge.predict(Xtr)
valid_preds = ridge.predict(Xval)
print('Ошибка на трейне',mean_squared_error(ytr, train_preds))
print('Ошибка на валидации',mean_squared_error(yval, valid_preds))

AttributeError: 'numpy.ndarray' object has no attribute 'median'

In [21]:
# Score the same predictions with mean absolute error.
print('Ошибка на трейне',mean_absolute_error(ytr, train_preds))
print('Ошибка на валидации',mean_absolute_error(yval, valid_preds))

('\xd0\x9e\xd1\x88\xd0\xb8\xd0\xb1\xd0\xba\xd0\xb0 \xd0\xbd\xd0\xb0 \xd1\x82\xd1\x80\xd0\xb5\xd0\xb9\xd0\xbd\xd0\xb5', 1.3612604571809916)
('\xd0\x9e\xd1\x88\xd0\xb8\xd0\xb1\xd0\xba\xd0\xb0 \xd0\xbd\xd0\xb0 \xd0\xb2\xd0\xb0\xd0\xbb\xd0\xb8\xd0\xb4\xd0\xb0\xd1\x86\xd0\xb8\xd0\xb8', 1.3071529573134417)


**Обучим на всей выборке**

In [None]:
%%time
# Refit the ridge model on all training rows.
ridge.fit(train, y_train)

In [28]:
# Tokenise the test texts and embed them with the already-fitted vectorizer.
# NOTE: X_test is first a DataFrame and is then rebound to the result of transform().
X_test = pd.DataFrame(test_raw_content,columns=['content'])
X_test['content'] = X_test['content'].apply(review_to_wordlist)
X_test = tfidf.transform(X_test['content'])

34645it [01:49, 316.47it/s]


In [29]:
# Predict on the test features and write the submission CSV.
ridge_test_pred = ridge.predict(X_test)
write_submission_file(prediction=ridge_test_pred, filename='ridge_with_wor2vec_and_tfidf.csv')

**Попробуем нейронные сети.**

In [None]:
# подключим библиотеки keras 
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras import regularizers
from keras.wrappers.scikit_learn import KerasRegressor

In [None]:
# Describe the network.
def baseline_model():
    """Build and compile the feed-forward regression network.

    Layers, in order: Dense(128, relu) sized to the columns of ``Xtr``,
    Dropout(0.2), Dense(64, relu), Dropout(0.5), Dense(1). The model is
    compiled with mean-squared-error loss and the adam optimizer.
    """
    layers = [
        Dense(128, input_dim=Xtr.shape[1], kernel_initializer='normal', activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, kernel_initializer='normal'),
    ]

    net = Sequential()
    for layer in layers:
        net.add(layer)

    net.compile(loss='mean_squared_error', optimizer='adam')
    return net
# Wrap the network builder in KerasRegressor, passing the hold-out part as validation data.
# NOTE(review): both `epochs` and `nb_epoch` are passed — presumably one is a
# leftover from an older Keras API; confirm which one is honoured.
estimator = KerasRegressor(build_fn=baseline_model,epochs=20, nb_epoch=20, batch_size=64,validation_data=(Xval, yval), verbose=2)

In [None]:
# Train the wrapped network on the fitting part of the training data.
estimator.fit(Xtr, ytr)

In [26]:
# Collect the non-text fields of the test set in one frame.
test_json_path = os.path.join(PATH_TO_DATA, 'test.json')

other_data=pd.DataFrame()
other_data['timestamp']=load_timestamps(test_json_path)
# Parse the author records once and reuse them for both columns; calling
# load_authors per column reads and parses the whole file twice.
test_authors = load_authors(test_json_path)
other_data['author_twitter']=test_authors['twitter']
other_data['author_url']=test_authors['url']
other_data['domain']=load_domains(test_json_path)
other_data['published']=load_publisheds(test_json_path)
other_data['title']=load_titles(test_json_path)
other_data['image_url']=load_image_urls(test_json_path)
#other_data['meta_tag']=load_meta_tags(os.path.join(PATH_TO_DATA, 'test.json'))
#other_data['link_tags']=load_link_tags(os.path.join(PATH_TO_DATA, 'test.json'))
other_data['url']=load_urls(test_json_path)


34645it [00:11, 3097.98it/s]
34645it [00:12, 2774.77it/s]
34645it [00:11, 3100.38it/s]
34645it [00:10, 3161.17it/s]
34645it [00:10, 3154.94it/s]
34645it [00:11, 2962.71it/s]
34645it [00:11, 3037.47it/s]
34645it [00:11, 3105.80it/s]


In [105]:
# Start a fresh training frame holding only the titles.
other_data_train = pd.DataFrame()
other_data_train['title'] = load_titles(os.path.join(PATH_TO_DATA, 'train.json'))
#other_data_train['published']=load_publisheds(os.path.join(PATH_TO_DATA, 'train.json'))
#other_data_train['domain']=load_domains(os.path.join(PATH_TO_DATA, 'train.json'))
#other_data_train['domain']=other_data_train['domain'].str.split('.').str.get(-1)
#le_domen=LabelEncoder()
#other_data_train['domain']=le_domen.fit_transform(other_data_train['domain'])


#published_year_month_scaler=StandardScaler()
#other_data_train['published_year_month']=pd.DatetimeIndex(other_data_train['published']).year*100+pd.DatetimeIndex(other_data_train['published']).month
#other_data_train['published_year_month']=published_year_month_scaler.fit_transform(other_data_train['published_year_month'].as_matrix().reshape(-1,1))

# Start a fresh test frame holding only the titles.
other_data_test = pd.DataFrame()
other_data_test['title'] = load_titles(os.path.join(PATH_TO_DATA, 'test.json'))
#other_data_test['published']=load_publisheds(os.path.join(PATH_TO_DATA, 'test.json'))
#other_data_test['published_year_month']=pd.DatetimeIndex(other_data_test['published']).year*100+pd.DatetimeIndex(other_data_test['published']).month
#other_data_test['published_year_month']=published_year_month_scaler.transform(other_data_test['published_year_month'].as_matrix().reshape(-1,1))
#other_data_test['domain']=load_domains(os.path.join(PATH_TO_DATA, 'test.json'))
#other_data_test['domain']=other_data_test['domain'].str.split('.').str.get(-1)
#other_data_test['domain']=le_domen.transform(other_data_test['domain'])

# Bag-of-words over titles: the vocabulary is fitted on the training titles
# only and then applied unchanged to the test titles.
CountVectorizer_title = CountVectorizer()
other_data_train_title = CountVectorizer_title.fit_transform(other_data_train['title'])
other_data_test_title = CountVectorizer_title.transform(other_data_test['title'])

In [117]:
# Put the title counts next to the text features, refit the ridge model and
# write a second submission.
# NOTE(review): the only assignment to X_train in this file is commented
# out, so on a fresh kernel this name would be undefined; the recorded
# TypeError suggests it held an object-dtype value. Presumably the matrix
# produced by tfidf.transform was intended — confirm.
data_train = csr_matrix(hstack([other_data_train_title, X_train]))
data_test = csr_matrix(hstack([other_data_test_title, X_test]))
ridge.fit(data_train,y_train)
ridge_test_pred = ridge.predict(data_test)
write_submission_file(prediction=ridge_test_pred, filename='second_ridge.csv')

TypeError: no supported conversion for types: (dtype('int64'), dtype('O'), dtype('int64'))

In [118]:
# Look at two candidate features reshaped into single-column arrays.
# NOTE(review): the lines that would create 'published_year_month' and
# 'domain' on other_data_train are commented out in this file — confirm
# these columns exist before re-running.
other_data_train['published_year_month'].values.reshape(-1,1)
other_data_train['domain'].values.reshape(-1,1)

array([[u'medium.com'],
       [u'medium.com'],
       [u'medium.com'],
       ..., 
       [u'medium.com'],
       [u'medium.com'],
       [u'byrslf.co']], dtype=object)