In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer

## Подготовка данных (urls)

In [2]:
# Load the raw training log: a header-less, tab-separated file with three fields.
urls_train_df = pd.read_csv('url_domain_train', sep='\t', header=None)
urls_train_df.columns = ['id', 'url', 'count']
# The 'count' column is not used anywhere below, so discard it here.
urls_train_df = urls_train_df.drop('count', axis=1)

In [3]:
# Preview the first five (id, url) rows.
urls_train_df.head(n=5)

Unnamed: 0,id,url
0,000000014B60815F65B38258011B6C01,login.rutracker.org
1,000000014B60815F65B38258011B6C01,rutracker.org
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net
3,000000014C03DA2A47AC433A0C755201,czinfo.ru
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru


In [4]:
# Collapse the log to one row per user: every url seen for an id goes into a list.
urls_train_df = (
    urls_train_df
    .groupby('id')['url']
    .apply(lambda urls: urls.tolist())
    .to_frame()
)
urls_train_df.columns = ['urls']
# Copy the group key out of the index into a regular column ...
urls_train_df['id'] = urls_train_df.index
# ... and replace the index with plain positions 0..n-1.
urls_train_df = urls_train_df.reset_index(drop=True)

In [5]:
# Inspect the last five users after grouping.
urls_train_df.tail(n=5)

Unnamed: 0,urls,id
118598,[dme.ru],E9B7BDD54E733BDF364DD7B4EF74CFEB
118599,"[aif.ru, autochel.ru, chelyabinsk.ru, id.rambl...",E9B9D7D347EB1ACE5AAEEBFCE3FBCE7B
118600,"[blog.partisani.ge, li.ru, tvrain.ru]",EBFAC66B4EE3FB96BA5DD7DDE3787FE7
118601,"[doctorkirov.ru, drive.ru, extrim-park43.ru, m...",F537AD6B46D31ABFF597EFDFE1BDDE71
118602,[samara.drom.ru],F55C7EDB467B9FAE1F97E7DDE1747F6B


In [6]:
# Number of urls recorded for each user; computed once and reused for both statistics.
urls_per_user = urls_train_df['urls'].apply(len)
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print(urls_per_user.max())   # longest url list
print(urls_per_user.mean())  # average url list length

1847
17.2581553586


In [7]:
# Load the regression targets: a header-less, tab-separated file with two fields.
age_train_df = pd.read_csv('age_profile_train', sep='\t', header=None)
# Name the fields; 'id' is the key used later to join with the url table.
age_train_df.columns = ['id', 'age']

In [8]:
# Preview the first five (id, age) rows.
age_train_df.head(n=5)

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [9]:
# Attach the age to each user's url list. A left join keeps every row of
# urls_train_df; an id with no match in age_train_df would get a NaN age.
train_df = pd.merge(urls_train_df, age_train_df, how='left', on='id')

In [10]:
# Inspect the last five rows of the merged training table.
train_df.tail(n=5)

Unnamed: 0,urls,id,age
118598,[dme.ru],E9B7BDD54E733BDF364DD7B4EF74CFEB,27
118599,"[aif.ru, autochel.ru, chelyabinsk.ru, id.rambl...",E9B9D7D347EB1ACE5AAEEBFCE3FBCE7B,58
118600,"[blog.partisani.ge, li.ru, tvrain.ru]",EBFAC66B4EE3FB96BA5DD7DDE3787FE7,56
118601,"[doctorkirov.ru, drive.ru, extrim-park43.ru, m...",F537AD6B46D31ABFF597EFDFE1BDDE71,31
118602,[samara.drom.ru],F55C7EDB467B9FAE1F97E7DDE1747F6B,35


## Снижение размерности

In [11]:
# Train on the first `topk` rows of train_df only (a positional slice, not a ranking).
topk = 20000
X = train_df['urls'].values[:topk]
y = train_df['age'].values[:topk]

In [12]:
# Turn each user's list of urls into one space-separated string.
# A list comprehension yields a concrete list on both Python 2 and Python 3
# (on Python 3, map returns a one-shot iterator instead).
X = [' '.join(urls) for urls in X]
# Hash the text into a fixed 1000-column feature space. HashingVectorizer is
# stateless, so fit() does not learn anything from X; `hw` is reused on the test set.
hw = HashingVectorizer(n_features=1000).fit(X)
# toarray() returns a plain ndarray; todense() would return the legacy np.matrix type.
X = hw.transform(X).toarray()

In [13]:
# Sanity check: (number of users, number of hashed features).
# Parenthesised form prints the same tuple on Python 2 and Python 3.
print(X.shape)

(20000, 1000)


## Обучение модели

In [14]:
reg = LinearRegression()
# The result of cross_val_score is negated before display, so the cell shows
# one positive mean-squared-error value per fold.
np.negative(cross_val_score(reg, X, y, scoring='mean_squared_error'))

array([ 155.0777652 ,  145.92091826,  161.93119466])

## Отправка решения

In [15]:
# Refit a fresh model on the selected training rows; fit() returns the
# estimator itself, so it can be bound directly.
reg = LinearRegression().fit(X, y)
# Echo the fitted estimator as the cell output.
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Load the raw test log: a header-less, tab-separated file with three fields.
urls_test_df = pd.read_csv('url_domain_test', sep='\t', header=None)
urls_test_df.columns = ['id', 'url', 'count']
# The 'count' column is not used anywhere below, so discard it here.
urls_test_df = urls_test_df.drop('count', axis=1)

In [17]:
# Collapse the log to one row per user: every url seen for an id goes into a list.
urls_test_df = (
    urls_test_df
    .groupby('id')['url']
    .apply(lambda urls: urls.tolist())
    .to_frame()
)
urls_test_df.columns = ['urls']
# Copy the group key out of the index into a regular column ...
urls_test_df['id'] = urls_test_df.index
# ... and replace the index with plain positions 0..n-1.
urls_test_df = urls_test_df.reset_index(drop=True)

In [18]:
# Preview the first five grouped test users.
urls_test_df.head(n=5)

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [19]:
# Build test features exactly like the training ones, reusing the vectorizer `hw`.
X = urls_test_df.urls.values
# A list comprehension yields a concrete list on both Python 2 and Python 3
# (on Python 3, map returns a one-shot iterator instead).
X = [' '.join(urls) for urls in X]
# toarray() returns a plain ndarray; todense() would return the legacy np.matrix type.
X = hw.transform(X).toarray()

In [20]:
# Predict with the fitted regressor on the current X (the test features built in the previous cell).
y_pred = reg.predict(X)

In [21]:
# Display the predicted values as the cell output.
y_pred

array([ 42.36502418,  41.0370349 ,  39.71385725, ...,  34.58340228,
        37.06780897,  46.90645594])

In [22]:
# Store the predictions next to their user ids as a new 'age' column.
urls_test_df = urls_test_df.assign(age=y_pred)

In [23]:
# Keep only the two output columns and rename the key to 'Id'
# (the same column name that is later read from random_solution.csv).
urls_test_df = urls_test_df[['id', 'age']].rename(columns={'id': 'Id'})

In [24]:
# Preview the first five rows of the submission table.
urls_test_df.head(n=5)

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,42.365024
1,000000014A10EA183BF8594A0B2AB201,41.037035
2,000000014A4FE5C33A929D4C26943601,39.713857
3,000000014B7BB9957784A9BC0AC9F401,33.655781
4,000000014C7749F896D82C2B01E8B801,34.989184


In [27]:
    # Show the estimator's hyper-parameters as a dict (completes the dangling `reg.` expression).
    reg.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': False}

In [25]:
# 'random_solution.csv' must be present in the working directory; it is read
# here only for its 'Id' column.
random_sol = pd.read_csv('random_solution.csv')
# Ids listed in random_solution.csv that did not receive a prediction above.
miss_idx = set(random_sol.Id.values) - set(urls_test_df.Id.values)
# Give every missing id a constant age of 1.0.
# NOTE(review): 1.0 looks like a placeholder value; confirm it is the intended default.
# Building from a dict with explicit columns works on Python 2 and 3 (no lazy zip)
# and still yields a two-column frame when no ids are missing.
miss_df = pd.DataFrame(
    {'Id': list(miss_idx), 'age': np.ones(len(miss_idx))},
    columns=['Id', 'age'],
)

IOError: File random_solution.csv does not exist

In [None]:
# Add the filler rows below the real predictions and renumber the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
urls_test_df = pd.concat([urls_test_df, miss_df], ignore_index=True)

In [None]:
# Write the submission to 'solution.csv'; index=False omits the DataFrame index column.
urls_test_df.to_csv('solution.csv', index=False)

In [None]:
# Shell escape: report how many lines the written solution.csv contains.
!wc -l solution.csv