In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Train Data

In [2]:
# Read the labelled training set from the notebook's working directory.
train_path = "train.csv"
train_df = pd.read_csv(train_path)

# Preview the first rows as the cell output.
train_df.head()

Unnamed: 0,id,url,title,target
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",False
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,False
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,False
3,3,colorbox.spb.ru,Не Беси Меня Картинки,False
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,False


In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts ``X[key]`` from its input.

    Parameters
    ----------
    key
        The key used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` addressed by ``self.key``."""
        selected = X[self.key]
        return selected

def _text_branch(column):
    """Build one feature branch: select `column`, count n-grams of length 1 to 3, apply TF-IDF."""
    steps = [
        ('selector', ColumnSelector(key=column)),
        ('vect', CountVectorizer(ngram_range=(1, 3))),
        ('tfidf', TfidfTransformer()),
    ]
    return Pipeline(steps)


# One independent branch per text column.
url = _text_branch('url')
title = _text_branch('title')

# Combine both branches into a single feature step.
feats = FeatureUnion([('url', url), ('title', title)])

# SGD-trained linear classifier with hinge loss and L2 penalty; seeded for reproducibility.
sgd_2feat = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-6,
    random_state=42,
    max_iter=20,
    tol=None,
    learning_rate='optimal',
)

pipeline2features = Pipeline([
    ('features', feats),
    ('classifier', sgd_2feat),
])

In [4]:
# Model inputs are every column except the row id and the label.
model_inputs = train_df.drop(columns=['id', 'target'])
labels = train_df['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    labels,
    test_size=0.2,
    random_state=42,
)

# Last expression of the cell: fit() returns the fitted pipeline, which is displayed.
pipeline2features.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('url',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='url')),
                                                                 ('vect',
                                                                  CountVectorizer(ngram_range=(1,
                                                                                               3))),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('title',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key='title')),
                                                                 ('vect',

In [5]:
# f1_score's documented signature is (y_true, y_pred): ground-truth labels
# first, predictions second. For a binary target the positive-class F1 should
# be the same in either order (false positives and false negatives just trade
# places), but the documented order keeps the call correct if an averaging
# option or another metric is substituted later.
f1_score(y_test, pipeline2features.predict(X_test))

0.9702613397416642

In [6]:
# Single-input variant: one vectoriser over a combined text string per row,
# with the same classifier settings as the two-branch pipeline.
sgd_1feat = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-6,
    random_state=42,
    max_iter=20,
    tol=None,
    learning_rate='optimal',
)

pipeline1feature = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('classifier', sgd_1feat),
])

# One string per row: "<title> ; <url>".
combined_text = train_df['title'] + ' ; ' + train_df['url']

# NOTE(review): this rebinds X_train / X_test / y_train / y_test from the
# earlier split and holds out 33% instead of 20%, so the F1 reported for this
# pipeline is computed on a different set of rows than the one reported for
# pipeline2features — confirm whether that is intended before comparing them.
X_train, X_test, y_train, y_test = train_test_split(
    combined_text,
    train_df['target'],
    test_size=0.33,
    random_state=42,
)

# Last expression of the cell: fit() returns the fitted pipeline, which is displayed.
pipeline1feature.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()),
                ('classifier',
                 SGDClassifier(alpha=1e-06, max_iter=20, random_state=42,
                               tol=None))])

In [7]:
# f1_score's documented signature is (y_true, y_pred): ground-truth labels
# first, predictions second. For a binary target the positive-class F1 should
# be the same in either order (false positives and false negatives just trade
# places), but the documented order keeps the call correct if an averaging
# option or another metric is substituted later.
f1_score(y_test, pipeline1feature.predict(X_test))

0.9745507616528322

# Test Data

In [8]:
# Read the unlabelled test set from the notebook's working directory.
test_path = "test.csv"
test_df = pd.read_csv(test_path)

# Preview the first rows as the cell output.
test_df.head()

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67


In [9]:
# Build the model input in the same "<title> ; <url>" form used for training.
# A missing title or url would turn the whole concatenated string into NaN,
# which CountVectorizer rejects as an invalid document, so missing values are
# replaced with empty strings first. Rows without missing values are unaffected.
test_text = test_df['title'].fillna('') + ' ; ' + test_df['url'].fillna('')

# Store the predictions as booleans in a new 'target' column.
test_df['target'] = pipeline1feature.predict(test_text).astype(bool)

# Write only the id and the predicted target, without the DataFrame index.
test_df[['id', 'target']].to_csv("predict.csv", index=False)