In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
import vectorizer 

In [2]:
# Category names; each one is read as a column of the training frame further down.
class_names = [
    'Adult', 'Arts', 'Business', 'Computers', 'Games',
    'Health', 'Home', 'Kids', 'News', 'Recreation',
    'Reference', 'Science', 'Shopping', 'Society', 'Sports',
]

# Load both splits; every missing cell is replaced with a single space.
train = pd.read_csv('./train_set_preprocessed.csv').fillna(' ')
test = pd.read_csv('./dev_set_preprocessed.csv').fillna(' ')

# The 'URL' column is the only text input used for feature extraction.
train_text, test_text = train['URL'], test['URL']
all_text = pd.concat([train_text, test_text])

# The word-level vectorizer is built from the combined train + test text
# (presumably fitted inside the project-local helper -- verify in `vectorizer`),
# then each split is encoded with it.
word_vectorizer = vectorizer.word_vectorizer(all_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

In [3]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): %connect_info prints the running kernel's connection details
# (ports and signing key) into the saved output. This looks like leftover
# debugging -- consider removing it and clearing the output before sharing.
%connect_info

{
  "stdin_port": 36573, 
  "ip": "127.0.0.1", 
  "control_port": 50571, 
  "hb_port": 59597, 
  "signature_scheme": "hmac-sha256", 
  "key": "648f5df7-9ef54dbe94c8e4770052411d", 
  "kernel_name": "", 
  "shell_port": 45333, 
  "transport": "tcp", 
  "iopub_port": 59877
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-f209534c-0e15-423e-b918-0b080f694f16.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [5]:
# Build the character-level vectorizer from the combined train + test text,
# then encode each split with it (train first, then test).
char_vectorizer = vectorizer.char_vectorizer(all_text)

train_char_features, test_char_features = (
    char_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

In [6]:
# Stack the char-level and word-level features column-wise into one design
# matrix per split.
# Without an explicit format scipy picks the result format itself (commonly COO,
# which cannot be row-sliced). Asking for CSR here does the conversion once,
# instead of leaving the downstream scikit-learn calls to convert the matrix
# again for each class.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

# Accumulators filled by the per-class training loop: one CV score per class, and
# one probability column per class keyed by the test set's 'Number' column.
scores = []
submission = pd.DataFrame.from_dict({'Number': test['Number']})

In [7]:
# Fixed seed for the stochastic 'sag' solver so that CV scores and the written
# submission are repeatable across runs.
RANDOM_STATE = 42

# Start from an empty list in this cell as well: otherwise re-running the cell
# appends a second set of per-class scores and the reported total averages
# duplicated entries.
scores = []

# One independent logistic-regression model per category column.
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=RANDOM_STATE)

    # 3-fold cross-validated ROC AUC on the training split, averaged over folds.
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training split, then store the second predict_proba column
    # (index 1) as this class's column in the submission frame.
    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)

CV score for class Adult is 0.923463373911


CV score for class Arts is 0.874769177957


CV score for class Business is 0.885247566422


CV score for class Computers is 0.878913032638


CV score for class Games is 0.904674950149


CV score for class Health is 0.882238105877


CV score for class Home is 0.874479643141


CV score for class Kids is 0.864692258752


CV score for class News is 0.848291505741


CV score for class Recreation is 0.848035770681


CV score for class Reference is 0.897966055986


CV score for class Science is 0.875926187227


CV score for class Shopping is 0.87343073994


CV score for class Society is 0.86673920193


CV score for class Sports is 0.912281748043


Total CV score is 0.880743287893


In [15]:
# Spot-check of the predictions. `classifier` and `class_name` are whatever the
# per-class training loop left behind, i.e. the model and name of the last class
# it processed, so this cell only makes sense after that loop has run.
last_class_proba = classifier.predict_proba(test_features)
temp = last_class_proba[:, 1]

print(class_name)
# Show the probabilities for rows 15 through 19 of the test split.
print(temp[15:20])

Sports
[0.3607101  0.01389347 0.05369641 0.02034296 0.02162324]
