In [1]:
from sklearn.datasets import load_files
import pandas as pd
import datetime
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
from sklearn import preprocessing
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.stem.snowball import SnowballStemmer

%matplotlib inline

In [2]:
trainer = load_files('data')
df = pd.DataFrame()
df['filename'] = trainer.filenames
df['text'] = [open(f, 'r').read() for f in trainer.filenames]
df['label'] = [trainer.target_names[x] for x in trainer.target]
df['author'] = [x.split('__')[0].split('--')[0].split('/')[-1] for x in df['filename']]
df['date'] = pd.to_datetime([x.split('__')[-1].split('-')[-1].split('.txt')[0].strip() for x in df['filename']], 
                            errors='coerce')

df.dropna(inplace=True)

In [3]:
df['date'] = df['date'].astype(np.int64) // 10 ** 9
df['date'] = pd.to_datetime(df['date'], unit='s')

df_active_labels = df[['label','date']].groupby('label').max().sort_values('date',ascending=True).reset_index()
df_active_labels = df_active_labels[df_active_labels.date > datetime.date.today() - datetime.timedelta(1460)]
df = df[df['label'].isin(df_active_labels.label)]

df_large_categories = df.groupby('label').count()[df.groupby('label').count()['filename'] > 5].reset_index()[['label','filename']]
df = df[df['label'].isin(df_large_categories.label)]

In [4]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(df.label)

In [5]:
xtrain, xvalid, ytrain, yvalid = train_test_split(df.text.tolist(), y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1)

In [None]:
stemmer = SnowballStemmer('english')

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    tokens = [i for i in tokens if all(j.isalpha() or j in string.punctuation for j in i)]
    tokens = [i for i in tokens if '/' not in i]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [48]:
vect = CountVectorizer(min_df=3, max_df=0.7, max_features=200000, tokenizer=tokenize,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words='english')

vect.fit(list(xtrain) + list(xvalid))
xtrain_cv = vect.transform(xtrain)
xvalid_cv = vect.transform(xvalid)

In [7]:
tfv = TfidfVectorizer(min_df=3, max_df=0.7, max_features=200000, tokenizer=tokenize,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
            stop_words = 'english')

tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [39]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

I tested the following with C=1 and C=0.5. C=1 won.

In [80]:
from sklearn.linear_model import LogisticRegression

tf_clf = LogisticRegression(C=1)
tf_clf.fit(xtrain_tfv, ytrain)
predictions_tf_clf = tf_clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_tf_clf))

logloss: 3.356 


In [50]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=0.5)
clf.fit(xtrain_cv, ytrain)
predictions_cv_clf = clf.predict_proba(xvalid_cv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_cv_clf))

logloss: 6.769 


  np.exp(prob, prob)


In [51]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(xtrain_tfv.toarray(), ytrain)
predictions_tf_dtc = dtc.predict_proba(xvalid_tfv.toarray())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_tf_dtc))

logloss: 30.848 


In [40]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(xtrain_tfv.toarray(), ytrain)
predictions_tf_rfc = rfc.predict_proba(xvalid_tfv.toarray())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions_tf_rfc))

logloss: 22.736 


In [102]:
with open('test_doc.txt','r') as f:
    text = f.read()

tf_test =  tfv.transform([text]) 
text_pred = tf_clf.predict_proba(tf_test)

In [106]:
test_df = pd.DataFrame(columns=['label','probability'])
test_df['label'] = le.inverse_transform(predictions_tf_clf[0].all())[0]
test_df['probability'] = text_pred[0]
test_df.sort_values('probability', ascending=False).reset_index(drop=True)

Unnamed: 0,label,probability
0,Federal Fiscal Policy,0.156129
1,Entitlements,0.058351
2,State and Local Policy,0.057895
3,Public Sector Pensions,0.052431
4,"Spending, Deficits, & Debt",0.047316
5,Taxes,0.031444
6,Economics and Public Policy,0.030491
7,Regulation,0.030060
8,Budget Process,0.028218
9,Regulatory Process Reform,0.025590
