In [1]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from random import shuffle
import numpy as np

In [2]:
clf = Pipeline([
    ('vect', HashingVectorizer(alternate_sign=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3))
])

In [3]:
dataset = {"data": [], "target": []}
test_dataset = {"data": [], "target": []}
target_names = [
    'codinghorror', 'dennybritz', 'iamtrask', 'sirajraval'
]

In [4]:
for c, user in enumerate(target_names):
    with open(f'data/{user}_tweets.txt') as f:
        tweets = f.read().split('\n---\n')
        shuffle(tweets)
        
        test_tweets = tweets[:30]
        train_tweets = tweets[30:]
        
        dataset["data"].extend(train_tweets)
        test_dataset["data"].extend(test_tweets)
        
    dataset["target"].extend([c for _ in train_tweets])
    test_dataset["target"].extend([c for _ in test_tweets])

In [5]:
clf.fit(dataset["data"], dataset["target"])

Pipeline(memory=None,
     steps=[('vect', HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 1), non_negative=False,
         norm='l2', pr...'l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False))])

In [6]:
prediction = clf.predict(test_dataset["data"])

print(
    classification_report(
        np.array(test_dataset["target"]), 
        prediction, target_names=target_names
    )
)

              precision    recall  f1-score   support

codinghorror       0.88      0.50      0.64        30
  dennybritz       0.72      0.70      0.71        30
    iamtrask       0.61      0.90      0.73        30
  sirajraval       0.77      0.77      0.77        30

 avg / total       0.75      0.72      0.71       120



In [7]:
from sklearn.model_selection import GridSearchCV

params = {
    'vect__lowercase': (True, False),
    'tfidf__use_idf': (True, False),
}

In [8]:
gs_clf = GridSearchCV(clf, params, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(dataset["data"], dataset["target"])

In [9]:
gs_clf.best_params_

{'tfidf__use_idf': True, 'vect__lowercase': True}