In [11]:
import pickle
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import random

In [2]:
path = '../../data/preprocessed/'
train = pd.read_pickle(path + "train.pkl") # 2838
test = pd.read_pickle(path + "test.pkl") # 715
unlabeled = pd.read_pickle(path+'processed_unlabled.pkl') # 4996817 -> 3000
unlabeled = unlabeled[:3000]
unlabeled['label'] = -1*np.ones(len(unlabeled),dtype='int64')

## Featurization

In [3]:
train_corpus = train['processed_text'].apply(lambda x: ' '.join(x))
test_corpus = test['processed_text'].apply(lambda x: ' '.join(x))
unlab_corpus = unlabeled['processed_text'].apply(lambda x: ' '.join(x))

### Save tf-idf model

In [4]:
X = train_corpus.append(test_corpus).append(unlab_corpus)
X = X.reset_index(drop=True)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2)) #bigram
tf_transformer = vectorizer.fit(X)
mpath = '../../models/'
# Dump the file
pickle.dump(tf_transformer, open(mpath+"bigram_unlabeled_transformer.pkl", "wb"))

### Load tf-idf model

In [5]:
mpath = '../../models/'
tf_transformer = pickle.load(open(mpath+"bigram_unlabeled_transformer.pkl", 'rb'))
# Create new tfidfVectorizer with old vocabulary
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2),vocabulary = tf_transformer.vocabulary_)

In [6]:
train_vec = vectorizer.fit_transform(train_corpus)
test_vec = vectorizer.fit_transform(test_corpus)
unlab_vec = vectorizer.fit_transform(unlab_corpus)

## Classification

### Only labeled data

### 100% labeled data

In [7]:
stack = hstack([train_vec.transpose(),unlab_vec.transpose()]).transpose()

In [8]:
X_train = stack.toarray()
X_test = test_vec.toarray()
y_train = train['label'].append(unlabeled['label'])
y_test = test['label']

In [13]:
c = list(zip(X_train, y_train))
random.shuffle(c)
X_train, y_train = zip(*c)

#### rbf

In [17]:
rbf100 = LabelSpreading(kernel='rbf')
rbf100.fit(X_train , y_train)

LabelSpreading()

In [18]:
rbf100.score(X_test, y_test)

0.5160839160839161

#### knn

In [None]:
knn100 = LabelSpreading(kernel='knn')
knn100.fit(X_train , y_train)

In [None]:
knn100.score(X_test, y_test)

In [None]:
titles = ['Label Spreading 30% data',
          'Label Spreading 50% data',
          'Label Spreading 100% data',
          'SVC with rbf kernel']