In [1]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score

In [2]:
import numpy as np
import pandas as pd
import os
import scipy as sp
from tqdm import tqdm_notebook
import pickle

In [3]:
from nltk.corpus import movie_reviews
import nltk
# Fetch the NLTK resources this notebook asks for. Re-running is harmless:
# nltk.download reports "already up-to-date" when a package is present.
# A loop replaces three copy-pasted calls and, unlike a bare call on the
# cell's last line, does not echo a stray `True` as the cell output.
for resource in ('movie_reviews', 'punkt', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/alexey/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/alexey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alexey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# File ids of the movie_reviews corpus, split by sentiment category.
posids = movie_reviews.fileids(categories='pos')
negids = movie_reviews.fileids(categories='neg')

In [5]:
# For every file id, pull that single review's word sequence from the corpus;
# list order follows posids / negids.
posfeats = [movie_reviews.words(fileids=[review_id]) for review_id in posids]
negfeats = [movie_reviews.words(fileids=[review_id]) for review_id in negids]

In [6]:
# Labels: 1 for every positive review, 0 for every negative one, laid out in
# the same pos-then-neg order as the review list below.
pos_labels = [1] * len(posfeats)
neg_labels = [0] * len(negfeats)
all_classes = np.array(pos_labels + neg_labels)

all_reviews = posfeats + negfeats
# Collapse each token sequence into one space-separated string.
reviews_better = np.array(list(map(' '.join, all_reviews)))

In [7]:
# Directory (relative path) that holds the product-review TSV files.
PATH_TO_DATA = 'product-reviews-sentiment-analysis-light'
# File names of the train / test splits inside PATH_TO_DATA.
TRAIN, TEST = 'products_sentiment_train.tsv', 'products_sentiment_test.tsv'

In [8]:
# Training split: the TSV is read with header=None, so the two columns are
# named by hand afterwards.
train_path = os.path.join(PATH_TO_DATA, TRAIN)
df_train = pd.read_csv(train_path, sep='\t', header=None)
df_train.columns = ['text', 'lab']

# Test split: read with its header row; the 'Id' column becomes the index.
test_path = os.path.join(PATH_TO_DATA, TEST)
df_test = pd.read_csv(test_path, sep='\t', index_col='Id')

In [9]:
# Training texts and labels: movie reviews followed by the product-review
# training rows.
X = np.concatenate((reviews_better, df_train['text']))
y = np.concatenate((all_classes, df_train['lab']))
# The same texts plus the texts of the test split.
corpus = np.concatenate((reviews_better, df_train['text'], df_test['text']))

In [19]:
# Fit a TF-IDF vocabulary on the movie-review texts only, dropping
# scikit-learn's built-in English stop words.
# NOTE(review): the pickle name says "Bigram", but no ngram_range is passed, so
# the vectorizer uses its default n-gram setting — confirm which is intended.
vec = TfidfVectorizer(stop_words='english')
vec.fit(reviews_better)

# Make sure the output directory exists, so a fresh checkout does not fail
# with FileNotFoundError when the pickle is opened for writing.
os.makedirs('./simple_demo', exist_ok=True)
with open('./simple_demo/BigramUnprocessedVectorizer.pkl', 'wb') as f:
    pickle.dump(vec, f)

In [17]:
# Class balance of the combined label vector y.
label_counts = pd.Series(y).value_counts()
label_counts

1    2274
0    1726
dtype: int64

In [20]:
# Vectorise the movie reviews with the already-fitted TF-IDF vocabulary.
X_transformed = vec.transform(reviews_better)

# probability=True makes SVC estimate probabilities through an internal,
# randomly shuffled cross-validation; pinning random_state keeps the saved
# model reproducible across re-runs.
# NOTE(review): the model is trained on the movie reviews only, not on the
# combined X / y built earlier — confirm that is intended.
est = SVC(probability=True, random_state=42)
est.fit(X_transformed, all_classes)

# Make sure the output directory exists before writing the pickle.
# NOTE(review): the file name says "Logistic" / "Bigram" although the estimator
# is an SVC; the name is left unchanged because other code may load it by path.
os.makedirs('./simple_demo', exist_ok=True)
with open('./simple_demo/DefaultLogisticBigramUnprocessedTextSentiment.pkl', 'wb') as f:
    pickle.dump(est, f)