I wanted to keep the lecture notebook and seperate out the wine stuff here since it upset me that we were
re-setting variable names each time we changed data sets. And also to avoid running the newsgroups cells
since that took a long time. 

In [2]:
# Import Statements
from sklearn.pipeline import Pipeline 
from sklearn.datasets import fetch_20newsgroups # demo data set 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd 
import spacy
nlp = spacy.load("en_core_web_lg")


In [3]:
# load the data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train.head(1)

Unnamed: 0,id,description,ratingCategory
0,1321,"\nSometimes, when whisky is batched, a few lef...",1


In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

# Brute Force Run on TfidVectorizer

In [5]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3), tokenizer=tokenize)
clf = RandomForestClassifier()
pipe = Pipeline([('vect', vect), ('clf', clf)])
target = 'ratingCategory'
features = 'description'
X_train = train[features]
y_train = train[target]

# model and vectorizer parameters
pipe_params = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (0.02, 0.05),
    'vect__max_features': (500,1000),
    'clf__n_estimators': (5,10),
    'clf__max_depth':(5,10,15,20)
}

In [6]:
X_train.head()

0    \nSometimes, when whisky is batched, a few lef...
1    \nAn uncommon exclusive bottling of a 6 year o...
2    \nThis release is a port version of Amrut’s In...
3    \nThis 41 year old single cask was aged in a s...
4    \nQuite herbal on the nose, with aromas of dri...
Name: description, dtype: object

In [None]:
# grid search
grid_search = GridSearchCV(pipe, pipe_params, cv=3, n_jobs=8, verbose=1)
grid_search.fit(X_train, y_train)
grid_search.best_score_

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


In [5]:
# random search, working noticeably faster
random_search = RandomizedSearchCV(pipe, pipe_params, cv=3, n_jobs=8, verbose=1)
random_search.fit(X_train, y_train)
random_search.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed:    9.1s finished


0.714705162711035

In [6]:
random_search.predict(test['description'])

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

# LSI SVD 

In [7]:
# instantiate SVD, removed algorithm arg since it defaults to whatever is most efficient
svd = TruncatedSVD(n_components=100, n_iter=3)

svd_params = {
    'lsi__svd__n_components': [10,100,250],
    'lsi__vect__max_df':[.9, .95, 1.0],
    'clf__n_estimators':[5,10,20]
}

# create two seperate pipelines
lsi_pipe = Pipeline([('vect', vect), ('svd', svd)])
svd_pipe = Pipeline([('lsi', lsi_pipe), ('clf', clf)])

# LSI SVD GRID SEARCH

In [8]:
svd_grid_search = GridSearchCV(svd_pipe, svd_params, cv=3, n_jobs=4, verbose=1)
svd_grid_search.fit(X_train, y_train)
svd_grid_search.best_score_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done  81 out of  81 | elapsed:  7.9min finished


0.7191093711769023

# LSI SVD RANDOM SEARCH

In [9]:
svd_random_search = RandomizedSearchCV(svd_pipe, svd_params, cv=3, n_jobs=4, verbose=1)
svd_random_search.fit(X_train, y_train)
svd_random_search.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  3.4min finished


0.7188646929287986

# Spacy Word Embeddings

In [10]:
test = nlp("And we are never ever ever, getting back together")
taylor_swift_vector = test.vector
print(len(taylor_swift_vector))

300


In [11]:
def get_word_vectors(docs: iter):
    return [nlp(doc).vector for doc in docs]

In [12]:
# fit word vectors onto model 
from scipy.stats import uniform

X_train_word_vectors = get_word_vectors(train['description'])
X_test_word_vectors = get_word_vectors(test['description'])
rfc_params = {
            'n_estimators': [200, 700],
            'max_features': ['auto', 'sqrt', 'log2']
                     }
classifier = RandomForestClassifier()


TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
# need to vectorize in order to run predict so use original pipe is fine
# running the pipe breaks .fit here 
spacy_random = RandomizedSearchCV(classifier, rfc_params, n_jobs=4, verbose=1, cv=5)    
spacy_random.fit(X_train_word_vectors, y_train)
spacy_random.best_score_

In [None]:
spacy_random.predict(X_test_word_vectors)

In [None]:
spacy_grid = GridSearchCV(classifier, rfc_params, n_jobs=4, verbose=1, cv=5)
spacy_grid.fit(X_train_word_vectors, y_train)
spacy_grid.best_score_

In [None]:
spacy_grid.predict(X_test_word_vectors)

In [None]:
#type(svd_random_search)

In [None]:
#type(spacy_grid)

# Function to create Submissions

In [None]:
test = pd.read_csv('./data/test.csv')

In [None]:
test.head(1)

In [None]:
test["description"].dtype

In [None]:
def create_submission(search, subNumber):
    pred = search.predict(test['description'])
    submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred})
    submission['ratingCategory'] = submission['ratingCategory'].astype('int64')
    assert(submission.shape == (1022, 2))
    submission.to_csv(f'./data/submission{str(subNumber)}.csv', index=False)

In [None]:
# refactor above to take a third argument and condense these 2 functions

def create_transformed_sub(search, subNumber):
    pred = search.predict(X_test_word_vectors)
    submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred})
    submission['ratingCategory'] = submission['ratingCategory'].astype('int64')
    assert(submission.shape == (1022, 2))
    submission.to_csv(f'./data/submission{str(subNumber)}.csv', index=False)
    

In [None]:
create_submission(grid_search, 1)

In [None]:
create_submission(random_search, 2)

In [None]:
create_submission(svd_grid_search, 3)

In [None]:
create_submission(svd_random_search, 99)

In [None]:
create_transformed_sub(spacy_grid, 7)

In [None]:
create_transformed_sub(spacy_random, 9)

In [None]:
test.dtypes