## Doc2vec 
This example comes from the following notebook: https://github.com/susanli2016/NLP-with-Python/blob/master/Doc2Vec%20Consumer%20Complaint_3.ipynb

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the complaints, keep only the free-text narrative and its product
# label, drop rows that have no narrative, and give the text column a
# shorter name.
df = (
    pd.read_csv('Consumer_Complaints.csv')
    .loc[:, ['Consumer Complaint', 'Product']]
    .dropna(subset=['Consumer Complaint'])
    .rename(columns={'Consumer Complaint': 'narrative'})
)
df.head(10)

Unnamed: 0,narrative,Product
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
7,An account on my credit report has a mistaken ...,Credit reporting
12,This company refuses to provide me verificatio...,Debt collection
16,This complaint is in regards to Square Two Fin...,Debt collection
25,Started the refinance of home mortgage process...,Mortgage
26,"In XXXX, I and my ex-husband applied for a ref...",Mortgage
28,I have disputed several accounts on my credit ...,Credit reporting
29,Mortgage was transferred to Nationstar as of X...,Mortgage
36,"Was a happy XXXX card member for years, in lat...",Credit card


In [3]:
from bs4 import BeautifulSoup
def cleanText(text):
    """Normalise one complaint narrative before tokenisation.

    Steps, in order: strip HTML markup with BeautifulSoup, turn ``|||``
    separators into spaces, replace URLs with a placeholder token, remove
    runs of capital X, and lowercase the result.

    Args:
        text: Raw narrative string.

    Returns:
        The cleaned, lowercased narrative.
    """
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    # Remove only runs of capital X (e.g. "XXXX", "XX/XX/XXXX" -- presumably
    # the dataset's redaction masks) and do it BEFORE lowercasing. Deleting
    # every lowercase "x" afterwards, as the previous version did, also
    # mangled ordinary words ("experian" -> "eperian", "tax" -> "ta").
    text = re.sub(r'X{2,}', '', text)
    # Lowercasing also turns the URL placeholder into "<url>".
    return text.lower()
# Clean every narrative. This overwrites the column, so re-running the cell
# passes already-cleaned text through cleanText a second time.
df['narrative'] = df['narrative'].apply(cleanText)



In [5]:
# Spot-check the cleaned narratives (pandas abbreviates the Series display).
df['narrative']


1          i have outdated information on my credit repor...
2          i purchased a new car on  . the car dealer cal...
7          an account on my credit report has a mistaken ...
12         this company refuses to provide me verificatio...
16         this complaint is in regards to square two fin...
                                 ...                        
1025002    our son was taken to       on  , 2012 as an er...
1025003    on //13, without my authorization, bank of ame...
1025006    i had an account with  in // this was previous...
1025007    i was contacted on // email by  from caliber h...
1025009    i had a debit that was included in my chapter ...
Name: narrative, Length: 277814, dtype: object

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=42)


In [7]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    """Split ``text`` into lowercase word tokens.

    The text is first split into sentences and each sentence into words
    with NLTK; tokens shorter than two characters are discarded.

    Args:
        text: The string to tokenise.

    Returns:
        A list of lowercase tokens in their original order.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

In [8]:
def _tag_row(row):
    """Wrap one DataFrame row as a TaggedDocument tagged with its Product."""
    return TaggedDocument(words=tokenize_text(row['narrative']), tags=[row.Product])


# Same conversion for both splits.
train_tagged = train.apply(_tag_row, axis=1)
test_tagged = test.apply(_tag_row, axis=1)


KeyboardInterrupt: 

In [None]:
import multiprocessing

# CPU count reported by multiprocessing; passed to Doc2Vec as `workers`.
cores = multiprocessing.cpu_count()

In [None]:
# Doc2Vec with dm=0, 300-dimensional vectors, negative sampling (5 noise
# words, no hierarchical softmax), and no down-sampling of frequent words.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
# Materialise the tagged documents (with a progress bar) and build the vocabulary.
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

In [None]:
%%time
# Train with ONE call and let gensim decay the learning rate linearly from
# `alpha` to `min_alpha` across all 30 epochs.
#
# The previous hand-rolled schedule (30 single-epoch calls with
# `alpha -= 0.002` after each) drives alpha below zero after about 13 passes
# when starting from gensim's default of 0.025, so the later passes trained
# with a negative learning rate and left `alpha`/`min_alpha` negative on the
# model for any later inference.
#
# The corpus is shuffled once and kept as a list so gensim can iterate over
# it again for every epoch.
train_corpus = utils.shuffle(list(train_tagged.values))
model_dbow.train(train_corpus, total_examples=len(train_corpus), epochs=30)

In [None]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer a feature vector for every tagged document.

    Args:
        model: Object exposing ``infer_vector(words, epochs=...)``, such as a
            trained Doc2Vec model.
        tagged_docs: Container whose ``.values`` yields documents with
            ``words`` and ``tags`` attributes (e.g. a pandas Series of
            TaggedDocument).
        epochs: Number of inference passes per document. Defaults to 20,
            the value previously hard-coded.

    Returns:
        A ``(targets, regressors)`` pair of equal-length tuples: the first
        tag of each document and its inferred vector. Both tuples are empty
        when there are no documents.
    """
    docs = tagged_docs.values
    # zip(*[]) yields nothing, so unpacking into two names would raise.
    if len(docs) == 0:
        return (), ()
    # `epochs` replaces the old `steps` keyword, which gensim 4.0 removed.
    targets, regressors = zip(
        *[(doc.tags[0], model.infer_vector(doc.words, epochs=epochs)) for doc in docs]
    )
    return targets, regressors

In [None]:
# Infer labels and document vectors for the training split, then the test split.
(y_train, X_train), (y_test, X_test) = (
    vec_for_learning(model_dbow, tagged) for tagged in (train_tagged, test_tagged)
)

In [None]:
# Fit a logistic-regression classifier on the inferred training vectors
# (fit() returns the estimator itself), then predict the held-out labels.
logreg = LogisticRegression(C=1e5, n_jobs=1).fit(X_train, y_train)
y_pred = logreg.predict(X_test)