## Unite all data

In [83]:
import pandas as pd
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk
import multiprocessing
from sklearn.metrics import accuracy_score, f1_score

# CPU count, reused further down as the Doc2Vec worker count.
cores = multiprocessing.cpu_count()

# Two raw sources: the labelled positives and the case files.
positives = pd.read_csv('Data/positive_combined_withreal.csv')
cases = pd.read_csv('Data/case_files.csv')

# Drop spaces inside the class labels of the positives.
positives['class'] = positives['class'].str.replace(' ', '')

# Rename the case-file columns to the `class` / `complain` names used from here on.
cases_clean = pd.DataFrame({
    'class': cases['Case Type'],
    'complain': cases['Case Summary'],
})

# Stack both sources row-wise, then upper-case the first character of every label.
df = pd.concat([positives, cases_clean], axis=0)
df['class'] = df['class'].map(lambda label: label[0].upper() + label[1:])
df['class'].unique()

array(['Notrelated', 'Labor', 'Adult', 'Minor'], dtype=object)

## Text preprocessing

In [84]:
# Fetch the 'punkt' tokenizer data (presumably what sent_tokenize/word_tokenize
# in tokenize_text rely on; the recorded log reports it as already up-to-date).
nltk.download('punkt')
# NOTE(review): `stopwords` is not referenced by any cell visible in this
# notebook — confirm whether stop-word filtering was intended or drop the import.
from nltk.corpus import stopwords
def tokenize_text(text):
    """Return the lowercase word tokens of `text`.

    The text is split into sentences and then into words with nltk; tokens
    shorter than two characters are skipped.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

[nltk_data] Downloading package punkt to /Users/maria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [85]:
# Show complete complaint texts when frames are displayed. `None` removes the
# column-width limit; the former `-1` sentinel has been deprecated since
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Keep only ASCII letters, digits and spaces in every cell. Values are
# stringified first, so non-string cells are cleaned as their str() form.
# NOTE(review): newer pandas deprecates DataFrame.applymap in favour of
# DataFrame.map — switch once the minimum supported pandas version allows it.
df = df.applymap(lambda x: re.sub("[^a-zA-Z0-9 ]", '', str(x)))

# Remove the literal artefacts 'notrelated' / 'nannan' from the complaint text.
df['complain'] = df['complain'].apply(lambda x: re.sub('notrelated|nannan', '', x))

# Shuffle the rows. The seed makes the order — and therefore the seeded
# train/test split performed later — repeatable between kernel restarts.
df = df.sample(frac=1, random_state=42)

In [86]:
# Peek at the first ten rows labelled 'Notrelated' after cleaning/shuffling.
df.loc[df['class'].eq('Notrelated')].head(10)

Unnamed: 0,class,complain
143,Notrelated,Frequent errors with overtime pay Operations managers are constantly changing and sometimes send unqualified officers to client sites Theres not much room for career growth I felt like my hard work and contributions were ignored by the company and that my managers attention was focused on other accounts
189,Notrelated,Certain managers in the midlands are very sly and big gossips they all talk about employees behind their back You cant confide in your manager without the rest of your team knowing everything Some managers need to actually pick up the phone and call new clients instead of resting on a few clients who theyve poached from previous roles
164,Notrelated,Upward mobility was somewhat limited
85,Notrelated,Everybody have to work on the weekends
281,Notrelated,You get paid minimum wage get lied to about raises and promotions on a constant basis for no apparent reason theres an ever increasing gap in communication between retail and corporate the CEO feels the need to walk into your store unannounced to yell at you about random things he wants to change theres no flexibility in scheduling whatsoever they understaff stores and force people to work two per shift no matter how heavy traffic can be
191,Notrelated,Like any company there are lots of transitions and growing pains Sometimes there are internal conflicts so you have to think different and be flexible to see what you can do to achieve your goals Take the initiative create actionable plans be resourceful and stay positive to keep yourself on track
61,Notrelated,I really dont have any cons
260,Notrelated,Management in the past has been very resistant to change and very inefficient This I assure you has been changed Recruiter positions currently do not offer lots of room for growth As a member of management I assure you that this will also change We are aware of this
157,Notrelated,Micro management and zero respect for work life balance
84,Notrelated,Early days Late nights Difficult helpers Misplaced packages


In [87]:
# Hold out 30% of the rows for testing.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Turn every row of each split into a TaggedDocument: the tokenized complaint
# as words, the class label as the single tag.
train_tagged, test_tagged = (
    part.apply(
        lambda row: TaggedDocument(
            words=tokenize_text(row['complain']),
            tags=[row['class']],
        ),
        axis=1,
    )
    for part in (train, test)
)


In [88]:
# Inspect one tagged training document (position 30) as a sanity check.
train_tagged.iloc[30]

TaggedDocument(words=['management', 'can', 'not', 'stress', 'this', 'enough', 'they', 'hire', 'desperate', 'transplants', 'who', 'would', 'rather', 'try', 'and', 'be', 'friend', 'to', 'only', 'get', 'you', 'fired', 'very', 'weird', 'creepy', 'and', 'handsy', 'if', 'not', 'that', 'the', 'managers', 'were', 'always', 'absent', 'on', 'peak', 'days', 'or', 'would', 'over', 'staff', 'on', 'slow', 'days', 'there', 'was', 'no', 'team', 'building', 'managers', 'would', 'only', 'play', 'into', 'the', 'games', 'and', 'trash', 'talk', 'they', 'also', 'punished', 'people', 'for', 'write', 'ups', 'to', 'get', 'them', 'fired', 'when', 'they', 'knew', 'who', 'was', 'stealing', 'to', 'add', 'if', 'they', 'would', 'stay', 'to', 'close', 'these', 'things', 'wouldnt', 'happen', 'also', 'to', 'add', 'there', 'was', 'no', 'way', 'to', 'voice', 'concerns', 'people', 'would', 'fake', 'care', 'and', 'tell', 'whomever', 'is', 'the', 'issue', 'to', 'cause', 'further', 'problems', 'rather', 'than', 'resolve', 'i

## Training the model

In [89]:
# Doc2Vec configured with dm=0 (the DBOW variant, per the variable name),
# 300-dimensional vectors and negative sampling instead of hierarchical softmax.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)

# Build the vocabulary from the tagged training documents.
model_dbow.build_vocab(list(train_tagged.values))

In [90]:
# Train for 30 epochs over the tagged training documents.
model_dbow.train(
    train_tagged.values,
    total_examples=len(train_tagged),
    epochs=30,
)

In [91]:
 def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [92]:
# Labels come back first, feature vectors second — hence the y/X order.
y_train, X_train = vec_for_learning(model=model_dbow, tagged_docs=train_tagged)
y_test, X_test = vec_for_learning(model=model_dbow, tagged_docs=test_tagged)

In [93]:
# Fit the classifier on the inferred training vectors (fit returns the
# estimator itself), then label the held-out vectors.
logreg = LogisticRegression(C=1e5, n_jobs=1).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [94]:
# Report held-out performance: plain accuracy, then F1 computed with
# average='weighted'.
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.6298850574712643
Testing F1 score: 0.6128767626559046


## Test from a different source

https://www.theguardian.com/global-development/2017/jul/29/slept-floor-flat-near-harrods-stories-modern-slavery

In [95]:
# Out-of-sample excerpts used to probe the classifier (see the article linked above).
# text1: overseas domestic work — underpayment and no days off.
text1='When my husband became very sick and couldn’t work, I used an employment agency to find me work abroad. I was sent to Qatar, but the family were cheating me, paying me less than agreed in my contract and refusing to give me a day off. I called the agency in the Philippines for help, but they never answered. I had to send money back home to pay for food, school fees and medicine. I fought with my employer about my salary, but he would say Your contract is just a piece of paper'

# text2: gang coercion into selling sex and drugs.
text2='I worked 12-hour shifts and finished every day at 6pm, the same time that the gang curfew in our city came into effect. There are two main gangs in the area, and anyone on the streets after the curfew becomes a target. Every day I thought might be my last. One evening, my co-worker and I were walking to the bus stop when three gang members stopped and said we’d have to sell sex and drugs for them. “We’re not asking you,” they said. “We’re giving you an order.” They let us go, but I was terrified. The next night after work, they were there, waiting for us. “Time’s up,” they said, and they forced us into a car at gunpoint'

# text3: deceptive recruitment ending in work on a fishing boat.
text3='One of my friends in the village said he and a few others were leaving to find work. The next day we all got a taxi and headed for Thailand. We were met by a man who said we could work on his cassava farm, earning $130 (£99) a month each, with room and board included. We worked seven days a week, morning until night, for a month, until one evening a Thai man asked how much we were earning. He offered us $200 a month to work on a construction site, but said we’d have to move to Thailand We were confused. Weren’t we already in Thailand? It turned out we were still in Cambodia, and the farmer had already fled without giving us any wages. We were left with no choice but to accept the deal and smuggle ourselves over the border. The man said we’d be charged for being driven to the construction site, but that it could be deducted from our first month’s wages. It was a long, uncomfortable drive in a pickup, and when we finally stopped, we saw that we weren’t at a construction site, but a busy sea port. The broker said the building site had closed, so he’d arranged for us to work on a fishing boat instead'


In [96]:
def check_independent(text):
    """Print the class predicted for a free-form piece of text.

    The text is stringified, stripped of everything except ASCII letters,
    digits and spaces, tokenized, embedded with the notebook-level
    `model_dbow` and classified with the notebook-level `logreg`.
    Nothing is returned; the prediction is printed.
    """
    cleaned = re.sub("[^a-zA-Z0-9 ]", '', str(text))
    vector = model_dbow.infer_vector(tokenize_text(cleaned), steps=20)
    prediction = logreg.predict([vector])
    print(prediction)

In [97]:
# Classify the third excerpt (the fishing-boat account).
check_independent(text=text3)

['Labor']


In [98]:
# Classify the first excerpt (the overseas domestic-work account).
check_independent(text=text1)

['Labor']


In [99]:
# Classify the second excerpt (the gang-coercion account).
check_independent(text=text2)

['Minor']


In [102]:
import pickle

# Persist the trained Doc2Vec model together with the classifier as one list.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() used previously left the handle dangling.
with open('word2vecworking1.pickle', 'wb') as model_file:
    pickle.dump([model_dbow, logreg], model_file)