In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV export into a DataFrame and preview its first row.
df = pd.read_csv(filepath_or_buffer='../../tribe_dynamics_data.csv')
df.head(n=1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,brand_id,worker_id,mturker,post_hash,answer,date,duration_seconds,text,model_decision,timestamped_model,lang
0,0,0,18792,TRIBE_103_allayne.low@gmail.com,False,3ca62dcea583b7aa204fc52fe6b2826c,False,2017-07-16,,・\r\nケイト スペード ニューヨークの\r\n2017Fall Collectionに招...,False,,ja


In [3]:
# Remove the two leftover 'Unnamed' columns.
# `df` is rebound to the result instead of being mutated with inplace=True,
# and errors='ignore' skips labels that are already gone, so re-running this
# cell no longer raises a KeyError.
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, errors='ignore')
df.head(1)

Unnamed: 0,brand_id,worker_id,mturker,post_hash,answer,date,duration_seconds,text,model_decision,timestamped_model,lang
0,18792,TRIBE_103_allayne.low@gmail.com,False,3ca62dcea583b7aa204fc52fe6b2826c,False,2017-07-16,,・\r\nケイト スペード ニューヨークの\r\n2017Fall Collectionに招...,False,,ja


In [4]:
# Keep only the five columns of interest and preview the first three rows.
df_small = df.loc[:, ['text', 'lang', 'model_decision', 'answer', 'mturker']]
df_small.head(n=3)

Unnamed: 0,text,lang,model_decision,answer,mturker
0,・\r\nケイト スペード ニューヨークの\r\n2017Fall Collectionに招...,ja,False,False,False
1,青春日記【挨拶のハグじゃダメなの】\r\n僕が中学3年生の時だ。\r\n高山私立松倉中学校と...,ja,False,False,False
2,fabric&thingsで本日のみ開催しております\r\nいちごのプチレストラン La F...,ja,False,False,False


In [5]:
# Keep the rows whose `lang` value is exactly 'en'.
# NOTE(review): the marker below presumably flags 'en' as the value to edit
# when targeting another language -- confirm.
english_df = df_small.loc[df_small['lang'].eq('en')]  #CHANGE HERE
english_df.head(n=3)

Unnamed: 0,text,lang,model_decision,answer,mturker
67,"(near) PORTLAND, OR: Shye Girl and Joey are at...",en,False,False,False
68,Remember 16 year old Daisy (center) who was fo...,en,False,False,False
69,"(near) PORTLAND, OR: 7 year old Maia was so cl...",en,False,False,False


In [6]:
# Report how many rows survived the language filter.
print('Dataset contains {} English posts'.format(len(english_df)))

Dataset contains 9426 English posts


In [7]:
# Pull the `text` column out as a NumPy array.
texts_arr = english_df['text'].values

In [8]:
# np.unique returns the sorted distinct values, so identical texts collapse to a single entry.
unique_texts_arr = np.unique(texts_arr)

In [9]:
# Report the number of distinct text values among the English rows.
print('Dataset contains {} unique English posts'.format(unique_texts_arr.size))

Dataset contains 2396 unique English posts


In [10]:
# Drop rows that are identical across *every* column (the default when no
# `subset` is given); a new frame is returned and `english_df` is left untouched.
# NOTE(review): because all columns are compared, the same text can survive
# several times when its other columns differ (the row count printed after
# this cell is higher than the unique-text count printed before it).
# TODO confirm this is intended rather than de-duplicating on `text` alone.
english_no_dup_df = english_df.drop_duplicates()

In [11]:
# Report the row count after removing fully identical rows.
print('No duplicates dataset contains {} English posts'.format(len(english_no_dup_df)))

No duplicates dataset contains 2610 English posts


# Bag of words

In [12]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Module-level Snowball stemmer for English; the analyzer built below calls it for each token.
english_stemmer = nltk.stem.SnowballStemmer('english')


class ExtendedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that strips ASCII digits and stems every token.

    The characters 0-9 are deleted from each raw document before the inherited
    preprocessor runs, and every token produced by the inherited analyzer is
    passed through the module-level ``english_stemmer``.
    """

    def build_preprocessor(self):
        """Return a callable that deletes the characters 0-9 and then applies the parent preprocessor."""
        base_preprocess = super().build_preprocessor()
        # Translation table that maps each ASCII digit to "deleted".
        digit_table = str.maketrans('', '', '1234567890')

        def strip_digits_then_preprocess(doc):
            return base_preprocess(doc.translate(digit_table))

        return strip_digits_then_preprocess

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems each resulting token."""
        base_analyze = super().build_analyzer()

        def stemmed_tokens(doc):
            return list(map(english_stemmer.stem, base_analyze(doc)))

        return stemmed_tokens

# Build a binary (presence/absence) bag-of-words matrix over the stemmed tokens.
# max_df=0.5 ignores terms found in more than half of the documents, min_df=2
# ignores terms found in fewer than two documents, and stop_words='english'
# applies scikit-learn's built-in English stop-word list.
count_vectorizer = ExtendedCountVectorizer(max_df=0.5, min_df=2, analyzer='word', stop_words='english', binary=True)
vectorized = count_vectorizer.fit_transform(english_no_dup_df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# toarray() is the documented way to densify a SciPy sparse result; the `.A`
# shorthand is deprecated and not available on every sparse container.
stemmed_bag_of_words = pd.DataFrame(vectorized.toarray(), columns=feature_names)

print('The process generated {} bag-of-words variables'.format(len(stemmed_bag_of_words.columns)))

The process generated 13776 bag-of-words variables


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

# Feature names are every column of the bag-of-words frame; labels come from the `answer` column.
bin_variables = list(stemmed_bag_of_words.columns)
bin_data = stemmed_bag_of_words.loc[:, bin_variables]
bin_labels = english_no_dup_df['answer']

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold

# Fit a multinomial Naive Bayes model on the full dataset (fit() returns the
# estimator itself). cross_val_score fits a fresh clone per fold, so the CV
# scores below do not depend on this full-data fit.
# NOTE(review): the bag-of-words vocabulary was built from all rows before the
# folds are split, so the held-out folds influenced feature selection -- confirm
# whether the vectorizer should be moved inside the cross-validation.
bin_clf = MultinomialNB().fit(bin_data, bin_labels)
bin_scores = cross_val_score(
    estimator=bin_clf,
    X=bin_data.values,
    y=bin_labels,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234),
)

In [17]:
# Summarise the per-fold scores as mean +/- (population) standard deviation.
print('Bag of Words -> 10-fold CV Accuracy {:.2f} +/- {:.2f}'.format(bin_scores.mean(), bin_scores.std()))

Bag of Words -> 10-fold CV Accuracy 0.86 +/- 0.02
