In [2]:
# import libraries
import pandas as pd
import numpy as np
# to transform words in vectors and to build and organize models
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# to process words
# to download nltk packages de-comment next line
# nltk.download() 
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.snowball import ItalianStemmer

In [3]:
# load the labelled posts and show which columns are available
src = pd.read_csv(filepath_or_buffer='../../Data/tribe_dynamics_data.csv')
src.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'brand_id', 'worker_id', 'mturker',
       'post_hash', 'answer', 'date', 'duration_seconds', 'text',
       'model_decision', 'timestamped_model', 'lang'],
      dtype='object')

In [4]:
# recode the label as an integer: 1 where answer equals True, 0 everywhere else
src['answer'] = src['answer'].eq(True).astype(int)

In [5]:
# keep only the label, the text and the language of each post,
# then build one frame per language plus a combined english + italian frame
src = src.loc[:, ['answer', 'text', 'lang']]
df_all = src[src['lang'].isin(['en', 'it'])]
df_ita = src[src['lang'].eq('it')]
df_eng = src[src['lang'].eq('en')]
print(df_all.shape, df_ita.shape, df_eng.shape)

(9895, 3) (469, 3) (9426, 3)


In [8]:
# randomly draw as many english posts as there are italian posts
# random_state (same value as the train/test splits) makes the draw repeatable
# after a kernel restart; without it every run samples different posts
df_eng_reduced = df_eng.sample(n=df_ita.shape[0], random_state=42)
df_eng_reduced.shape

(469, 3)

In [72]:
# split the english posts by label (answer was recoded to 0/1, and 1 == True)
df_eng_true = df_eng[df_eng['answer'] == True]
df_eng_false = df_eng[df_eng['answer'] == False]
# number of italian posts carrying each label
n_ita_true = int((df_ita['answer'] == True).sum())
n_ita_false = int((df_ita['answer'] == False).sum())
# randomly draw the same number of true and false english posts as in the italian dataframe
# random_state makes both draws repeatable after a kernel restart
df_eng_reduced_true = df_eng_true.sample(n=n_ita_true, random_state=42)
df_eng_reduced_false = df_eng_false.sample(n=n_ita_false, random_state=42)
# combine true and false posts of the reduced dataframe
df_eng_reduced_weighted = pd.concat([df_eng_reduced_true, df_eng_reduced_false])
df_eng_reduced_weighted.shape

(469, 3)

In [46]:
# turn every dataframe into a pair of numpy arrays: post texts (X) and labels (Y)
def _text_and_labels(frame):
    """Return the 'text' column and the 'answer' column of `frame` as numpy arrays.

    The labels are converted with dtype "|S6" (fixed-width byte strings).
    """
    texts = np.asarray(frame['text'])
    labels = np.asarray(frame['answer'], dtype="|S6")
    return texts, labels


X_all, Y_all = _text_and_labels(df_all)
X_eng, Y_eng = _text_and_labels(df_eng)
X_eng_reduced, Y_eng_reduced = _text_and_labels(df_eng_reduced)
X_eng_reduced_weighted, Y_eng_reduced_weighted = _text_and_labels(df_eng_reduced_weighted)
X_ita, Y_ita = _text_and_labels(df_ita)

In [63]:
# hold out 40% of every dataset for testing; the fixed seed keeps the splits repeatable
split_opts = dict(test_size=0.4, random_state=42)

(X_train_all, X_test_all,
 Y_train_all, Y_test_all) = train_test_split(X_all, Y_all, **split_opts)

(X_train_eng, X_test_eng,
 Y_train_eng, Y_test_eng) = train_test_split(X_eng, Y_eng, **split_opts)

(X_train_eng_reduced, X_test_eng_reduced,
 Y_train_eng_reduced, Y_test_eng_reduced) = train_test_split(
    X_eng_reduced, Y_eng_reduced, **split_opts)

(X_train_eng_reduced_weighted, X_test_eng_reduced_weighted,
 Y_train_eng_reduced_weighted, Y_test_eng_reduced_weighted) = train_test_split(
    X_eng_reduced_weighted, Y_eng_reduced_weighted, **split_opts)

(X_train_ita, X_test_ita,
 Y_train_ita, Y_test_ita) = train_test_split(X_ita, Y_ita, **split_opts)

In [64]:
# Posts written in English (all of them)
# one row per label: the label value and how many posts carry it
unique, counts = np.unique(Y_eng, return_counts=True)
print(np.array([unique, counts]).transpose())

[[b'0' b'8626']
 [b'1' b'800']]


In [65]:
# Posts written in English, train set: number of posts per label
unique, counts = np.unique(Y_train_eng, return_counts=True)
print(np.array([unique, counts]).transpose())

[[b'0' b'5156']
 [b'1' b'499']]


In [66]:
# Posts written in English, reduced dataframe: number of posts per label
# NOTE: this counts the whole reduced sample (Y_eng_reduced), not only its train split
unique, counts = np.unique(Y_eng_reduced, return_counts=True)
print(np.array([unique, counts]).transpose())

[[b'0' b'428']
 [b'1' b'41']]


In [73]:
# Posts written in English, reduced dataframe with the same number of True and False
# posts as the italian dataframe, train set: number of posts per label
unique, counts = np.unique(Y_train_eng_reduced_weighted, return_counts=True)
print(np.array([unique, counts]).transpose())

[[b'0' b'276']
 [b'1' b'5']]


In [71]:
# Posts written in Italian, train set
# one row per label: the label value and how many posts carry it
unique, counts = np.unique(Y_train_ita, return_counts=True)
print(np.array([unique, counts]).transpose())

[[b'0' b'273']
 [b'1' b'8']]


In [74]:
# Bag of words for english posts
# the vectorizer stems every token produced by CountVectorizer's analyzer,
# so each distinct stem in the vocabulary corresponds to one feature
stemmer_eng = SnowballStemmer("english", ignore_stopwords=False)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems each token with `stemmer_eng`."""

    def build_analyzer(self):
        """Return a callable mapping a document to the list of its stemmed tokens."""
        # zero-argument super() is bound to this class object itself, so the analyzer
        # keeps working even if the global name `StemmedCountVectorizer` is rebound
        # by a later cell (super(StemmedCountVectorizer, self) would then raise TypeError)
        analyzer = super().build_analyzer()
        return lambda doc: [stemmer_eng.stem(w) for w in analyzer(doc)]


# stop_words='english' is handled by the inherited analyzer; stemming is applied to its output
stemmed_count_vect_eng = StemmedCountVectorizer(stop_words='english')

In [75]:
# Naive Bayes pipeline for english posts, applied in this order:
#   vect  -> stemmed bag-of-words counts
#   tfidf -> TfidfTransformer re-weighting of those counts
#   mnb   -> multinomial Naive Bayes created with fit_prior=False
text_stemmed_eng = Pipeline(steps=[
    ('vect', stemmed_count_vect_eng),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

In [76]:
# Accuracy of the NB classifier on the held-out english posts
# (fit returns the pipeline itself, so no re-assignment is needed)
text_stemmed_eng.fit(X_train_eng, Y_train_eng)
predicted_stemmed_eng = text_stemmed_eng.predict(X_test_eng)
(predicted_stemmed_eng == Y_test_eng).mean()

0.93450013259082476

In [18]:
# Performance of NB Classifier for posts in english, reduced dataframe
# fit an unfitted copy of the pipeline: calling text_stemmed_eng.fit(...) here would
# refit the shared object in place, so text_stemmed_eng would no longer hold the model
# trained on all english posts and text_stemmed_eng_reduced would just be an alias of it
text_stemmed_eng_reduced = clone(text_stemmed_eng).fit(X_train_eng_reduced, Y_train_eng_reduced)
predicted_stemmed_eng_reduced = text_stemmed_eng_reduced.predict(X_test_eng_reduced)
np.mean(predicted_stemmed_eng_reduced == Y_test_eng_reduced)

0.91489361702127658

In [78]:
# Performance of NB Classifier for posts in english, reduced dataframe with the same
# number of True and False posts as the italian dataframe
# fit an unfitted copy of the pipeline so that the shared text_stemmed_eng object is
# not refit in place and each fitted model lives under its own name
text_stemmed_eng_reduced_weighted = clone(text_stemmed_eng).fit(
    X_train_eng_reduced_weighted, Y_train_eng_reduced_weighted)
predicted_stemmed_eng_reduced_weighted = text_stemmed_eng_reduced_weighted.predict(
    X_test_eng_reduced_weighted)
np.mean(predicted_stemmed_eng_reduced_weighted == Y_test_eng_reduced_weighted)

0.97340425531914898

In [14]:
# Bag of words for italian posts
stemmer_ita = ItalianStemmer(ignore_stopwords=False)


# the class gets its own name: re-using `StemmedCountVectorizer` would rebind the name
# of the english vectorizer class defined earlier in the notebook and break instances
# of that class that were already created
class ItalianStemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems each token with `stemmer_ita`."""

    def build_analyzer(self):
        """Return a callable mapping a document to the list of its stemmed tokens."""
        analyzer = super().build_analyzer()
        return lambda doc: [stemmer_ita.stem(w) for w in analyzer(doc)]


# no stop_words argument is passed for italian, unlike the english vectorizer
stemmed_count_vect_ita = ItalianStemmedCountVectorizer()

In [15]:
# Naive Bayes pipeline for italian posts: stemmed counts -> tf-idf -> multinomial NB
text_stemmed_ita = Pipeline(steps=[
    ('vect', stemmed_count_vect_ita),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

In [16]:
# Accuracy of the NB classifier on the held-out italian posts
# (fit returns the pipeline itself, so no re-assignment is needed)
text_stemmed_ita.fit(X_train_ita, Y_train_ita)
predicted_stemmed_ita = text_stemmed_ita.predict(X_test_ita)
(predicted_stemmed_ita == Y_test_ita).mean()

0.98936170212765961

In [None]:
# Grid Search for english posts
# Parameters to tune; each key is "<pipeline step name>__<parameter name>".
# vect__ngram_range compares unigrams only against unigrams + bigrams.
# The Naive Bayes step of the pipeline is named 'mnb', so its smoothing parameter is
# addressed as 'mnb__alpha' ('clf__alpha' refers to a step that does not exist and
# makes the search fail with an "Invalid parameter" error).
parameters_eng = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'mnb__alpha': (1e-2, 1e-3)}

# n_jobs=-1 tells the search to use all available cores of the user machine
gs_clf_eng = GridSearchCV(text_stemmed_eng, parameters_eng, n_jobs=-1)
gs_clf_eng = gs_clf_eng.fit(X_train_eng, Y_train_eng)

In [None]:
# best cross-validated score and the parameter combination that achieved it (english posts)
print(gs_clf_eng.best_score_, gs_clf_eng.best_params_, sep='\n')

In [None]:
# Grid Search for italian posts
# Each key is "<pipeline step name>__<parameter name>"; the Naive Bayes step is named
# 'mnb', so its smoothing parameter is 'mnb__alpha' (not 'clf__alpha').
parameters_ita = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],  # from unigrams to fourgrams
                  'tfidf__use_idf': (True, False),
                  'mnb__alpha': (1e-2, 1e-3)}

# the italian pipeline is called text_stemmed_ita (text_ita_stemmed is not defined)
gs_clf_ita = GridSearchCV(text_stemmed_ita, parameters_ita, n_jobs=-1)
gs_clf_ita = gs_clf_ita.fit(X_train_ita, Y_train_ita)

In [None]:
# best cross-validated score and the parameter combination that achieved it (italian posts)
print(gs_clf_ita.best_score_, gs_clf_ita.best_params_, sep='\n')

In [None]:
# Future improvements: TODO

# implement a more general class StemmedCountVectorizer allowing for multiple languages 
# try with one bag of words and one classifier for both languages
# try with a sample of the same size (50:50), and with different size (80:20)
# try Gaussian, and Bernoulli Naive Bayes
# tune the smoothing parameters (e.g. alpha of the Naive Bayes classifiers)