In [84]:
# import libraries
import pandas as pd
import numpy as np
# to transform words in vectors and to build and organize models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# to process words
# to download nltk packages de-comment next line
# nltk.download() 
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.snowball import ItalianStemmer

In [310]:
# import data
src = pd.read_csv('../../Data/tribe_dynamics_data.csv')
src.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'brand_id', 'worker_id', 'mturker',
       'post_hash', 'answer', 'date', 'duration_seconds', 'text',
       'model_decision', 'timestamped_model', 'lang'],
      dtype='object')

In [311]:
# import data
src2 = pd.read_json('../../Data/CSE_20180215/14680_data.json')
src2.columns

Index(['labels', 'lang', 'link', 'model_decision', 'mturker', 'text'], dtype='object')

In [312]:
def labels_list_unpacking(list_of_label_lists):
    # Count number of True and False and convert with majority
    new_labels_list = []

    for label_list in list_of_label_lists:
        labels_counter = Counter(label_list)
        if labels_counter[0] >= labels_counter[1]:  # Prefer false negatives to false positives
            new_labels_list.append(False)
        else:
            new_labels_list.append(True)

    return np.array(new_labels_list)


def replace_label_column_in_df(df):
    # Converts list of labels into True or False
    # new_labels_arr = labels_list_unpacking(df.labels.values) # ERROR: 'DataFrame' object has no attribute 'labels' (Why???)
    new_labels_arr = labels_list_unpacking(df.iloc[:, 0].values)

    df['answer'] = new_labels_arr

    # Return df without 'labels' column, replaced by 'answer' one
    return df[['lang', 'link', 'model_decision', 'mturker', 'text', 'answer']]

In [313]:
src2 = replace_label_column_in_df(src2)
src2.columns

Index(['lang', 'link', 'model_decision', 'mturker', 'text', 'answer'], dtype='object')

In [314]:
src=src2[['answer','text','lang']]
multi_languages = src[(src.lang == 'en') | (src.lang == 'it')]
mono_language_ita = src[src['lang'] == 'it']
mono_language_eng = src[src['lang'] == 'en']

In [315]:
from collections import Counter
def imbalance_ratio(labels_arr):
    counter_obj = Counter(labels_arr)
    num_true = counter_obj[True]
    num_false = counter_obj[False]
    if (num_true == 0) and (num_false == 0):  # Avoid division by zero
        return 0
    else:
        return 1 - (min(num_true, num_false) / max(num_true, num_false))

In [316]:
print('texts in all languages have an imbalance ratio is',imbalance_ratio(multi_languages['answer']))
print('texts in english have an imbalance ratio is',imbalance_ratio(mono_language_eng['answer']))
print('texts in italian have an imbalance ratio is',imbalance_ratio(mono_language_ita['answer']))

texts in all languages have an imbalance ratio is 0.10113302559798576
texts in english have an imbalance ratio is 0.4490582191780822
texts in italian have an imbalance ratio is 0.9450292397660819


In [317]:
# convert False to 0s and True to 1s
src['answer'] = (src['answer'] == True).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [318]:
# select posts language (i.e. english and italian)
src=src[['answer','text','lang']]
df_all = src[(src.lang == 'en') | (src.lang == 'it')]
df_ita = src[src['lang'] == 'it']
df_eng = src[src['lang'] == 'en']
print(df_all.shape, df_ita.shape, df_eng.shape)

(4525, 3) (902, 3) (3623, 3)


In [319]:
# randomely extract a data sample of same name observation as other dataframe
df_eng_reduced = df_eng.sample(n=df_ita.shape[0])
df_eng_reduced.shape

(902, 3)

In [320]:
#divide df in english among true and false
df_eng_true = df_eng[df_eng['answer'] == True]
df_eng_false = df_eng[df_eng['answer'] == False]
#randomely select true and false as in the dataframe of posts in italian
df_eng_reduced_true = df_eng_true.sample(n=df_ita[df_ita['answer'] == True].shape[0])
df_eng_reduced_false = df_eng_false.sample(n=df_ita[df_ita['answer'] == False].shape[0])
#combine true and false of the reduced dataframe
df_eng_reduced_weighted = pd.concat([df_eng_reduced_true, df_eng_reduced_false])
df_eng_reduced_weighted.shape

(902, 3)

In [321]:
# convert text and classifier in arrays
X_all = np.asarray(df_all['text'])
Y_all = np.asarray(df_all['answer'], dtype="|S6")
X_eng = np.asarray(df_eng['text'])
Y_eng = np.asarray(df_eng['answer'], dtype="|S6")
X_eng_reduced = np.asarray(df_eng_reduced['text'])
Y_eng_reduced = np.asarray(df_eng_reduced['answer'], dtype="|S6")
X_eng_reduced_weighted = np.asarray(df_eng_reduced_weighted['text'])
Y_eng_reduced_weighted = np.asarray(df_eng_reduced_weighted['answer'], dtype="|S6")
X_ita = np.asarray(df_ita['text'])
Y_ita = np.asarray(df_ita['answer'], dtype="|S6")

In [322]:
# split in train and test
X_train_all, X_test_all, Y_train_all, Y_test_all = train_test_split(X_all, Y_all, test_size=0.4, random_state=42)
X_train_eng, X_test_eng, Y_train_eng, Y_test_eng = train_test_split(X_eng, Y_eng, test_size=0.4, random_state=42)
X_train_eng_reduced, X_test_eng_reduced, Y_train_eng_reduced, Y_test_eng_reduced = train_test_split(X_eng_reduced, Y_eng_reduced, test_size=0.4, random_state=42)
X_train_eng_reduced_weighted, X_test_eng_reduced_weighted, Y_train_eng_reduced_weighted, Y_test_eng_reduced_weighted = train_test_split(X_eng_reduced_weighted, Y_eng_reduced_weighted, test_size=0.4, random_state=42)
X_train_ita, X_test_ita, Y_train_ita, Y_test_ita = train_test_split(X_ita, Y_ita, test_size=0.4, random_state=42)

In [323]:
# Posts written in English
# number of datapoints labeled False and True
unique, counts = np.unique(Y_eng, return_counts=True)
print (np.asarray((unique, counts)).T)

[[b'0' b'1287']
 [b'1' b'2336']]


In [324]:
# Posts written in English : train set
unique, counts = np.unique(Y_train_eng, return_counts=True)
print (np.asarray((unique, counts)).T)

[[b'0' b'788']
 [b'1' b'1385']]


In [325]:
# Posts written in English from reduced dataframe: train set
unique, counts = np.unique(Y_eng_reduced, return_counts=True)
print (np.asarray((unique, counts)).T)

[[b'0' b'290']
 [b'1' b'612']]


In [326]:
# Posts written in English from reduced dataframe with same amount True and False as the other dataframe: train set
unique, counts = np.unique(Y_train_eng_reduced_weighted, return_counts=True)
print (np.asarray((unique, counts)).T)

[[b'0' b'512']
 [b'1' b'29']]


In [327]:
# Posts written in Italian: train set
# number of datapoints labeled False and True
unique, counts = np.unique(Y_train_ita, return_counts=True)
print (np.asarray((unique, counts)).T)

[[b'0' b'511']
 [b'1' b'30']]


In [328]:
# Bag of words for english posts
# this class create steems words and ignore stopwords
# using CountVectorizer methods each unique word in the dictionary will correspond to a feature
stemmer_eng = SnowballStemmer("english", ignore_stopwords=False)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer_eng.stem(w) for w in analyzer(doc)])
stemmed_count_vect_eng = StemmedCountVectorizer(stop_words='english')

In [329]:
# Run ML algorithm
# build pipeline for Naive Bayes classifier
# control for Term Frequencies (i.e. #count(word) / #Total words)
# control for Term Frequency times inverse document frequency (i.e. weightage of more common words like the, is, an etc.) 
text_stemmed_eng = Pipeline([('vect', stemmed_count_vect_eng),
                      ('tfidf', TfidfTransformer()),
                      ('mnb', MultinomialNB(fit_prior=False))])

In [330]:
# Performance of NB Classifier for posts in english
text_stemmed_eng = text_stemmed_eng.fit(X_train_eng, Y_train_eng)
predicted_stemmed_eng = text_stemmed_eng.predict(X_test_eng)
np.mean(predicted_stemmed_eng == Y_test_eng)

0.84896551724137936

In [331]:
# store the predicted probabilities for class 1
y_pred_prob_stemmed_eng = text_stemmed_eng.predict_proba(X_test_eng)[:, 1]
#print first ten probabilities
y_pred_prob_stemmed_eng[0:10]

array([ 0.9369737 ,  0.90159827,  0.61559327,  0.64240181,  0.47063031,
        0.89216549,  0.98486342,  0.53921185,  0.24284458,  0.17446629])

In [332]:
#Y_test_eng
Y_test_eng2 = np.array(Y_test_eng, dtype='int')

from sklearn import metrics
#from sklearn.metrics import roc_auc_score
#from sklearn.metrics import average_precision_score

#print(metrics.precision_score(y_test, y_pred_class))

#accuracy
print(metrics.accuracy_score(Y_test_eng, predicted_stemmed_eng))
#ROC score
print(metrics.roc_auc_score(Y_test_eng2, y_pred_prob_stemmed_eng))
#average precision
print(metrics.average_precision_score(Y_test_eng2, y_pred_prob_stemmed_eng))

0.848965517241
0.945903373519
0.97054187543


In [336]:
#extract words from the vectorizer and probabilities for each word from the NB model trained
list_words=text_stemmed_eng.named_steps['vect'].get_feature_names()
list_prob=text_stemmed_eng.named_steps['mnb'].feature_log_prob_[1]
#zip words and probabilities into a tuple
words_prob=list(zip(list_words, list_prob))
#take words with highest probability for the dove classifier of english posts
words_high_probability = [t[0] for t in words_prob if t[1] > - 6.40]
words_high_probability

['beauti', 'bodi', 'care', 'dove', 'hair', 'product']

In [337]:
#print frequent words' probabilities
print('prob 1st',[t[1] for t in words_prob if t[0] == words_high_probability[0]])
print('prob 2',[t[1] for t in words_prob if t[0] == words_high_probability[1]])
print('prob 3',[t[1] for t in words_prob if t[0] == words_high_probability[2]])
print('prob 4',[t[1] for t in words_prob if t[0] == words_high_probability[3]])
print('prob 5',[t[1] for t in words_prob if t[0] == words_high_probability[4]])
print('prob 6th',[t[1] for t in words_prob if t[0] == words_high_probability[5]])

prob 1st [-6.1802252269748479]
prob 2 [-6.2884714819375054]
prob 3 [-6.0461948848231852]
prob 4 [-5.3005016698314584]
prob 5 [-6.3497405390604413]
prob 6th [-6.304891317261955]


In [None]:
# Grid Search for english posts
# List of parameters for which we would like to do performance tuning. 
# vect__ngram_range is telling to use unigram and bigrams and choose the optimal
parameters_eng = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3)}

#pipeline, parameters, n_jobs=-1, verbose=1)

# parameters and n_jobs=-1 which tells to use multiple cores from user machine
gs_clf_eng = GridSearchCV(text_stemmed_eng, parameters_eng, n_jobs = 2,verbose=10) #, n_jobs=-1
gs_clf_eng = gs_clf_eng.fit(X_train_eng, Y_train_eng)

In [215]:
# italian posts

In [338]:
# Bag of words for italian posts
stemmer_ita = ItalianStemmer(ignore_stopwords=False)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer_ita.stem(w) for w in analyzer(doc)])
stemmed_count_vect_ita = StemmedCountVectorizer()

In [339]:
# build pipeline for Naive Bayes classifier
text_stemmed_ita = Pipeline([('vect', stemmed_count_vect_ita),
                      ('tfidf', TfidfTransformer()),
                      ('mnb', MultinomialNB(fit_prior=False))])

In [340]:
# Performance of NB Classifier for posts in italian
text_stemmed_ita = text_stemmed_ita.fit(X_train_ita, Y_train_ita)
predicted_stemmed_ita = text_stemmed_ita.predict(X_test_ita)
np.mean(predicted_stemmed_ita == Y_test_ita)

0.96121883656509699

In [341]:
# store the predicted probabilities for class 1
y_pred_prob_stemmed_ita = text_stemmed_ita.predict_proba(X_test_ita)[:, 1]
#print first ten probabilities
y_pred_prob_stemmed_ita[0:10]

array([ 0.1101371 ,  0.13361574,  0.22257561,  0.18704327,  0.05405698,
        0.07035018,  0.18847392,  0.02618945,  0.07749913,  0.11917031])

In [342]:
#Y_test_eng
Y_test_ita2 = np.array(Y_test_ita, dtype='int')
#accuracy
print(metrics.accuracy_score(Y_test_ita, predicted_stemmed_ita))
#ROC score
print(metrics.roc_auc_score(Y_test_ita2, y_pred_prob_stemmed_ita))
#average precision
print(metrics.average_precision_score(Y_test_ita2, y_pred_prob_stemmed_ita))

0.961218836565
0.763508891929
0.352013560566


In [343]:
#extract words from the vectorizer and probabilities for each word from the NB model trained
list_words_ita=text_stemmed_ita.named_steps['vect'].get_feature_names()
list_prob_ita=text_stemmed_ita.named_steps['mnb'].feature_log_prob_[1]
#zip words and probabilities into a tuple
words_prob_ita=list(zip(list_words_ita, list_prob_ita))
#take words with highest probability for the dove classifier of english posts
words_high_probability_ita = [t[0] for t in words_prob_ita if t[1] > - 7.819]
words_high_probability_ita

['bagnoschium', 'donn', 'dov', 'il', 'per', 'scopr']

In [344]:
#print frequent words' probabilities
print('prob 1st',[t[1] for t in words_prob_ita if t[0] == words_high_probability_ita[0]])
print('prob 2',[t[1] for t in words_prob_ita if t[0] == words_high_probability_ita[1]])
print('prob 3',[t[1] for t in words_prob_ita if t[0] == words_high_probability_ita[2]])
print('prob 4',[t[1] for t in words_prob_ita if t[0] == words_high_probability_ita[3]])
print('prob 5',[t[1] for t in words_prob_ita if t[0] == words_high_probability_ita[4]])
print('prob 6th',[t[1] for t in words_prob_ita if t[0] == words_high_probability_ita[5]])

prob 1st [-7.8184415258763496]
prob 2 [-7.7176600530044608]
prob 3 [-7.5270358451820183]
prob 4 [-7.6986028982367083]
prob 5 [-7.7917494575301278]
prob 6th [-7.7136145057812691]


In [None]:
#English reduced and weighted

In [345]:
# Bag of words for english posts
# this class create steems words and ignore stopwords
# using CountVectorizer methods each unique word in the dictionary will correspond to a feature
stemmer_eng = SnowballStemmer("english", ignore_stopwords=False)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer_eng.stem(w) for w in analyzer(doc)])
stemmed_count_vect_eng = StemmedCountVectorizer(stop_words='english')

In [346]:
# Run ML algorithm
# build pipeline for Naive Bayes classifier
# control for Term Frequencies (i.e. #count(word) / #Total words)
# control for Term Frequency times inverse document frequency (i.e. weightage of more common words like the, is, an etc.) 
text_stemmed_eng = Pipeline([('vect', stemmed_count_vect_eng),
                      ('tfidf', TfidfTransformer()),
                      ('mnb', MultinomialNB(fit_prior=False))])

In [347]:
# Performance of NB Classifier for posts in english dataframe reduced
text_stemmed_eng_reduced_weighted = text_stemmed_eng.fit(X_train_eng_reduced_weighted, Y_train_eng_reduced_weighted)
predicted_stemmed_eng_reduced_weighted = text_stemmed_eng.predict(X_test_eng_reduced_weighted)
np.mean(predicted_stemmed_eng_reduced_weighted == Y_test_eng_reduced_weighted)

0.94459833795013848

In [348]:
# store the predicted probabilities for class 1
y_pred_prob_stemmed_eng_red_weight = text_stemmed_eng_reduced_weighted.predict_proba(X_test_ita)[:, 1]
#print first ten probabilities
y_pred_prob_stemmed_eng_red_weight[0:10]

array([ 0.3884763 ,  0.36269865,  0.1193639 ,  0.50981856,  0.37682377,
        0.41098333,  0.36544023,  0.43208678,  0.45923406,  0.43031114])

In [349]:
#Y_test_eng
Y_test_eng2_red_weight = np.array(Y_test_eng_reduced_weighted, dtype='int')

from sklearn import metrics
#from sklearn.metrics import roc_auc_score
#from sklearn.metrics import average_precision_score

#print(metrics.precision_score(y_test, y_pred_class))

#accuracy
print(metrics.accuracy_score(Y_test_eng_reduced_weighted, predicted_stemmed_eng_reduced_weighted))
#ROC score
print(metrics.roc_auc_score(Y_test_eng2_red_weight, y_pred_prob_stemmed_eng_red_weight))
#average precision
print(metrics.average_precision_score(Y_test_eng2_red_weight, y_pred_prob_stemmed_eng_red_weight))

0.94459833795
0.435941043084
0.0429233381304


In [357]:
#extract words from the vectorizer and probabilities for each word from the NB model trained
list_words_eng_red_weight=text_stemmed_eng_reduced_weighted.named_steps['vect'].get_feature_names()
list_prob_eng_red_weight=text_stemmed_eng_reduced_weighted.named_steps['mnb'].feature_log_prob_[1]
#zip words and probabilities into a tuple
words_prob_eng_red_weight=list(zip(list_words_eng_red_weight, list_prob_eng_red_weight))
#take words with highest probability for the dove classifier of english posts
words_high_probability_eng_red_weight = [t[0] for t in words_prob_eng_red_weight if t[1] > - 7.78]
words_high_probability_eng_red_weight

['bodi', 'care', 'dove', 'dri', 'hair', 'product']

In [358]:
#print frequent words' probabilities
print('prob 1st',[t[1] for t in words_prob_eng_red_weight if t[0] == words_high_probability_eng_red_weight[0]])
print('prob 2',[t[1] for t in words_prob_eng_red_weight if t[0] == words_high_probability_eng_red_weight[1]])
print('prob 3',[t[1] for t in words_prob_eng_red_weight if t[0] == words_high_probability_eng_red_weight[2]])
print('prob 4',[t[1] for t in words_prob_eng_red_weight if t[0] == words_high_probability_eng_red_weight[3]])
print('prob 5',[t[1] for t in words_prob_eng_red_weight if t[0] == words_high_probability_eng_red_weight[4]])
print('prob 6th',[t[1] for t in words_prob_eng_red_weight if t[0] == words_high_probability_eng_red_weight[5]])

prob 1st [-7.7262878214566131]
prob 2 [-7.5834467806380816]
prob 3 [-7.5276208095301822]
prob 4 [-7.7740204544286629]
prob 5 [-7.4419484410837473]
prob 6th [-7.7401631537230111]


In [212]:
# Grid Search for italian posts
parameters_ita = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],  #from unigrams to fourgrams
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3)}

gs_clf_ita = GridSearchCV(text_ita_stemmed, parameters_ita, n_jobs=-1)
gs_clf_ita = gs_clf_ita.fit(X_train_ita, Y_train_ita)

NameError: name 'text_ita_stemmed' is not defined

In [None]:
# Performance of NB Classifier for posts in english dataframe reduced
text_stemmed_eng_reduced = text_stemmed_eng.fit(X_train_eng_reduced, Y_train_eng_reduced)
predicted_stemmed_eng_reduced = text_stemmed_eng.predict(X_test_eng_reduced)
np.mean(predicted_stemmed_eng_reduced == Y_test_eng_reduced)

In [None]:
# measure performance for english posts
print(gs_clf_eng.best_score_)
print(gs_clf_eng.best_params_)

In [None]:
# measure performance for italian posts
print(gs_clf_ita.best_score_)
print(gs_clf_ita.best_params_)

In [None]:
# Future improvements: TODO

# implement a more general class StemmedCountVectorizer allowing for multiple languages 
# try with one bag of words and one classifier for both languages
# try with a sample of the same size (50:50), and with different size (80:20)
# try Gaussian, and Bernoulli Naive Bayes
# Smooth parameters