In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [2]:
data = pd.read_csv("../raw_data/cognitive-bias.csv")

In [3]:
data.shape

(7092, 4)

In [4]:
data.head(10)

Unnamed: 0,id,text,label,dataset_id
0,072-9268,To vote in Mexico every eligible Mexican citiz...,0,72
1,072-6808,Marco Rubio made it clear Not only does he thi...,0,72
2,072-5150,Ben Carson is running for president to elimina...,0,72
3,072-11817,There are literally teachers now who are getti...,0,72
4,072-7176,Leticia Van de Putte voted to stop schools fro...,0,72
5,072-7159,Says U S Senate rival Tommy Thompson said he w...,0,72
6,072-3591,Says Democratic challenger Russ Feingold was t...,0,72
7,072-8756,Obamacare has caused millions of full time job...,0,72
8,072-5946,Our own National Security Agency says that Ir...,0,72
9,072-4785,Says none of the government programs targeted ...,0,72


In [5]:
X = data['text']
y = data['label']
print(X.shape)
print(y.shape)

(7092,)
(7092,)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape)
print(X_test.shape)

(5319,)
(1773,)


# 1. Cleaner

In [7]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cleaning(series):
    def cleaning_sentence(sentence):
        """ takes a sentence (string) as input and returns
        same string but fully cleaned """
        
        # Basic cleaning
        sentence = sentence.strip() ## remove whitespaces
        sentence = sentence.lower() ## lowercase 
        sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
        
        # Advanced cleaning
        for punctuation in string.punctuation:
            sentence = sentence.replace(punctuation, '') ## remove punctuation
        
        tokenized_sentence = word_tokenize(sentence) ## tokenize 
        
        stop_words = set(stopwords.words('english')) ## define stopwords
        
        tokenized_sentence_cleaned = [ ## remove stopwords
            w for w in tokenized_sentence if not w in stop_words
        ]
        
        # Lemmatizing
        lemmatized_verbs = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in tokenized_sentence_cleaned]
        lemmatized_nouns = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in lemmatized_verbs]
        lemmatized_adj = [WordNetLemmatizer().lemmatize(word, pos = "a") for word in lemmatized_nouns]
        lemmatized_adv = [WordNetLemmatizer().lemmatize(word, pos = "r") for word in lemmatized_adj]
        
        cleaned_sentence = ' '.join(word for word in lemmatized_adv)
        
        return cleaned_sentence
    
    return series.apply(cleaning_sentence)

In [10]:
from sklearn.preprocessing import FunctionTransformer

cleaner = FunctionTransformer(cleaning)

# 2. Gridsearching the best pipeline

***WARNING***: for an unknown reason, i am not able to do gridsearching when incleading the custom cleaner in the pipelie. So for the purpose of the gridsearching, i am cleaning the data before doing the grid search. But for the final pipeline (section 5), cleaning is included in the pipe

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [14]:
X_train_cleaned = cleaner.fit_transform(X_train)

In [25]:
# ngram ((1,1), (2,2), (3,3), (2,3)),
# min_df: 5, 10, 30, 50
# max_df: 0.75, 0.85, 0.95
# nalpha: 0.01, 0.1, 1, 10

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(penalty='l2')),
])

# Set parameters to search
parameters = {
    #'tfidf__ngram_range': ((1,1),(2,2)),
    'tfidf__min_df': (3,5,10),
    'tfidf__max_df': (0.7,0.75),
    #'logreg__penalty': ('l1', 'l2'),
    'logreg__C': (0.01,0.1,0.3),
    'logreg__solver': ('liblinear', 'newton-cg', 'lbfgs')
}

# Perform grid search on pipeline
grid_search = GridSearchCV(
    pipeline, parameters, n_jobs=-1, 
    verbose=1, scoring = "accuracy", 
    cv=4
)

grid_search.fit(X_train_cleaned,y_train)

Fitting 4 folds for each of 54 candidates, totalling 216 fits


In [26]:
best_model = grid_search.best_estimator_
best_params = best_model.get_params()
best_score = grid_search.best_score_
print(best_params)
print(best_score)

{'memory': None, 'steps': [('tfidf', TfidfVectorizer(max_df=0.7, min_df=10)), ('logreg', LogisticRegression(C=0.1, solver='liblinear'))], 'verbose': False, 'tfidf': TfidfVectorizer(max_df=0.7, min_df=10), 'logreg': LogisticRegression(C=0.1, solver='liblinear'), 'tfidf__analyzer': 'word', 'tfidf__binary': False, 'tfidf__decode_error': 'strict', 'tfidf__dtype': <class 'numpy.float64'>, 'tfidf__encoding': 'utf-8', 'tfidf__input': 'content', 'tfidf__lowercase': True, 'tfidf__max_df': 0.7, 'tfidf__max_features': None, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2', 'tfidf__preprocessor': None, 'tfidf__smooth_idf': True, 'tfidf__stop_words': None, 'tfidf__strip_accents': None, 'tfidf__sublinear_tf': False, 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__tokenizer': None, 'tfidf__use_idf': True, 'tfidf__vocabulary': None, 'logreg__C': 0.1, 'logreg__class_weight': None, 'logreg__dual': False, 'logreg__fit_intercept': True, 'logreg__intercept_scaling': 1, 'logreg__l

# 3. Full pipeline

In [27]:
from sklearn.preprocessing import FunctionTransformer

cleaner = FunctionTransformer(cleaning)

In [28]:
vectorizer = TfidfVectorizer(min_df=10, max_df=0.7, ngram_range=(1, 1))

In [29]:
model = LogisticRegression(solver='liblinear',C=0.1)

In [30]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(cleaner, vectorizer, model)

pipeline.fit(X_train, y_train)

In [31]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions)

print("Accuracy:", accuracy)
print("F1:", f1)
print("recall", recall)
print("precision", precision)

Accuracy: 0.5843203609701072
F1: 0.6039763567974206
recall 0.6504629629629629
precision 0.563691073219659
