# NLP Assignment 1 (40% of grade): Text classification for Fake News Detection - SOLUTION Q6

In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier, NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np

In [2]:
# some useful libraries
from nltk.stem import WordNetLemmatizer  # lemmatization
import nltk # for accessing the stopwords etc.
import re # regex
import string # other string operations
from textblob import TextBlob # for spelling correction

In [3]:
# different pre-processing techniques which get called altogether by pre_process

# method to deal with number words being normalized to digits 

def text2int(textnum, numwords=None):
    """Rewrite spelled-out numbers in ``textnum`` as digits.

    Cardinals ("twenty one"), ordinals ("twenty-first", "seventh") and scale
    words ("hundred", "thousand", ...) are folded into one integer per run of
    number words. Every other word is copied through unchanged.

    Args:
        textnum: Input text; hyphens are treated as spaces.
        numwords: Optional mapping of number word -> (scale, increment). An
            omitted or empty mapping is filled with the English defaults
            (an empty mapping passed by the caller is filled in place).

    Returns:
        The rewritten text. Each copied word, and each number that is followed
        by another word, is terminated by a single space; a number that ends
        the text has no trailing space.
    """
    if numwords is None:
        numwords = {}
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen", "nineteen",
        ]
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # the two placeholder slots must not register "" as a number word
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # "hundred" -> 10**2, then 10**3, 10**6, ...
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    pieces = []
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            continue

        # Strip an ordinal suffix only when what remains is a real number word;
        # otherwise ordinary words such as "with" or "health" would be mangled.
        number_word = word
        for ending, replacement in ordinal_endings:
            if word.endswith(ending):
                stem = word[:-len(ending)] + replacement
                if stem in numwords:
                    number_word = stem
                    break

        # "and" only belongs to a number when it continues one ("one hundred and five").
        is_number_word = number_word in numwords and (number_word != "and" or onnumber)

        if not is_number_word:
            if onnumber:
                pieces.append(str(result + current) + " ")
            pieces.append(word + " ")
            result = current = 0
            onnumber = False
        else:
            scale, increment = numwords[number_word]
            current = current * scale + increment
            if scale > 100:
                # thousand and above close the current group
                result += current
                current = 0
            onnumber = True

    if onnumber:
        pieces.append(str(result + current))

    return "".join(pieces)

def tokenize_text(text):
    """Split ``text`` into whitespace-delimited tokens.

    Two optional normalisations are controlled by the global
    ``preprocessing_switches`` dict:

    * ``separate_out_punctuation``: insert a space between a word character
      and an adjacent punctuation mark so the mark becomes its own token.
    * ``convert_numbers``: replace every run of digits with ``NUMBER``.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings.
    """
    if preprocessing_switches["separate_out_punctuation"]:
        text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)  # punctuation after a word
        text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text)  # punctuation before a word
    if preprocessing_switches["convert_numbers"]:
        # raw string: '\d' in a plain literal is an invalid escape sequence
        text = re.sub(r'\d+', 'NUMBER', text)
    # print("tokenising:", text) # uncomment for debugging
    return text.split()

def remove_characters_after_tokenization(tokens):
    """Strip punctuation from every token and drop tokens that end up empty.

    All characters in ``string.punctuation`` plus the horizontal ellipsis are
    removed, except ``@`` and ``#`` which are kept because they carry meaning
    in social-media text.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    # Select the characters first and escape afterwards, so the result does not
    # depend on whether re.escape puts a backslash in front of '@' or '#'.
    strippable = "".join(ch for ch in string.punctuation if ch not in "@#") + "…"
    pattern = re.compile("[{}]".format(re.escape(strippable)))
    cleaned = (pattern.sub("", token) for token in tokens)
    return [token for token in cleaned if token]

def convert_to_lowercase(tokens):
    """Return the purely alphabetic tokens, lower-cased.

    Note that tokens containing any non-letter character (digits, '@', '#',
    ...) are discarded rather than lower-cased.
    """
    lowered = []
    for token in tokens:
        if not token.isalpha():
            continue
        lowered.append(token.lower())
    return lowered

def remove_stopwords(tokens):
    """Return ``tokens`` without NLTK's English stopwords.

    The comparison is case-sensitive, exactly as the tokens are given.
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stopword_set]

def apply_lemmatization(tokens, wnl=WordNetLemmatizer()):
    """Lemmatize each token with ``wnl`` and return the results as a list.

    The default lemmatizer is created once, when the function is defined,
    and reused on every call.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(wnl.lemmatize(token))
    return lemmas

def pre_process(text):
    """Turn raw ``text`` into a list of tokens.

    Each step runs only when its flag in the global ``preprocessing_switches``
    dict is true. String-level steps (username masking, number words to
    digits) run first, then tokenization, then the token-level steps in the
    fixed order: punctuation removal, lower-casing, stopword removal,
    lemmatization.
    """
    switches = preprocessing_switches

    # string-level normalisation
    if switches["convert_usernames"]:
        text = re.sub("@[a-zA-Z0-9:.]+", "@username", text)
    if switches["convert_number_words_to_digits"]:
        text = text2int(text)

    tokens = tokenize_text(text)

    # token-level steps, applied in this order
    token_stages = (
        ("remove_punctuation", remove_characters_after_tokenization),
        ("convert_to_lowercase", convert_to_lowercase),
        ("remove_stopwords", remove_stopwords),
        ("apply_lemmatization", apply_lemmatization),
    )
    for switch_name, stage in token_stages:
        if switches[switch_name]:
            tokens = stage(tokens)
    return tokens

In [5]:
# Change cross-val function to allow first fold only option
from sklearn.metrics import classification_report

def cross_validate(dataset, folds, first_fold_only=False):
    """Run k-fold cross-validation and return the mean weighted scores.

    The dataset is cut into consecutive folds; each fold in turn is held out
    for testing while a classifier is trained on the remainder.

    Args:
        dataset: List of ``(feature_dict, label)`` pairs.
        folds: Number of folds to aim for.
        first_fold_only: If true, evaluate only the first fold (a quick
            single train/held-out split).

    Returns:
        ``[precision, recall, f_score]``, each the mean over the evaluated
        folds of the weighted-average score.
    """
    results = []
    # Ceiling division, so a dataset that divides evenly yields exactly
    # ``folds`` folds (the former ``int(len/folds) + 1`` made them one too big).
    fold_size = max(1, -(-len(dataset) // folds))

    for i in range(0, len(dataset), fold_size):
        print("Fold start on items %d - %d" % (i, i+fold_size))

        fold_test_data = dataset[i:i+fold_size]   # held-out split for this fold
        fold_train_data = dataset[:i] + dataset[i+fold_size:]  # everything else
        classifier = train_classifier(fold_train_data)
        y_true = [x[1] for x in fold_test_data]  # ground-truth labels
        y_pred = predict_labels([x[0] for x in fold_test_data], classifier)
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        # print(classification_report(y_true,y_pred))  # see classification report for fold

        if first_fold_only:
            break  # quicker version only using one fold

    # each result tuple starts with (precision, recall, f-score)
    return [np.mean([fold_scores[k] for fold_scores in results]) for k in range(3)]

In [6]:
# Global feature inventory: maps each feature name to the number of feature
# vectors it has occurred in. It is updated as a side effect of every
# to_feature_vector call and is never reset automatically.
global_feature_dict = {} # A global dictionary of features

# Solution from Q5
from collections import Counter

def to_feature_vector(tokens):
    """Build an n-gram feature vector for ``tokens``.

    N-grams of every order from 1 to the global ``_N_`` are counted. Each
    sequence is padded with ``order - 1`` leading ``<s>`` markers and one
    trailing ``</s>`` marker, and each feature is named ``"<order>@<ngram>"``.
    The global ``_WEIGHT_`` selects the values: ``"binary"`` (1 per feature),
    ``"weighted"`` (count divided by ``len(tokens) + 1``) or anything else
    (raw counts, returned as a ``Counter``).

    Side effect: every feature in the returned vector has its entry in
    ``global_feature_dict`` incremented by one.
    """
    ngram_counts = Counter()
    for order in range(1, _N_ + 1):
        padded = ["<s>"] * (order - 1) + tokens + ["</s>"]
        # ``end`` is the exclusive end index of each window of length ``order``
        for end in range(order, len(padded) + 1):
            window = padded[end - order:end]
            ngram_counts["{}@{}".format(order, " ".join(window))] += 1

    if _WEIGHT_ == "binary":
        feature_vector = {name: 1 for name in ngram_counts}
    elif _WEIGHT_ == "weighted":
        denominator = len(tokens) + 1
        feature_vector = {name: count / denominator for name, count in ngram_counts.items()}
    else:
        # 'counts': the raw counter is already the answer
        feature_vector = ngram_counts

    for name in feature_vector:
        global_feature_dict[name] = global_feature_dict.get(name, 0) + 1

    return feature_vector

In [7]:
# Best configuration found by the joint optimisation in Question 5:
# raw n-gram counts, n-grams up to length 4, and the pre-processing steps below.
_WEIGHT_ = 'counts'
_N_ = 4
preprocessing_switches = {
    'convert_usernames': False,
    'separate_out_punctuation': False,
    'convert_number_words_to_digits': False,
    'convert_numbers': False,
    'remove_punctuation': True,
    'convert_to_lowercase': False,
    'remove_stopwords': True,
    'apply_lemmatization': True,
}

In [8]:
def train_classifier(data):
    """Train a default LinearSVC on ``data`` via NLTK's scikit-learn wrapper.

    ``data`` is a list of ``(feature_dict, label)`` pairs; the value returned
    by ``SklearnClassifier.train`` is passed back to the caller.
    """
    print("Training Classifier...")
    steps = [('svc', LinearSVC())]
    wrapper = SklearnClassifier(Pipeline(steps))
    return wrapper.train(data)

# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Classify already-featurised ``samples`` and return the predicted labels."""
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier):
    """Pre-process and featurise the raw text ``sample``, then return the
    label ``classifier`` predicts for it."""
    # previously referenced the undefined names preProcess / reviewSample
    return classifier.classify(to_feature_vector(pre_process(sample)))

In [9]:
# Iterate over all combinations of pre-processing technique
from itertools import chain, combinations  # for powerset, to get all combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    pool = list(iterable)
    # one lazy combinations iterator per subset size, smallest first
    by_size = [combinations(pool, size) for size in range(len(pool) + 1)]
    return chain(*by_size)

# Q6: Changing functions to extract other features from file

In [10]:
def load_data(path):
    """Read the tab-separated dataset at ``path`` into the global ``raw_data``.

    The header row (first field ``"Id"``) and blank rows are skipped. Every
    other row is converted by ``parse_data_line`` and appended to
    ``raw_data``.
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation recommends; the encoding is stated explicitly so reading
    # does not depend on the platform default.
    with open(path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if not line or line[0] == "Id":  # skip blank rows and the header
                continue
            raw_data.append(parse_data_line(line))
                           

def split_and_preprocess_data(percentage):
    """Featurise ``raw_data`` and distribute it over ``train_data`` and ``test_data``.

    The first ``int(percentage * len(raw_data))`` instances go to the global
    ``train_data`` list and the rest to ``test_data``. Only the statement text
    is pre-processed; the extra features are handed to ``to_feature_vector``
    as a dictionary.
    """
    cut = int(percentage * len(raw_data))
    partitions = (
        (train_data, raw_data[:cut]),
        (test_data, raw_data[cut:]),
    )
    for target, instances in partitions:
        for text, label, extra_features in instances:
            features = to_feature_vector(pre_process(text), extra_features)
            target.append((features, label))

In [11]:
def convert_label(label):
    """Collapse the six truthfulness classes into a binary label.

    'true', 'mostly-true' and 'half-true' become 'REAL'; 'false',
    'barely-true' and 'pants-fire' become 'FAKE'. Any other label raises
    ``KeyError``.
    """
    real_labels = ('true', 'mostly-true', 'half-true')
    fake_labels = ('false', 'barely-true', 'pants-fire')
    binary_label = dict.fromkeys(real_labels, 'REAL')
    binary_label.update(dict.fromkeys(fake_labels, 'FAKE'))
    return binary_label[label]


def parse_data_line(data_line):
    """Convert one dataset row into ``(text, label, extra_features)``.

    Column 2 is the statement text and column 1 the original label, which is
    reduced to FAKE/REAL by ``convert_label``. The columns from index 3 on
    are paired, in order, with the metadata field names below; surplus
    columns or names are ignored.
    """
    field_names = (
        "subject", "speaker", "speaker_job_title", "state_info",
        "party_affiliation", "total_barely_true_counts", "total_false_counts",
        "total_half_true_counts", "total_mostly_true_counts",
        "total_pants_on_fire_counts", "context",
    )
    statement = data_line[2]
    binary_label = convert_label(data_line[1])
    metadata = dict(zip(field_names, data_line[3:]))
    return (statement, binary_label, metadata)

In [12]:
# PREDICTING LABELS GIVEN A CLASSIFIER
def train_classifier(data):
    """Train a default LinearSVC on ``data`` via NLTK's scikit-learn wrapper.

    ``data`` is a list of ``(feature_dict, label)`` pairs; the value returned
    by ``SklearnClassifier.train`` is passed back to the caller.
    """
    print("Training Classifier...")
    steps = [('svc', LinearSVC())]
    wrapper = SklearnClassifier(Pipeline(steps))
    return wrapper.train(data)

def predict_labels(samples, classifier):
    """Classify already-featurised ``samples`` and return the predicted labels."""
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier, extra_features=None):
    """Pre-process and featurise raw text, then return its predicted label.

    Args:
        sample: The raw statement text.
        classifier: A trained classifier exposing ``classify``.
        extra_features: Optional dict of metadata features, shaped like the
            third element returned by ``parse_data_line``. Defaults to no
            extra features.

    Returns:
        The label predicted by ``classifier``.
    """
    # previously referenced the undefined names preProcess / reviewSample and
    # omitted the extra_features argument that to_feature_vector now requires
    if extra_features is None:
        extra_features = {}
    features = to_feature_vector(pre_process(sample), extra_features)
    return classifier.classify(features)

In [14]:
# Names of the metadata columns that follow the statement text in each row.
header = [
    "subject",
    "speaker",
    "speaker_job_title",
    "state_info",
    "party_affiliation",
    "total_barely_true_counts",
    "total_false_counts",
    "total_half_true_counts",
    "total_mostly_true_counts",
    "total_pants_on_fire_counts",
    "context",
]

# One on/off flag per metadata column; everything starts switched off.
extra_feature_switches = dict.fromkeys(header, False)

In [15]:
def _apply_weighting(counts, num_tokens):
    """Turn raw ``counts`` into feature values according to the global ``_WEIGHT_``."""
    if _WEIGHT_ == "binary":
        return {name: 1 for name in counts}  # set-of-words
    if _WEIGHT_ == "weighted":
        return {name: count / (num_tokens + 1) for name, count in counts.items()}
    return dict(counts)  # 'counts': raw frequencies


def to_feature_vector(tokens, extra_features):
    """Build the feature dictionary for one instance.

    The vector combines three groups of features:

    * n-grams of the statement for every order from 1 to the global ``_N_``,
      named ``"<order>@<ngram>"`` and weighted according to ``_WEIGHT_``;
    * every extra feature other than ``"context"`` that is enabled in the
      global ``extra_feature_switches``, stored under its own name with its
      raw value;
    * if ``"context"`` is enabled and present, unigram features of the
      pre-processed context text, named ``"context_<word>"`` and weighted
      the same way as the statement n-grams.

    Args:
        tokens: List of pre-processed statement tokens.
        extra_features: Dict of metadata field name -> value for the instance.

    Returns:
        A dict mapping feature names to values.

    Side effect: ``global_feature_dict`` is incremented once per feature of
    the returned vector; the non-context extra features are recorded as
    ``"<name>_<value>"``.
    """
    ngram_counts = Counter()
    for n in range(1, _N_ + 1):
        padded = ["<s>"] * (n - 1) + tokens + ["</s>"]
        for i in range(n - 1, len(padded)):
            raw_ngram = " ".join(padded[i - (n - 1):i + 1])
            ngram_counts["{}@{}".format(n, raw_ngram)] += 1
    feature_vector_dict = _apply_weighting(ngram_counts, len(tokens))

    # keep only the extra features that are switched on
    selected = {k: v for k, v in extra_features.items() if extra_feature_switches[k]}

    # everything except the context is added as-is
    feature_vector_dict.update({k: v for k, v in selected.items() if k != "context"})

    # The context is free text, so it becomes weighted bag-of-words features.
    # (The weighting used to be assigned to a misspelled variable and was
    # silently dropped for the 'binary' and 'weighted' settings.)
    context = selected.get("context")
    if context:
        context_tokens = pre_process(context)
        context_weights = _apply_weighting(Counter(context_tokens), len(context_tokens))
        feature_vector_dict.update(
            {"context_" + word: weight for word, weight in context_weights.items()}
        )

    value_keyed = (
        "subject", "speaker", "speaker_job_title", "state_info",
        "party_affiliation", "total_barely_true_counts", "total_false_counts",
        "total_half_true_counts", "total_mostly_true_counts",
        "total_pants_on_fire_counts",
    )
    for name, value in feature_vector_dict.items():
        # these features are tracked per distinct value
        key = name + "_" + str(value) if name in value_keyed else name
        global_feature_dict[key] = global_feature_dict.get(key, 0) + 1

    return feature_vector_dict


In [17]:
# Try adding all features first
extra_feature_switches = {k: True for k in header}
print(preprocessing_switches)
print(extra_feature_switches)
print("weights", _WEIGHT_)
print("n", _N_)
# loading the news statements
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset
# start from an empty feature inventory so that the feature count printed
# below describes this run only, even when the cell is executed again
global_feature_dict.clear()


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path)

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# let's look at the representation of the first instance of training:
print(train_data[0])

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')

# 10-fold cross-validation on the training portion; as the last expression of
# the cell, the averaged [precision, recall, f-score] is shown as its output
cross_validate(train_data, 10)


{'convert_usernames': False, 'separate_out_punctuation': False, 'convert_number_words_to_digits': False, 'convert_numbers': False, 'remove_punctuation': True, 'convert_to_lowercase': False, 'remove_stopwords': True, 'apply_lemmatization': True}
{'subject': True, 'speaker': True, 'speaker_job_title': True, 'state_info': True, 'party_affiliation': True, 'total_barely_true_counts': True, 'total_false_counts': True, 'total_half_true_counts': True, 'total_mostly_true_counts': True, 'total_pants_on_fire_counts': True, 'context': True}
weights counts
n 4
Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
({'1@Says': 1, '1@Annies': 1, '1@List': 1, '1@political': 1, '1@group': 1, '1@support': 1, '1@thirdtrimester': 1, '1@abortion': 1, '1@demand': 1, '1@</s>': 1, '2@<s> Says': 1, '2@Says Annies': 1, '2@Annies List': 1, '2@List political': 1, '2@political group': 1, '2@group support': 1, '2@support thirdtr

[0.7125313153231291, 0.7134050222275622, 0.7122348923890041]

# Q6. Feature ablation of extra features
* There is a significant improvement when using all the features, at **0.712** (up from 0.605 without access to these features in Q5). Can we get a better score by removing some of the extra features rather than using all of them, since some may be unhelpful or even harmful to generalization? Try all combinations, using only the first fold because of the number of combinations (2 to the power of the number of extra features).
* Always include context

In [36]:
if False: # takes some time as 2 ** num_extra_features - only set to True to do search
    # Exhaustive ablation: evaluate every subset of the non-context extra
    # features (context is always on) on the first cross-validation fold and
    # remember the subset with the highest f-score.
    combos = [list(p) for p in powerset(header[:-1])]  # always use context, just combination of others
    print("trying", len(combos), "combinations")
    best_f_score = 0  # initial best f-score to beat

    results = []  # one row per combination: (switch, value) pairs followed by p, r, f-score
    print("using preprocessing switches", preprocessing_switches)
    print("weight", _WEIGHT_)
    print("n", _N_)


    raw_data = []          # the filtered data from the dataset file
    # references to the data files
    data_file_path = 'fake_news.tsv'

    # Do the actual stuff (i.e. call the functions we've made)
    # We parse the dataset and put it in a raw data list
    print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
          "Preparing the dataset...",sep='\n')

    load_data(data_file_path) 

    # We split the raw dataset into a set of training data and a set of test data (80/20)
    # You do the cross validation on the 80% (training data)
    # We print the number of training samples and the number of features before the split
    print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
          "Preparing training and test data...",sep='\n')

    # try all different extra feature switches - 2**10 (1024 settings, will take some time!)
    for switches in combos:
        # enable exactly the features of this combination, plus context
        extra_feature_switches = {k : False for k in header}
        for switch in switches:
            extra_feature_switches[switch] = True
        extra_feature_switches["context"] = True  # always use context
        print(extra_feature_switches)

        # raw_data is loaded once above; only the featurised splits are
        # rebuilt for every combination
        train_data = []        # the pre-processed training data as a percentage of the total dataset
        test_data = []         # the pre-processed test data as a percentage of the total dataset


        split_and_preprocess_data(0.8)

        # let's look at the representation of the first instance of training:
        print(train_data[0])

        # We print the number of training samples and the number of features after the split
        # NOTE(review): global_feature_dict is not reset between combinations,
        # so the "Features" figure accumulates over the whole search.
        print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
              "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


        all_scores = cross_validate(train_data, 10, first_fold_only=True)  # just do first fold
        f_score = all_scores[2]
        print(f_score)
        results.append([(k,v) for k,v in extra_feature_switches.items()] + all_scores)
        print("*" * 40)
        # ">=" means that on a tie the combination tried later wins
        if f_score >= best_f_score:
            best_f_score = f_score
            best_switches = switches

    # make the extra feature switches the best combination found:
    extra_feature_switches = {k : False for k in extra_feature_switches.keys()}
    for switch in best_switches:
        extra_feature_switches[switch] = True
    extra_feature_switches['context'] = True # always use context
    print("*" * 50)
    print("best f-score", best_f_score)
    print("best combo", extra_feature_switches) 

In [20]:
# let's display all the results in a pandas dataframe
import pandas as pd
if False:  # set as False if above search not run
    # Each result row is a list of (switch_name, value) pairs followed by the
    # three scores [precision, recall, f-score].
    results = sorted(results, key=lambda x:x[-1], reverse=True) # sort results from best to worst f-score
    # one column per switch (taking its value), then the three score columns
    df = pd.DataFrame([[x[1] for x in row[:-3]] + row[-3:] for row in results],
                  columns=[x[0] for x in results[0][:-3]] + ["p", "r", "f-score"])
    display(df)

Unnamed: 0,subject,speaker,speaker_job_title,state_info,party_affiliation,total_barely_true_counts,total_false_counts,total_half_true_counts,total_mostly_true_counts,total_pants_on_fire_counts,context,p,r,f-score
0,True,False,True,False,True,True,True,True,True,True,True,0.744939,0.746341,0.745351
1,True,False,False,False,True,True,True,True,True,True,True,0.742767,0.743902,0.743173
2,True,True,True,False,False,True,True,True,True,True,True,0.740950,0.742683,0.741319
3,True,False,True,True,False,True,True,True,True,True,True,0.740950,0.742683,0.741319
4,True,True,True,True,False,True,True,True,True,True,True,0.740019,0.741463,0.740454
5,True,False,False,True,True,True,True,True,True,True,True,0.739007,0.740244,0.739437
6,True,False,True,False,False,True,True,True,True,True,True,0.738596,0.740244,0.739015
7,True,True,True,False,True,True,True,True,True,True,True,0.738596,0.740244,0.739015
8,True,False,False,False,False,True,True,True,True,True,True,0.738025,0.739024,0.738413
9,True,False,True,True,True,True,True,True,True,True,True,0.736400,0.737805,0.736852


In [21]:
# which feature being true tends to help more?
if False:  # set as False if above search not run
    # for every extra feature, print the mean f-score over all searched
    # combinations in which that feature was switched on
    for key in extra_feature_switches.keys():
        print(key, sum(df[df[key]==True]['f-score']) / len(df[df[key]==True]))

subject 0.6809504867698621
speaker 0.6818461517997764
speaker_job_title 0.6810798327315704
state_info 0.6802716717037852
party_affiliation 0.6814256778007377
total_barely_true_counts 0.6896806148714493
total_false_counts 0.6962211080706227
total_half_true_counts 0.6896977875955431
total_mostly_true_counts 0.6946992152837231
total_pants_on_fire_counts 0.6843474185747227
context 0.6807127926057801


# Comments on extra features (for report)
* Using a single-fold train/heldout set-up with the best settings found in Q5 showed that the best combination of features was **all features except 'speaker' and 'state_info'**. As these are speaker identification features, it is perhaps unsurprising that they are less useful than more generalizable features when encountering new users.
* In this setting at least, all of the counts of previous real/fake news statements were in the best setting, showing the importance of previous behaviour. speaker_job_title and party_affiliation were also there, showing there are generalizations to be made about the type of user which are helpful.
* The **total_false_counts** is the most useful feature in combination with the others, followed by the **total_mostly_true_counts**, showing the utility of knowing **the speaker/user's previous behaviour with fake news**. All the counts of previous fake/real news behaviour could have been collapsed into just two sets of counts using the multiple label -> {FAKE, REAL} mapping we're using for classification, but it is not certain what difference that would make. The speaker identification features (speaker, speaker_job_title, state_info, party_affiliation) and subject were overall less useful in combination with others than the count features.
* This result may not hold when using other features/weightings/hyper-parameter optimisations and may need to be re-done for a final global best model. Also, this was done on a single train/heldout fold so danger of overfitting.

In [29]:
# Best setting from search: everything except 'speaker' and 'state_info'
extra_feature_switches = {
    'subject': True,
    'speaker': False,
    'speaker_job_title': True,
    'state_info': False,
    'party_affiliation': True,
    'total_barely_true_counts': True,
    'total_false_counts': True,
    'total_half_true_counts': True,
    'total_mostly_true_counts': True,
    'total_pants_on_fire_counts': True,
    'context': True,
}

In [30]:
# Repeating from Q5: Final hyperparameter tuning of the linearSVC on cross-val across the training data
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
# Grid of LinearSVC hyper-parameters to search: the regularisation strength C
# and the cap on optimiser iterations.
parameters = [{
# (a kernel grid is left disabled here)
#'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 
'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
'max_iter': [1,5,10, 50, 100, 500, 1000, 5000]}]

# exhaustive search over the grid, scored by accuracy
clf = GridSearchCV(
        LinearSVC(), parameters, scoring='accuracy'
    )

# The feature dicts are vectorised with a DictVectorizer and fitted against
# the labels.
# NOTE(review): train_data is whatever the most recent
# split_and_preprocess_data call produced; it is not rebuilt after
# extra_feature_switches is changed above - confirm it was built with the
# intended feature set before trusting the tuned values.
clf.fit(DictVectorizer().fit_transform([x[0] for x in train_data]), [x[1] for x in train_data])



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
                          'max_iter': [1, 5, 10, 50, 100, 500, 1000, 5000]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [31]:
print(clf.best_params_)

{'C': 0.005, 'max_iter': 10}


In [32]:
# keep the winning grid-search values for the final classifier
_C_, _MAX_ITER_ = clf.best_params_["C"], clf.best_params_["max_iter"]

In [71]:
def train_classifier(data):
    """Train a LinearSVC with the tuned ``_C_`` and ``_MAX_ITER_`` on ``data``.

    ``data`` is a list of ``(feature_dict, label)`` pairs; the value returned
    by ``SklearnClassifier.train`` is passed back to the caller.
    """
    print("Training Classifier...")
    tuned_svc = LinearSVC(C=_C_, max_iter=_MAX_ITER_)
    wrapper = SklearnClassifier(Pipeline([('svc', tuned_svc)]))
    return wrapper.train(data)

In [72]:
# QUESTION 3 - Make sure there is a function call here to the
# cross_validate function on the training set to get your results
print(preprocessing_switches)
print(extra_feature_switches)
print("weights", _WEIGHT_)
print("n", _N_)
# loading the news statements
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset
# Forget the features recorded by earlier runs (which used other feature
# switches) so the feature count printed below describes this configuration.
global_feature_dict.clear()


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path)

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# let's look at the representation of the first instance of training:
print(train_data[0])

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')

# 10-fold cross-validation with the selected extra features and the tuned
# classifier; as the last expression of the cell, the averaged
# [precision, recall, f-score] is shown as its output
cross_validate(train_data, 10)


{'convert_usernames': False, 'separate_out_punctuation': False, 'convert_number_words_to_digits': False, 'convert_numbers': False, 'remove_punctuation': True, 'convert_to_lowercase': False, 'remove_stopwords': True, 'apply_lemmatization': True}
{'subject': True, 'speaker': False, 'speaker_job_title': True, 'state_info': False, 'party_affiliation': True, 'total_barely_true_counts': True, 'total_false_counts': True, 'total_half_true_counts': True, 'total_mostly_true_counts': True, 'total_pants_on_fire_counts': True, 'context': True}
weights counts
n 4
Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
({'1@Says': 1, '1@Annies': 1, '1@List': 1, '1@political': 1, '1@group': 1, '1@support': 1, '1@thirdtrimester': 1, '1@abortion': 1, '1@demand': 1, '1@</s>': 1, '2@<s> Says': 1, '2@Says Annies': 1, '2@Annies List': 1, '2@List political': 1, '2@political group': 1, '2@group support': 1, '2@support third



Fold start on items 820 - 1640
Training Classifier...
Fold start on items 1640 - 2460
Training Classifier...
Fold start on items 2460 - 3280
Training Classifier...
Fold start on items 3280 - 4100
Training Classifier...
Fold start on items 4100 - 4920
Training Classifier...
Fold start on items 4920 - 5740
Training Classifier...
Fold start on items 5740 - 6560
Training Classifier...
Fold start on items 6560 - 7380
Training Classifier...
Fold start on items 7380 - 8200
Training Classifier...


[0.7284071503052327, 0.7293866394328967, 0.7270919540905876]

The system has improved to a **0.727** f-score (from 0.712) through optimized feature sets and also hyper-parameter tuning.

# Evaluate on test set

In [73]:
# Finally, check the accuracy of your classifier by training on all the training data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    # per-class figures as a dict (not used again in this cell)
    report = classification_report(test_true, test_pred, output_dict=True)
    print(classification_report(test_true, test_pred))
    print("Done training!")
    # the first three entries of final_scores are precision, recall and f-score
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

({'1@The': 1, '1@Bush': 1, '1@tax': 1, '1@cut': 1, '1@helped': 1, '1@create': 1, '1@substantial': 1, '1@part': 1, '1@deficit': 1, '1@</s>': 1, '2@<s> The': 1, '2@The Bush': 1, '2@Bush tax': 1, '2@tax cut': 1, '2@cut helped': 1, '2@helped create': 1, '2@create substantial': 1, '2@substantial part': 1, '2@part deficit': 1, '2@deficit </s>': 1, '3@<s> <s> The': 1, '3@<s> The Bush': 1, '3@The Bush tax': 1, '3@Bush tax cut': 1, '3@tax cut helped': 1, '3@cut helped create': 1, '3@helped create substantial': 1, '3@create substantial part': 1, '3@substantial part deficit': 1, '3@part deficit </s>': 1, '4@<s> <s> <s> The': 1, '4@<s> <s> The Bush': 1, '4@<s> The Bush tax': 1, '4@The Bush tax cut': 1, '4@Bush tax cut helped': 1, '4@tax cut helped create': 1, '4@cut helped create substantial': 1, '4@helped create substantial part': 1, '4@create substantial part deficit': 1, '4@substantial part deficit </s>': 1, 'subject': 'bush-administration,deficit,taxes', 'speaker_job_title': 'U.S. representati

