# Question 5: Optimising pre-processing and feature extraction (30 marks)

In [171]:
import csv, nltk                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score # to report on precision and recall
import numpy as np

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abdulrahmanqadi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdulrahmanqadi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abdulrahmanqadi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [182]:
def load_data(path):
    """Load the tab-separated dataset at `path` and append its rows to raw_data.

    Every data row is handed to parse_data_line and stored in the global
    raw_data list as a (text, label) tuple. Rows whose first column is "Id"
    are treated as the header and skipped; empty rows are skipped as well.

    Args:
        path: location of the tab-separated dataset file.
    """
    # newline='' lets the csv module do its own line-ending handling,
    # as recommended by the csv documentation.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if not line:  # a blank line is read as [], so line[0] would raise IndexError
                continue
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split raw_data into a training and a test portion and featurise both.

    The first `percentage` fraction of raw_data (rounded down) is appended to
    the global train_data list and the remainder to the global test_data
    list. Each appended item is a (feature_vector, label) pair built with
    pre_process followed by to_feature_vector.
    """
    cutoff = int(percentage * len(raw_data))
    portions = ((train_data, raw_data[:cutoff]), (test_data, raw_data[cutoff:]))
    for destination, portion in portions:
        for text, label in portion:
            destination.append((to_feature_vector(pre_process(text)), label))

In [183]:
def parse_data_line(data_line):
    """Pull the sentiment label and the tweet text out of one dataset row.

    Returns:
        A (label, statement) tuple taken from the second and third columns
        of `data_line` respectively.
    """
    return (data_line[1], data_line[2])

In [184]:
# Lazily filled cache for the resources pre_process needs. Loading the
# stop-word list and building a lemmatiser once, instead of once per text,
# avoids repeating that work for every sample in the dataset.
_PRE_PROCESS_RESOURCES = {}


def _get_pre_process_resources():
    """Return (stop_words, lemmatiser), creating and caching them on first use."""
    if not _PRE_PROCESS_RESOURCES:
        _PRE_PROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PRE_PROCESS_RESOURCES['lemmatiser'] = WordNetLemmatizer()
    return _PRE_PROCESS_RESOURCES['stop_words'], _PRE_PROCESS_RESOURCES['lemmatiser']


def pre_process(text):
    """Turn raw text into a list of normalised tokens.

    Steps, in order: lowercase the text, tokenise it with word_tokenize,
    drop English stop words and punctuation tokens, then lemmatise what
    is left.

    Args:
        text: the raw string to process.

    Returns:
        A list of lemmatised token strings.
    """
    stop_words, lemmatiser = _get_pre_process_resources()
    # Converting text to lowercase (Normalisation)
    text = text.lower()
    # Tokenising the text (Tokenisation)
    tokens = word_tokenize(text)
    # Removing punctuation and stop words.
    # string.punctuation is a str, so this is a substring test: tokens that are a
    # single punctuation character are dropped, while e.g. '...' is kept.
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    # Lemmatising the words (Lemmatisation)
    return [lemmatiser.lemmatize(word) for word in tokens]

In [185]:
global_feature_dict = {}  # Global dictionary of features: feature -> number of samples containing it


def to_feature_vector(tokens, n=3):
    """Build a term-frequency feature vector from unigrams and n-grams.

    The features are the individual tokens plus every run of `n`
    consecutive tokens joined by a single space (trigrams by default).
    As a side effect, global_feature_dict is incremented once for every
    distinct feature of this sample.

    Args:
        tokens: list of token strings.
        n: n-gram order to add on top of the unigrams; defaults to 3.

    Returns:
        A dict mapping each feature string to its count within `tokens`.
    """
    # Sliding window over n consecutive tokens; produces nothing when there
    # are fewer than n tokens.
    ngram_features = [' '.join(gram) for gram in zip(*(tokens[i:] for i in range(n)))]

    # Calculating TF for each feature (unigrams first, then n-grams)
    feature_vector = {}
    for feature in tokens + ngram_features:
        feature_vector[feature] = feature_vector.get(feature, 0) + 1

    # Updating global feature dictionary: one increment per sample, not per occurrence
    for feature in feature_vector:
        global_feature_dict[feature] = global_feature_dict.get(feature, 0) + 1

    return feature_vector

In [186]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data):
    """Fit a linear SVM on `data` and return the trained classifier.

    A one-step sklearn Pipeline holding a LinearSVC is wrapped in NLTK's
    SklearnClassifier, and `data` is passed unchanged to its train method.
    """
    print("Training Classifier...")
    svm_steps = [('svc', LinearSVC())]
    wrapped_model = SklearnClassifier(Pipeline(svm_steps))
    return wrapped_model.train(data)

In [187]:
#solution
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score


def cross_validate(dataset, folds):
    """Run k-fold cross-validation and return the averaged weighted metrics.

    The dataset is cut into consecutive folds of `len(dataset) // folds + 1`
    items (the last fold may be shorter). For each fold a classifier is
    trained on all other items with train_classifier and scored on the fold.
    Because of the rounded-up fold size, fewer than `folds` folds may be
    evaluated; the metrics are averaged over the folds that were actually run.

    Args:
        dataset: list of (feature_vector, label) pairs.
        folds: requested number of folds; must be at least 2.

    Returns:
        A dict with the mean 'precision', 'recall', 'f1_score' and 'accuracy'.
        All values are 0.0 when `dataset` is empty.

    Raises:
        ValueError: if `folds` is smaller than 2, or the dataset is too small
            to leave any training data for a fold.
    """
    if folds < 2:
        raise ValueError("folds must be at least 2, got %r" % (folds,))

    fold_size = len(dataset) // folds + 1

    # Running sums of each metric over the folds that are actually evaluated
    totals = {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}
    folds_run = 0

    for i in range(0, len(dataset), fold_size):
        print("Fold start on items %d - %d" % (i, i+fold_size))

        # Splitting the dataset into training and testing sets for this fold
        train_fold = dataset[:i] + dataset[i + fold_size:]
        test_fold = dataset[i:i + fold_size]
        if not train_fold:
            raise ValueError("dataset is too small to cross-validate: a fold left no training data")

        # Separating the held-out samples from their gold labels
        test_samples = [sample for sample, _ in test_fold]
        test_labels = [label for _, label in test_fold]

        # Training on the remaining folds, predicting on the held-out fold
        classifier = train_classifier(train_fold)
        predicted_labels = predict_labels(test_samples, classifier)

        # Computing precision, recall, f1_score, and accuracy for this fold
        precision, recall, f1, _ = precision_recall_fscore_support(test_labels, predicted_labels, average='weighted')
        accuracy = accuracy_score(test_labels, predicted_labels)

        totals['precision'] += precision
        totals['recall'] += recall
        totals['f1_score'] += f1
        totals['accuracy'] += accuracy
        folds_run += 1

    if folds_run == 0:  # empty dataset: nothing was evaluated, report zeros
        return totals

    # Average over the folds that ran, which can be fewer than `folds`
    return {metric: total / folds_run for metric, total in totals.items()}

In [188]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Return the labels `classifier` predicts for already-featurised `samples`.

    Delegates to the classifier's classify_many method.
    """
    predictions = classifier.classify_many(samples)
    return predictions

def predict_label_from_raw(sample, classifier):
    """Return the label `classifier` predicts for one raw text `sample`.

    The text is run through pre_process and to_feature_vector before it is
    classified. Note that to_feature_vector also updates the global feature
    dictionary as a side effect.
    """
    # Previously referenced the undefined names preProcess / reviewSample (NameError).
    return classifier.classify(to_feature_vector(pre_process(sample)))

In [189]:
# MAIN

# loading reviews
# (re)initialise the global containers that the functions above append to, so that
# re-running this cell always starts from a clean state
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset
global_feature_dict.clear()  # otherwise re-running this cell would double-count feature frequencies


# references to the data files
data_file_path = 'sentiment-dataset.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We print the (still empty) container sizes, then parse the dataset into the raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the container sizes before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# We print the container sizes, the number of training samples and the number of
# distinct features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 33540 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 33540 rawData, 26832 trainData, 6708 testData
Training Samples: 
26832
Features: 
402838


In [190]:
# Run 10-fold cross-validation on the training split; the returned dict of
# averaged metrics is displayed as the cell output.
cross_validate(train_data, 10)

Fold start on items 0 - 2684
Training Classifier...
Fold start on items 2684 - 5368
Training Classifier...
Fold start on items 5368 - 8052
Training Classifier...
Fold start on items 8052 - 10736
Training Classifier...
Fold start on items 10736 - 13420
Training Classifier...
Fold start on items 13420 - 16104
Training Classifier...
Fold start on items 16104 - 18788
Training Classifier...
Fold start on items 18788 - 21472
Training Classifier...
Fold start on items 21472 - 24156
Training Classifier...
Fold start on items 24156 - 26840
Training Classifier...


{'precision': 0.8598017186188786,
 'recall': 0.8612860576655328,
 'f1_score': 0.8589535494490322,
 'accuracy': 0.8612860576655328}

In [191]:
# Final check: train on all of the training data and score on the held-out test set.
# Only meaningful once every function above is complete.
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # peek at the first held-out instance
    classifier = train_classifier(train_data)
    # gold labels and predictions for the held-out instances
    test_true = [label for (_, label) in test_data]
    test_pred = predict_labels([features for (features, _) in test_data], classifier)
    # weighted precision / recall / F-score (the fourth element, support, is not printed)
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted')
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

({'tomorrow': 1, "'ll": 2, 'release': 1, '58th': 1, 'episode': 1, 'hsonair': 1, 'profiling': 1, 'alissadossantos': 1, 'talk': 1, 'storytelling': 1, 'beyonce': 1, "tomorrow 'll release": 1, "'ll release 58th": 1, 'release 58th episode': 1, '58th episode hsonair': 1, 'episode hsonair profiling': 1, 'hsonair profiling alissadossantos': 1, "profiling alissadossantos 'll": 1, "alissadossantos 'll talk": 1, "'ll talk storytelling": 1, 'talk storytelling beyonce': 1}, 'positive')
Training Classifier...
Done training!
Precision: 0.857849
Recall: 0.859273
F Score:0.857207
