In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import glob
import os
import random


# Code the TA's used to generate the train/test split, it is not necessary for you to run this unless you want to have
# a different test split then 0.25. You can download the data here from the following link:
# https://www.cs.drexel.edu/~mb553/drexel-amt-corpus.tar.gz
def create_train_test_split(test_split=0.25):
    dirs = glob.glob('Drexel-AMT-Corpus/*')
    dirs = [os.path.basename(x) for x in dirs if os.path.basename(x)]

    os.makedirs('data/')
    os.makedirs('data/test/')
    os.makedirs('data/train/')
    for dir in dirs:
        os.makedirs('data/test/%s/' % dir)
        os.makedirs('data/train/%s/' % dir)

        files = glob.glob('Drexel-AMT-Corpus/%s/*_[0-9].*' % dir)
        random.shuffle(files)
        N_test = int(test_split * float(len(files)))

        train_files = files[:-N_test]
        for train_file in train_files:
            os.rename(train_file, 'data/train/%s/%s' % (dir, os.path.basename(train_file)))

        test_files = files[-N_test:]
        for test_file in test_files:
            os.rename(test_file, 'data/test/%s/%s' % (dir, os.path.basename(test_file)))


if __name__ == '__main__':
    create_train_test_split()


In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, sent_tokenize
import nltk
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import state_union
nltk.download('averaged_perceptron_tagger')
from statistics import mode
from functools import reduce 
nltk.download('wordnet')
from nltk.probability import FreqDist
import re
from collections import Counter


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [7]:
# Download the 'stopwords' and 'punkt' from the Natural Language Toolkit, you can comment the next lines if already present.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# Load the dataset into memory from the filesystem
def load_data(dir_name):
    return sklearn.datasets.load_files('/content/drive/MyDrive/Data/%s' % dir_name, encoding='utf-8')

def load_train_data():
    return load_data('train')


def load_test_data():
    return load_data('test')

In [9]:
# Function to extract ngrams
def extract_ngram_freqs(token_list, n):
    grams = list(ngrams(token_list, n))
    n_grams = []
    for word_tuple in grams:
        for word in word_tuple:
            if word not in stop_words:               
                n_grams.append(word)                       
    return n_grams

In [81]:
# Extract features from a given text
def extract_features(text):
    bag_of_words = [x for x in wordpunct_tokenize(text)]

    features = []
    # Example feature 1: count the number of words
    features.append(len(bag_of_words))

    # Example feature 2: count the number of words, excluded the stopwords
    features.append(len([x for x in bag_of_words if x.lower() not in stop_words]))

    # TODO: Follow the instructions in the assignment and add your own features.
    # Feature 3: count the number of unigrams
    unigrams = extract_ngram_freqs(bag_of_words, n=1)
    features.append(len(unigrams))

    # Feature 4: count the number of bigrmas
    bigrams = extract_ngram_freqs(bag_of_words, n=2)
    features.append(len(bigrams))

    # Feature 5: count the number of trigrams
    trigrams = extract_ngram_freqs(bag_of_words, n=3)
    features.append(len(trigrams))

    # # Feature 6: count the number of 5-grams
    # fgrams = extract_ngram_freqs(bag_of_words, n=5)
    # features.append(len(fgrams))

    # Feature 7: Average word length
    letterCount = 0 
    wordCount = 0
    for word in text.split():
        letterCount = letterCount + len(word)
        wordCount = wordCount + 1
    Avg_word_length = letterCount/wordCount
    features.append(Avg_word_length)

    # Feature 8: words count
    wordCount = 0
    for word in text.split():
        wordCount = wordCount + 1
    
    features.append(wordCount)
    
    # Feature 9: count nouns
    sentences = nltk.sent_tokenize(text)
    nouns = []
    for sentence in sentences:
     for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
         if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
             nouns.append(word)

    features.append(len(nouns))

    #Feature 10: Get special character count in text
    def get_special_char_count(text):
      count = 0
      special_characters = [';','+=','_','?','=','&','[',']','-',':']
      for each_letter in text:
        if each_letter in special_characters:
            count = count + 1
      return count

    features.append(get_special_char_count(text))

    # Feature 11: Get digit count in text
    def get_digit_count(text):
      return sum(c.isdigit() for c in text)
    
    features.append(get_digit_count(text))

    # Feature 12: Get length of text
    def get_text_length(text):
      return len(text)

    features.append(get_text_length(text))

    # Feature 13: average number of characters
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in text if nonPunct.match(w)]
    counts = Counter(filtered)
    res = float(sum(map(len, filtered))) / len(filtered)
    features.append(res)  
 
    # Feature 14: Average number of words in a sentence
    sentences = nltk.tokenize.sent_tokenize(text)
    counts=[]
    for sentence in sentences:
      counts.append(len(sentence))

    def Average(list_counts): 
      avg = reduce(lambda x, y: x + y, list_counts) / len(list_counts)
      return avg

    features.append(Average(counts))

    # Feature 15: Average sentence length
    sentences = nltk.tokenize.sent_tokenize(text)
    avg_sen_len = sum(len(x.split()) for x in sentences) / len(sentences)
    features.append(avg_sen_len)  

    # Feature 16: total number of characters
    tot = []
    text = text.lower()
    myFD = nltk.FreqDist(text)
    for x in ":,.-[];!'\"\t\n/ ?":
      del myFD[x]
    total = float(sum(myFD.values()))
    features.append(total)

    # # Feature 17: character frequency
    # tot = []
    # text = text.lower()
    # myFD = nltk.FreqDist(text)
    # for x in ":,.-[];!'\"\t\n/ ?":
    #   del myFD[x]
    # for x in myFD:
    #   tot = myFD[x]
    # features.append(tot)

    return features


In [11]:
# Classify using the features
def classify(train_features, train_labels, test_features):
    # TODO: (Optional) If you would like to test different how classifiers would perform different, you can alter
    # TODO: the classifier here.
    clf = SVC(kernel='linear')
    clf.fit(train_features, train_labels)
    return clf.predict(test_features)

In [79]:
# Evaluate predictions (y_pred) given the ground truth (y_true)
def evaluate(y_true, y_pred):
    # TODO: What is being evaluated here and what does it say about the performance? Include or change the evaluation
    # TODO: if necessary.
    recall = sklearn.metrics.recall_score(y_true, y_pred, average='macro')
    print("Recall: %f" % recall)

    precision = sklearn.metrics.precision_score(y_true, y_pred, average='macro')
    print("Precision: %f" % precision)

    f1_score = sklearn.metrics.f1_score(y_true, y_pred, average='macro')
    print("F1-score: %f" % f1_score)

    return recall, precision, f1_score



In [82]:
# The main program
def main():
    train_data = load_train_data()

    # Extract the features
    features = list(map(extract_features, train_data.data))
    print(features)
    # Classify and evaluate
    skf = sklearn.model_selection.StratifiedKFold(n_splits=10)
    scores = []
    for fold_id, (train_indexes, validation_indexes) in enumerate(skf.split(train_data.filenames, train_data.target)):
        # Print the fold number
        print("Fold %d" % (fold_id + 1))

        # Collect the data for this train/validation split
        train_features = [features[x] for x in train_indexes]
        train_labels = [train_data.target[x] for x in train_indexes]
        validation_features = [features[x] for x in validation_indexes]
        validation_labels = [train_data.target[x] for x in validation_indexes]

        # Classify and add the scores to be able to average later
        y_pred = classify(train_features, train_labels, validation_features)
        scores.append(evaluate(validation_labels, y_pred))

        # Print a newline
        print("")

    # Print the averaged score
    recall = sum([x[0] for x in scores]) / len(scores)
    print("Averaged total recall", recall)
    precision = sum([x[1] for x in scores]) / len(scores)
    print("Averaged total precision", precision)
    f_score = sum([x[2] for x in scores]) / len(scores)
    print("Averaged total f-score", f_score)
    print("")

    # TODO: Once you are done crafting your features and tuning your model, also test on the test set and report your
    # TODO: findings. How does the score differ from the validation score? And why do you think this is?
    test_data = load_test_data()
    test_features = list(map(extract_features, test_data.data))
    
    y_pred = classify(features, train_data.target, test_features)
    evaluate(test_data.target, y_pred)

# This piece of code is common practice in Python, is something like if "this file" is the main file to be ran, then
# execute this remaining piece of code. The advantage of this is that your main loop will not be executed when you
# import certain functions in this file in another file, which is useful in larger projects.
if __name__ == '__main__':
    main()

[[614, 344, 361, 720, 1077, 4.702594810379241, 501, 145, 6, 8, 2865, 1.0, 135.14285714285714, 23.857142857142858, 2293.0], [574, 335, 351, 700, 1047, 4.845338983050848, 472, 149, 8, 8, 2789, 1.0, 130.85714285714286, 22.476190476190474, 2233.0], [568, 372, 382, 762, 1141, 5.349494949494949, 495, 168, 4, 71, 3173, 1.0, 164.47368421052633, 26.05263157894737, 2601.0], [547, 327, 340, 678, 1014, 5.127530364372469, 494, 181, 4, 13, 3051, 1.0, 150.4, 24.7, 2495.0], [586, 345, 366, 730, 1092, 4.705050505050505, 495, 142, 10, 5, 2825, 1.0, 103.5925925925926, 18.333333333333332, 2253.0], [556, 314, 329, 656, 981, 4.559748427672956, 477, 150, 6, 0, 2665, 1.0, 101.03846153846153, 18.346153846153847, 2133.0], [600, 323, 342, 682, 1021, 4.806706114398422, 507, 118, 5, 18, 2948, 1.0, 162.55555555555554, 28.166666666666668, 2352.0], [652, 384, 399, 796, 1191, 5.161943319838056, 494, 233, 3, 2, 3049, 1.0, 151.25, 24.7, 2489.0], [565, 304, 327, 652, 976, 4.641237113402062, 485, 122, 1, 0, 2738, 1.0, 108

  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.377778
Precision: 0.270000
F1-score: 0.294180

Fold 3


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.455556
Precision: 0.368519
F1-score: 0.380000

Fold 4


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.400000
Precision: 0.325926
F1-score: 0.336296

Fold 5


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.466667
Precision: 0.405556
F1-score: 0.417037

Fold 6


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.366667
Precision: 0.289630
F1-score: 0.309312

Fold 7


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.477778
Precision: 0.407407
F1-score: 0.423704

Fold 8


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.411111
Precision: 0.335185
F1-score: 0.346667

Fold 9


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.377778
Precision: 0.307407
F1-score: 0.321481

Fold 10


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.455556
Precision: 0.362963
F1-score: 0.378519

Averaged total recall 0.4066666666666666
Averaged total precision 0.32799999999999996
Averaged total f-score 0.3432380952380952



  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.420370
Precision: 0.452196
F1-score: 0.406537


  _warn_prf(average, modifier, msg_start, len(result))
