In [1]:
import csv
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import time
from gensim.models import Word2Vec
from nltk.corpus import stopwords  
import nltk

import nltk
from nltk.stem.lancaster import LancasterStemmer
import sys
import unicodedata

# Read raw text 

In [2]:
train_raw_filepath =  "./2019S1-proj2-data_dos/train-raw.tsv"
dev_raw_filepath =  "./2019S1-proj2-data_dos/dev-raw.tsv"
test_raw_filepath =  "./2019S1-proj2-data_dos/test-raw.tsv"

In [3]:
# read tsv file into a 2D array
def read_tsv(filepath):
    label, text = [], []
    with open(filepath) as raw:
        reader = csv.reader(raw, delimiter="\t", quoting = csv.QUOTE_NONE)
        for row in reader:
            label.append(row[1])
            text.append(row[-1])
    return text, label

# Feature Engineering

In [4]:
def is_all_zero(instance):
    for attr in instance:
        if int(attr) != 0:
            return False
    return True

def remove_all_zero_instance(X, y):
    new_X, new_y = [], []
    for i in range(len(y)):
        instance = X[i]
        if is_all_zero(instance) is False:
            new_X.append(X[i])
            new_y.append(y[i])
    return new_X, new_y

## Text Pre-filtering  

In [5]:
def remove_URL(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"http\S+", "", texts[i])
        
def remove_metion(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"@\S+", "", texts[i])
        texts[i] = re.sub(r"@", "", texts[i])
        
def remove_hash(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"#\S+", "", texts[i])
        texts[i] = re.sub(r"#", "", texts[i])

def remove_unicode(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"\\u[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]", "", texts[i])


In [6]:
# a table structure to hold the different punctuation used
tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                    if unicodedata.category(chr(i)).startswith('P'))
# method to remove punctuations from sentences.
def remove_punctuation(text):
    return text.translate(tbl)

## WHL calculating 

In [7]:
class Word_Counter:
    # Initializer / Instance Attributes
    def __init__(self, size):
        self.total_word_count = [0]*size
        self.Melbourne_word_count = [0]*size
        self.Sydney_word_count = [0]*size 
        self.Perth_word_count = [0]*size        
        self.Brisbane_word_count = [0]*size        
class WLH_lists:
    # Initializer / Instance Attributes
    def __init__(self):
        self.Melbourne_WLH_list = []
        self.Sydney_WLH_list = [] 
        self.Perth_WLH_list = []        
        self.Brisbane_WLH_list = []        

In [8]:
def word_frequency_counter(counts, lables):
    # create a list to store count
    num_words = counts.shape[1]
    counter = Word_Counter(num_words)
    
    for i in list(range(len(lables))):
        instance = counts[i, :]
        if (lables[i] == "Melbourne"):
            counter.Melbourne_word_count += instance.toarray().sum(axis=0)
        elif (lables[i] == "Sydney"):
            counter.Sydney_word_count  += instance.toarray().sum(axis=0)
        elif (lables[i] == "Perth"):
            counter.Perth_word_count += instance.toarray().sum(axis=0)
        elif (lables[i] == "Brisbane"):
            counter.Brisbane_word_count += instance.toarray().sum(axis=0)
    
    counter.total_word_count = counter.Melbourne_word_count + \
                                counter.Sydney_word_count +\
                                counter.Perth_word_count +\
                                counter.Brisbane_word_count
    return counter

In [9]:
def calculate_WHL_by_state(word_counter, vectoriser):
    WLH_list_occur_more_than_one_state = []
    WLH_list_one_state = []
    
    #  total number of word used for each word counter
    Melbourne_total_word_num = word_counter.Melbourne_word_count.sum()
    Brisbane_total_word_num = word_counter.Brisbane_word_count.sum()
    Sydney_total_word_num = word_counter.Sydney_word_count.sum()
    Perth_total_word_num = word_counter.Perth_word_count.sum()
    all_state_total_word_num = word_counter.total_word_count.sum()

    for i in list(range(len(word_counter.total_word_count))):    
    # for i in list(range(100)):
        curr_word_list = [] # [WHL, word, frequency,  state]
        curr_word = vectoriser.get_feature_names()[i]

        curr_word_total_prob = word_counter.total_word_count[i]/all_state_total_word_num

        WHL_Mel = (word_counter.Melbourne_word_count[i]/Melbourne_total_word_num)/curr_word_total_prob
        WHL_Syd = (word_counter.Sydney_word_count[i]/Sydney_total_word_num)/curr_word_total_prob
        WHL_Per = (word_counter.Perth_word_count[i]/Perth_total_word_num)/curr_word_total_prob
        WHL_Bri = (word_counter.Brisbane_word_count[i]/Brisbane_total_word_num)/curr_word_total_prob

        WHL_list = [WHL_Mel, WHL_Bri, WHL_Per, WHL_Syd]
        if (occur_in_n_state(WHL_list) > 0):
            max_WHL = max(WHL_list)

            state = [] # (stateName, wordStateCount, wordTotalCount)
            if (WHL_Mel == max_WHL):
                state.append(["Melbourne", word_counter.Melbourne_word_count[i], word_counter.Melbourne_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Bri == max_WHL):
                state.append(["Brisbane", word_counter.Brisbane_word_count[i], word_counter.Brisbane_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Per == max_WHL):
                state.append(["Perth", word_counter.Perth_word_count[i], word_counter.Perth_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Syd == max_WHL):
                state.append(["Sydney", word_counter.Sydney_word_count[i], word_counter.Sydney_word_count[i]/word_counter.total_word_count[i]])
            curr_word_list.append(max_WHL)
            curr_word_list.append(i)
            curr_word_list.append(curr_word)
            curr_word_list.append(state)
            WLH_list_occur_more_than_one_state.append(curr_word_list)
        else:
            max_WHL = max(WHL_list)

            state = [] # (stateName, wordStateCount, wordTotalCount)
            if (WHL_Mel == max_WHL):
                state.append(["Melbourne", word_counter.Melbourne_word_count[i], word_counter.Melbourne_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Bri == max_WHL):
                state.append(["Brisbane", word_counter.Brisbane_word_count[i], word_counter.Brisbane_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Per == max_WHL):
                state.append(["Perth", word_counter.Perth_word_count[i], word_counter.Perth_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Syd == max_WHL):
                state.append(["Sydney", word_counter.Sydney_word_count[i], word_counter.Sydney_word_count[i]/word_counter.total_word_count[i]])
            curr_word_list.append(max_WHL)
            curr_word_list.append(i)
            curr_word_list.append(curr_word)
            curr_word_list.append(state)
            WLH_list_one_state.append(curr_word_list)
    return WLH_list_occur_more_than_one_state, WLH_list_one_state

In [10]:
def groupby_state(WLHs):
    WLH_groupby_state = WLH_lists()
        # Initializer / Instance Attributes

        
    for WLH in WLHs:
        state = WLH[3][0][0]
        if state == "Melbourne":
            WLH_groupby_state.Melbourne_WLH_list.append(WLH)
        elif state == "Sydney":
            WLH_groupby_state.Sydney_WLH_list.append(WLH)
        elif state == "Perth":
            WLH_groupby_state.Perth_WLH_list.append(WLH)            
        elif state == "Brisbane":
            WLH_groupby_state.Brisbane_WLH_list.append(WLH)
    sorted(WLH_groupby_state.Melbourne_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Sydney_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Perth_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Brisbane_WLH_list, key=lambda x: x[0], reverse=True)
    return WLH_groupby_state

In [11]:
def occur_in_n_state(WHL_list):
    count = 0
    for WHL in WHL_list:
        if WHL > 0:
            count += 1
    return count

## Extracting Top k WHL for each state 

In [12]:
def extract_top_k_index(state_WLH_list, k):
    indice = []
    WLH_top_k = (sorted(state_WLH_list, key=lambda x: x[0], reverse=True))[:k]
    
    for WLH in WLH_top_k:
        indice.append(WLH[1])
    return indice

In [13]:
# select cloumn from X_train by top 10 index for each state
def extract_column_by_index(matrix, indice, vectoriser):
    selected_words = []
    for index in indice:
        selected_words.append(vectoriser.get_feature_names()[index])
    
    result = []
    length = matrix.shape[0]
    for i in list(range(length)):
        row = matrix[i, :].toarray().sum(axis = 0)
        extract = []
        for index in indice:
            extract.append(row[index])
        result.append(extract)
    return result, selected_words
# feature engineering done

## Put all together => Feature Engineering Function

In [14]:
def preprocessing_test(filepath, selected_words):
    # read the file into two parts, text and its location
    X_raw, y = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    remove_unicode(X_raw)
    X_raw = [ remove_punctuation(text) for text in X_raw]
        
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", vocabulary=selected_words)
    X_sparse = vectoriser.fit_transform(X_raw)
    
    
    # make sparse matrix into 2D list
    X = []
    for i in list(range(len(y))):
        X.append(X_sparse[i, :].toarray().sum(axis = 0))
    return X, y

In [16]:
#This function takes in 
#@param: filepath => The filepath of the corpus
def preprocessing_train(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    remove_unicode(X_raw)
    X_raw = [ remove_punctuation(text) for text in X_raw]
    
#     # initialize the stemmer
#     stemmer = LancasterStemmer()
#     X_word_stems = []
#     for i in list(range(len(y_train))):
#         # stem and lower each word and remove duplicates
#         words = [stemmer.stem(w.lower()) for w in X_raw[i]]
#         X_word_stems.extend(words)
        
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", min_df = 15 )
    X_train = vectoriser.fit_transform(X_raw)
    
    # calculating WLH
    word_counter = word_frequency_counter(X_train, y_train)

    # count frequency for each word by state
    WLH_list_occur_more_than_one_state, WLH_list_one_state = calculate_WHL_by_state(word_counter, vectoriser)

    # group WLH by state
    WLH_lists_gourpby_state = groupby_state(WLH_list_occur_more_than_one_state)
    
    # select top k WHL
    num_features = len(vectoriser.get_feature_names())
    percent = 0.14
    k = percent*num_features/4
    k = int(k)
    k = 300
    
    # extract top 10 WLH index for each state
    top_k_indice_for_each_state = []
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Brisbane_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Melbourne_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Perth_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Sydney_WLH_list, k))
    
    # extract columns
    new_train, selected_words = extract_column_by_index(X_train, top_k_indice_for_each_state, vectoriser)
    return new_train, y_train, selected_words

# Repeated Feature Selection

## Utility Functions

In [0]:
def text_preprocessing(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    remove_unicode(X_raw)
    X_raw = [ remove_punctuation(text) for text in X_raw]
    
    return X_raw, y_train

In [0]:
def text_to_train(X_text, y_train):
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", min_df = 10 )
    X_train = vectoriser.fit_transform(X_text)
    
    # calculating WLH
    word_counter = word_frequency_counter(X_train, y_train)

    # count frequency for each word by state
    WLH_list_occur_more_than_one_state, WLH_list_one_state = calculate_WHL_by_state(word_counter, vectoriser)

    # group WLH by state
    WLH_lists_gourpby_state = groupby_state(WLH_list_occur_more_than_one_state)
    
    
    
    # select top k WHL
    num_features = len(vectoriser.get_feature_names())
    percent = 0.14
    k = percent*num_features/4
    k = int(k)
    k = 200
    
    # extract top 10 WLH index for each state
    top_k_indice_for_each_state = []
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Brisbane_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Melbourne_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Perth_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Sydney_WLH_list, k))
    
    # extract columns
    new_train, selected_words = extract_column_by_index(X_train, top_k_indice_for_each_state, vectoriser)
    
    return new_train, y_train, selected_words

In [0]:
def text_to_test(X_text, y_test, selected_words):
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", vocabulary=selected_words)
    X_test = vectoriser.fit_transform(X_text)
    
    new_test = []
    
    # Add index to X_test => [index, bag-of-word-model]
    for i in list(range(len(y_test))):
        new_test.append([i, X_test[i, :].toarray().sum(axis=0)])
    return new_test, y_test

In [0]:
def print_0_percet(X, y):
    num_instance = len(X)
    X_no_zero, y_no_zero = remove_all_zero_instance(X, y)

    num_all_zero_instance = num_instance - len(X_no_zero)
    print("The size of total dataset is %d"%num_instance)
    print("The size of all zero instance in this dataset is %d"%num_all_zero_instance)
    print("The ratio of all zero instances is %f "%(num_all_zero_instance/num_instance))

In [0]:
def extract_all_zero_train_text(X, X_text, y):
    X_text_all_zero = []
    y_all_zero = []
    X_not_all_zero = []
    y_not_all_zero = []
    
    for i in list(range(len(y))):
        if is_all_zero(X[i]):
            X_text_all_zero.append(X_text[i])
            y_all_zero.append(y[i])
        else:
            X_text_not_all_zero.append(X[i])
            y_not_all_zero.append(y[i])
    return X_text_all_zero, y_all_zero, X_not_all_zero, y_not_all_zero

In [0]:
def extract_all_zero_test_text(X, X_text, y):
    X_text_all_zero = []
    y_all_zero = []
    X_not_all_zero = []
    y_not_all_zero = []
    
    for i in list(range(len(y))):
        # append index to instance, thus index 1 is trainning instance
        if is_all_zero(X[i][1]):
            X_text_all_zero.append(X_text[i])
            y_all_zero.append(y[i])
        else:
            X_not_all_zero.append(X[i])
            y_not_all_zero.append(y[i])
    return X_text_all_zero, y_all_zero, X_not_all_zero, y_not_all_zero

## Training

In [0]:
X_train_text, y_train = text_preprocessing(dev_raw_filepath)

In [0]:
X_train, y_train, selected_words = text_to_train(X_train_text, y_train)

In [36]:
print(X_train[100])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [0]:
# split the dataset, extract all zero text
X_train_text, y_train, X_train_not_all_zero, y_train_not_all_zero = extract_all_zero_train_text(X_train, X_train_text, y_train)

In [48]:
# train classifier on X_train_1
classifier =  LogisticRegression()
classifier.fit(X_train_not_all_zero, y_train_not_all_zero)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# save the classifier 1, selected words 1
system = [classifier, selected_words]
systems.append(system)

In [50]:
all_zero_instance_percent = len(y_train)/total_num_of_instance
print(all_zero_instance_percent)

0.7582814132580008


In [0]:
# text filtering
X_train_text, y_train = text_preprocessing(train_raw_filepath)
total_num_of_instance = len(y_train)
all_zero_instance_percent = 1
systems = []
#[(classfier_1, selected_words_1), (classfier_2, selected_words_2), .. ,(classfier_n, selected_words_n)]


In [0]:
# text to bag of words model based on WLH, call it X_train_1
X_train, y_train, selected_words = text_to_train(X_train_text, y_train)

In [125]:
len(X_train[0])

268

In [186]:
# text filtering
X_train_text, y_train = text_preprocessing(train_raw_filepath)
total_num_of_instance = len(y_train)
all_zero_instance_percent = 1
systems = []
#[(classfier_1, selected_words_1), (classfier_2, selected_words_2), .. ,(classfier_n, selected_words_n)]

while all_zero_instance_percent > 0.05:
    # text to bag of words model based on WLH, call it X_train_1
    X_train, y_train, selected_words = text_to_train(X_train_text, y_train)
    
    # split the dataset, extract all zero text
    X_train_text, y_train, X_train_not_all_zero, y_train_not_all_zero = extract_all_zero_train_text(X_train, X_train_text, y_train)
    
    
    # train classifier on X_train_1
    classifier =  LogisticRegression()
    classifier.fit(X_train_not_all_zero, y_train_not_all_zero)
    
    # save the classifier 1, selected words 1
    system = [classifier, selected_words]
    systems.append(system)
    
    
    all_zero_instance_percent = len(y_train)/total_num_of_instance
    print("###########################Current all 0 instance percentage: " + str(all_zero_instance_percent) + "##########################")

########################################################################
    
# text to bag of words model based on WLH, call it X_train_2

# extract all zero text

# train classifier on X_train_2

# save the classifier 2, selected words 2

# text to bag of words model based on WLH, call it X_train_3

# extract all zero text

# train classifier on X_train_3

# save the classifier 3, selected words 3



ValueError: ignored

In [142]:
(system[0].coef_).shape

(4, 268)

In [146]:
(systems[7][0].coef_).shape

(4, 268)

In [144]:
len(systems[7][1])

268

## Prediction on dev

In [0]:
def predict(filepath):
    # text filtering
    X_dev_text, y_dev = text_preprocessing(dev_raw_filepath)
    
    # prediction
    for system in systems:
        # text to bag of words model based on WLH, call it X_train_2

        # extract all zero text

        # train classifier on X_train_2
      

In [0]:
def predict_the_label(X_test, classifier, predict_labels):
    actual_labels = []
    indice = []
    for i in list(range(len(X_test))):
        label = classifier.predict([X_test[i][1]])
        index = X_test[i][0]
        predict_labels[index] = label
        indice.append(index)
    return predict_labels, indice

In [0]:
def is_all_label_stored_corrected(predict_labels, indice):
    none_zero_index = []
    for i in list(range(len(predict_labels))):
        if (predict_labels[i] != 0):
            none_zero_index.append(i)
    
    for i in none_zero_index:
        if i not in indice:
            return False
        
    for i in indice:
        if i not in none_zero_index:
            return False
    return True

In [0]:
def subsystem_accuracy(predict_labels, actual_labels, indice):
    n = len(indice)
    num_correct = 0
    for index in indice:
        if predict_labels[index] == actual_labels[index]:
            num_correct += 1
    return num_correct/n

In [0]:
# text filtering
X_dev_text, y_dev = text_preprocessing(dev_raw_filepath)
# prepare the list for prediction results
predict_labels = [0]*len(y_dev)

In [0]:
total_num_instance = len(X_dev_text)

In [0]:
# choose the first system
system = systems[0]
selected_word_0 = system[1]

In [148]:
len(selected_word_0)

400

In [0]:
# text to bag of words model based on WLH
X_dev, y_dev = text_to_test(X_dev_text, y_dev, selected_word_0)

37316

In [0]:
# extract all zero text
X_text_all_zero, y_all_zero, X_not_all_zero, y_not_all_zero =extract_all_zero_test_text(X_dev, X_dev_text, y_dev)

In [105]:
# print out number of all zero instance
print("Percent of all zero instance:"  +str(len(y_all_zero)/total_num_instance))
print("Number of all zero instance:"  +str(len(y_all_zero)))

Percent of all zero instance:0.875093793547004
Number of all zero instance:32655


In [151]:
classifier = system[0]
(classifier.coef_).shape

(4, 400)

In [0]:
#make prediction
predict_labels, indice = predict_the_label(X_not_all_zero, classifier, predict_labels)

In [178]:
# check whether store the label correctly
is_all_label_stored_corrected(predict_labels, indice)

  after removing the cwd from sys.path.


True

In [179]:
subsystem_accuracy(predict_labels, y_dev, indice)

0.6288350139455052

In [0]:
# choose the first system
system = systems[1]
selected_word_0 = system[1]

In [0]:
# text to bag of words model based on WLH
X_dev, y_dev = text_to_test(X_dev_text, y_dev, selected_word_0)

In [0]:
# print out accuracy of current system
X_dev_text, y_dev_fixed = text_preprocessing(dev_raw_filepath)

In [184]:
# text filtering
X_dev_text, y_dev = text_preprocessing(dev_raw_filepath)
# prepare the list for prediction results
predict_labels = [0]*len(y_dev)

for system in systems:
    classifier = system[0]
    selected_word = system[1]
    
    # text to bag of words model based on WLH
    X_dev, y_dev = text_to_test(X_dev_text, y_dev, selected_word)

    # extract all zero text
    X_dev_text, y_dev, X_not_all_zero, y_not_all_zero =extract_all_zero_test_text(X_dev, X_dev_text, y_dev)
    
    print("============================================================================")
    # print out number of all zero instance
    print("Percent of all zero instance:"  +str(len(y_dev)/total_num_instance))
    print("Number of all zero instance:"  +str(len(y_dev)))

    #make prediction
    predict_labels, indice = predict_the_label(X_not_all_zero, classifier, predict_labels)
    
    # check whether store the label correctly
    print("Is lable stored correctly:" + str(is_all_label_stored_corrected(predict_labels, indice)))
    
    # print subsystem accuracy
    print("The accuracy is: " + str(subsystem_accuracy(predict_labels, y_dev_fixed, indice)))

Percent of all zero instance:0.875093793547004
Number of all zero instance:32655


  after removing the cwd from sys.path.


Is lable stored correctly:True
The accuracy is: 0.6288350139455052
Percent of all zero instance:0.795503269375067
Number of all zero instance:29685


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.2622895622895623
Percent of all zero instance:0.703799978561475
Number of all zero instance:26263


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.24342489772063122
Percent of all zero instance:0.6002787008253833
Number of all zero instance:22400


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.24851151954439554
Percent of all zero instance:0.4928448922714117
Number of all zero instance:18391


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.2596657520578698
Percent of all zero instance:0.37935470039661273
Number of all zero instance:14156


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.2538370720188902
Percent of all zero instance:0.254394897631043
Number of all zero instance:9493


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.24812352562727857
Percent of all zero instance:0.14420087897952621
Number of all zero instance:5381


  after removing the cwd from sys.path.


Is lable stored correctly:False
The accuracy is: 0.24319066147859922


In [0]:
# print overall accuracy

## Evaluation

# Main 

In [0]:
X_train_text, y_train = text_preprocessing(train_raw_filepath)

In [17]:
X_train, y_train, selected_words = preprocessing_train(train_raw_filepath)

MemoryError: 

In [0]:
X_train, y_train = 1, 1

In [57]:
len(y_train)

103364

In [0]:
 X_text_all_zero, y_all_zero = extract_all_zero_train_text(X_train, X_train_text, y_train)

In [0]:
X_text_all_zero[100]

' Where my Geminis at'

In [35]:
print_0_percet(X_train, y_train)

The size of total dataset is 103364
The size of all zero instance in this dataset is 58839
The ratio of all zero instances is 0.569241 


In [0]:
X_dev, y_dev = preprocessing_test(dev_raw_filepath, selected_words)
print_0_percet(X_dev, y_dev)

In [50]:
X_test, y_test = preprocessing_test(test_raw_filepath, selected_words)
print_0_percet(X_test, y_test)

The size of total dataset is 108148
The size of all zero instance in this dataset is 60440
The ratio of all zero instances is 0.558864 


# Model Training & Evaluation 

## DNN 

In [0]:
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [0]:
import tflearn

ModuleNotFoundError: No module named 'tflearn'

## Basic Classfiers 

In [0]:
models = [DummyClassifier(strategy='most_frequent'),
          GaussianNB(),
          MultinomialNB(),
          LogisticRegression()]
titles = ['Zero-R',
          'GNB',
          'MNB',
          'Logistic Regression']

In [38]:
# read csv file
# i = 1
# train_X, train_y = load_dataset(train_filepath[i])
X_dev_no_0, y_dev_no_0 = remove_all_zero_instance(X_dev, y_dev)
# dev_X, dev_y = load_dataset(dev_filepath[i])


# try each model without feature selection
for title, model in zip(titles, models):
    start = time.time()
    model.fit(X_train, y_train)
    acc = np.mean(cross_val_score(model, X_dev, y_dev, cv=10))
#     acc = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    end = time.time()
    t = end - start
    print(title, acc, 'time:', t)

Zero-R 0.25 time: 0.3765854835510254
GNB 0.30922499528495 time: 41.52842974662781
MNB 0.3284113386601898 time: 25.651182174682617




Logistic Regression 0.3322177640082985 time: 29.649566650390625


## Stacking 

In [0]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
import time
import numpy as np

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        #print(yhats.shape)
        assert len(yhats) == len(X)
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


classifiers = [LogisticRegression(),
#                 KNeighborsClassifier(),
                GaussianNB(),
                MultinomialNB(),
#                 DecisionTreeClassifier()
              ]
              
meta_classifier = DecisionTreeClassifier()
stacker = StackingClassifier(classifiers, meta_classifier)

def load_car_data(car_file):
    X = []
    y = []
    with open(car_file, mode='r') as fin:
        for line in fin:
            atts = line.strip().split(",")
            X.append(atts[:-1]) #all atts minus the last one
            y.append(atts[-1])
    onehot = OneHotEncoder()
    X = onehot.fit_transform(X).toarray()
    return X, y
# X, y = load_car_data('car.data')
# print('labels:', set(y))
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# stacker.fit(X_train, y_train)
# print('stacker acc:', stacker.score(X_test, y_test))

In [0]:
stacker.fit(X_train, y_train)



In [0]:
print('stacker acc on train:', stacker.score(X_train, y_train))

stacker acc on train: 0.3938895553577648


In [0]:
cv = 5
print('stacker cross-val acc  on train:', np.mean(cross_val_score(stacker.metaclassifier, X_train, y_train, cv=cv)))

stacker cross-val acc  on train: 0.37537234507695616


In [0]:
print('stacker acc on dev:', stacker.score(X_dev,y_dev))

stacker acc on dev: 0.31570907921534996


In [0]:
model = KNeighborsClassifier(n_neighbors=3, algorithm='auto', metric = "cosine")
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [0]:
np.mean(cross_val_score(model, X_dev, y_dev, cv=10))

0.2890342420861846

In [46]:
model =  LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
model_MNB = MultinomialNB()
model_MNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
predict_labels = model.predict(X_test)

In [0]:
predict_labels = model_MNB.predict(X_test)

In [0]:
def test_prediction_to_submit_file(test_lables, filepath):
    with open(filepath, 'w',  newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows([["Id", "Class"]])
        for i in list(range(len(test_lables))):
            index = '3' + str(i+1)
            writer.writerows([[index, test_lables[i]]])
        writeFile.close()

In [0]:
test_prediction_to_submit_file(predict_labels, "predicted_labels.csv")