In [1]:
import csv
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import time
from gensim.models import Word2Vec
from nltk.corpus import stopwords  
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91260\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91260\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Read raw text 

In [2]:
train_raw_filepath =  "./2019S1-proj2-data_dos/train-raw.tsv"
dev_raw_filepath =  "./2019S1-proj2-data_dos/dev-raw.tsv"
test_raw_filepath =  "./2019S1-proj2-data_dos/test-raw.tsv"

In [3]:
# read tsv file into a 2D array
def read_tsv(filepath):
    label, text = [], []
    with open(filepath) as raw:
        reader = csv.reader(raw, delimiter="\t", quoting = csv.QUOTE_NONE)
        for row in reader:
            label.append(row[1])
            text.append(row[-1])
    return text, label

# Feature Engineering

## Utility

In [4]:
def is_all_zero(instance):
    for attr in instance:
        if float(attr) != 0.0:
            return False
    return True

def remove_all_zero_instance(X, y):
    new_X, new_y = [], []
    for i in range(len(y)):
        instance = X[i]
        if is_all_zero(instance) is False:
            new_X.append(X[i])
            new_y.append(y[i])
    return new_X, new_y

## Text Pre-filtering  

In [5]:
def remove_URL(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"http\S+", "", texts[i])
        
def remove_metion(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"@\S+", "", texts[i])
        texts[i] = re.sub(r"@", "", texts[i])
        
def remove_hash(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"#\S+", "", texts[i])

## WHL calculating 

In [6]:
class Word_Counter:
    # Initializer / Instance Attributes
    def __init__(self, size):
        self.total_word_count = [0]*size
        self.Melbourne_word_count = [0]*size
        self.Sydney_word_count = [0]*size 
        self.Perth_word_count = [0]*size        
        self.Brisbane_word_count = [0]*size        
class WLH_lists:
    # Initializer / Instance Attributes
    def __init__(self):
        self.Melbourne_WLH_list = []
        self.Sydney_WLH_list = [] 
        self.Perth_WLH_list = []        
        self.Brisbane_WLH_list = []        

In [7]:
def word_frequency_counter(counts, lables):
    # create a list to store count
    num_words = counts.shape[1]
    counter = Word_Counter(num_words)
    
    for i in list(range(len(lables))):
        instance = counts[i, :]
        if (lables[i] == "Melbourne"):
            counter.Melbourne_word_count += instance.toarray().sum(axis=0)
        elif (lables[i] == "Sydney"):
            counter.Sydney_word_count  += instance.toarray().sum(axis=0)
        elif (lables[i] == "Perth"):
            counter.Perth_word_count += instance.toarray().sum(axis=0)
        elif (lables[i] == "Brisbane"):
            counter.Brisbane_word_count += instance.toarray().sum(axis=0)
    
    counter.total_word_count = counter.Melbourne_word_count + \
                                counter.Sydney_word_count +\
                                counter.Perth_word_count +\
                                counter.Brisbane_word_count
    return counter

In [8]:
def occur_in_n_state(WHL_list):
    count = 0
    for WHL in WHL_list:
        if WHL > 0:
            count += 1
    return count

In [24]:
def calculate_WHL_by_state(word_counter, vectoriser):
    WLH_list_occur_more_than_one_state = []
    WLH_list_one_state = []
    
    #  total number of word used for each word counter
    Melbourne_total_word_num = word_counter.Melbourne_word_count.sum()
    Brisbane_total_word_num = word_counter.Brisbane_word_count.sum()
    Sydney_total_word_num = word_counter.Sydney_word_count.sum()
    Perth_total_word_num = word_counter.Perth_word_count.sum()
    all_state_total_word_num = word_counter.total_word_count.sum()

    for i in list(range(len(word_counter.total_word_count))):    
    # for i in list(range(100)):
        curr_word_list = [] # [WHL, word, frequency,  state]
        curr_word = vectoriser.get_feature_names()[i]

        curr_word_total_prob = word_counter.total_word_count[i]/all_state_total_word_num

        WHL_Mel = (word_counter.Melbourne_word_count[i]/Melbourne_total_word_num)/curr_word_total_prob
        WHL_Syd = (word_counter.Sydney_word_count[i]/Sydney_total_word_num)/curr_word_total_prob
        WHL_Per = (word_counter.Perth_word_count[i]/Perth_total_word_num)/curr_word_total_prob
        WHL_Bri = (word_counter.Brisbane_word_count[i]/Brisbane_total_word_num)/curr_word_total_prob

        WHL_list = [WHL_Mel, WHL_Bri, WHL_Per, WHL_Syd]
        if (occur_in_n_state(WHL_list) > 0):
            max_WHL = max(WHL_list)

            state = [] # (stateName, wordStateCount, wordTotalCount)
            if (WHL_Mel == max_WHL):
                state.append(["Melbourne", word_counter.Melbourne_word_count[i], word_counter.Melbourne_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Bri == max_WHL):
                state.append(["Brisbane", word_counter.Brisbane_word_count[i], word_counter.Brisbane_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Per == max_WHL):
                state.append(["Perth", word_counter.Perth_word_count[i], word_counter.Perth_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Syd == max_WHL):
                state.append(["Sydney", word_counter.Sydney_word_count[i], word_counter.Sydney_word_count[i]/word_counter.total_word_count[i]])
            curr_word_list.append(max_WHL)
            curr_word_list.append(i)
            curr_word_list.append(curr_word)
            curr_word_list.append(state)
            WLH_list_occur_more_than_one_state.append(curr_word_list)
        else:
            max_WHL = max(WHL_list)

            state = [] # (stateName, wordStateCount, wordTotalCount)
            if (WHL_Mel == max_WHL):
                state.append(["Melbourne", word_counter.Melbourne_word_count[i], word_counter.Melbourne_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Bri == max_WHL):
                state.append(["Brisbane", word_counter.Brisbane_word_count[i], word_counter.Brisbane_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Per == max_WHL):
                state.append(["Perth", word_counter.Perth_word_count[i], word_counter.Perth_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Syd == max_WHL):
                state.append(["Sydney", word_counter.Sydney_word_count[i], word_counter.Sydney_word_count[i]/word_counter.total_word_count[i]])
            curr_word_list.append(max_WHL)
            curr_word_list.append(i)
            curr_word_list.append(curr_word)
            curr_word_list.append(state)
            WLH_list_one_state.append(curr_word_list)
    return WLH_list_occur_more_than_one_state, WLH_list_one_state

In [25]:
def groupby_state(WLHs):
    WLH_groupby_state = WLH_lists()
        # Initializer / Instance Attributes

        
    for WLH in WLHs:
        state = WLH[3][0][0]
        if state == "Melbourne":
            WLH_groupby_state.Melbourne_WLH_list.append(WLH)
        elif state == "Sydney":
            WLH_groupby_state.Sydney_WLH_list.append(WLH)
        elif state == "Perth":
            WLH_groupby_state.Perth_WLH_list.append(WLH)            
        elif state == "Brisbane":
            WLH_groupby_state.Brisbane_WLH_list.append(WLH)
    sorted(WLH_groupby_state.Melbourne_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Sydney_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Perth_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Brisbane_WLH_list, key=lambda x: x[0], reverse=True)
    return WLH_groupby_state

## Extracting Top k WHL for each state 

In [26]:
def extract_top_k_index(state_WLH_list, k):
    indice = []
    WLH_top_k = (sorted(state_WLH_list, key=lambda x: x[0], reverse=True))[:k]
    
    for WLH in WLH_top_k:
        indice.append(WLH[1])
    return indice

In [27]:
# select cloumn from X_train by top 10 index for each state
def extract_column_by_index(matrix, indice, vectoriser):
    selected_words = []
    for index in indice:
        selected_words.append(vectoriser.get_feature_names()[index])
    
    result = []
    length = matrix.shape[0]
    for i in list(range(length)):
        row = matrix[i, :].toarray().sum(axis = 0)
        extract = []
        for index in indice:
            extract.append(row[index])
        result.append(extract)
    return result, selected_words
# feature engineering done

## Word2Vec 

In [28]:
# This function take the filepath of the text and return a list of preprocessed corpus
def preprocess_corpus(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)

    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)

    # remian only letters and numbers
    processed_tweets = [re.sub('[^a-zA-Z-0-9]', ' ', tweet) for tweet in X_raw]

    # make it into one string and out it into a list => the input format
    processed_tweets_in_one = ""
    for tweet in processed_tweets:
        processed_tweets_in_one += tweet
    processed_tweets_in_one = processed_tweets_in_one.lower()
    processed_tweets_in_one = [processed_tweets_in_one]

    # extract each word and make then into 1 list
    all_words = [nltk.word_tokenize(tweet) for tweet in processed_tweets_in_one]

    # Removing Stop Words
    for i in range(len(all_words)):  
        all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]
    
    return all_words

## Put all together => Feature Engineering Function

In [29]:
#This function takes in 
#@param: filepath => The filepath of the corpus
def get_top_k_index(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", min_df = 10 )
    X_train = vectoriser.fit_transform(X_raw)
    
    # calculating WLH
    word_counter = word_frequency_counter(X_train, y_train)

    # count frequency for each word by state
    WLH_list_occur_more_than_one_state, WLH_list_one_state = calculate_WHL_by_state(word_counter, vectoriser)

    # group WLH by state
    WLH_lists_gourpby_state = groupby_state(WLH_list_occur_more_than_one_state)
    
    # select top k WHL
#     num_features = len(vectoriser.get_feature_names())
#     percent = 0.14
#     k = percent*num_features/4
#     k = int(k)
    k = 30
    
    # extract top 10 WLH index for each state
    top_k_indice_for_each_state = []
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Brisbane_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Melbourne_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Perth_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Sydney_WLH_list, k))
    
    return top_k_indice_for_each_state, vectoriser

In [30]:
# select cloumn from X_train by top 10 index for each state
def get_selected_words(indice, vectoriser):
    return [vectoriser.get_feature_names()[index] for index in indice]

In [31]:
# input a sting of tweet, return a list of words cotain in this tweet, remove metion, hash tags and URL,
# only include letters and  numbers.
def preprocess_single_tweet(tweet):
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"@\S+", "", tweet)
    tweet = re.sub(r"@", "", tweet)
    tweet = re.sub(r"#\S+", "", tweet)
    tweet = re.sub(r"#", "", tweet)
    tweet = re.sub('[^a-zA-Z-0-9]', ' ', tweet)
    tweet = tweet.lower()
    tweet = nltk.word_tokenize(tweet)
    tweet =  [w for w in tweet if w not in stopwords.words('english')]
    
    return tweet

In [32]:
def raw_to_train(filepath,selected_words):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)
    
    X_train = []
    for tweet in X_raw:
        # preprocess tweet, remove hash tage, metion, url and only maintain letters and numbers
        text = preprocess_single_tweet(tweet)
        
        #convert tweet from text into trainning instance
#         X_train.append(words_to_train_instance_sum_all(selected_words, text))
#         X_train.append(words_to_train_instance_sum_top_k(selected_words, text, 1))
        X_train.append(words_to_train_instance_sum_larger_than_k(selected_words, text, 0.2))
    return X_train, y_train

In [33]:
def print_0_percet(X, y):
    num_instance = len(X)
    X_no_zero, y_no_zero = remove_all_zero_instance(X, y)

    num_all_zero_instance = num_instance - len(X_no_zero)
    print("The size of total dataset is %d"%num_instance)
    print("The size of all zero instance in this dataset is %d"%num_all_zero_instance)
    print("The ratio of all zero instances is %f "%(num_all_zero_instance/num_instance))

## How to use Word2Vec and WLH to extract information 

#### Strategies 1: for each word sum similarity of all words in tweet 

In [34]:
def words_to_train_instance_sum_all(selected_words, tweet):
    single_instance = [0]*len(selected_words)
    for i in list(range(len(selected_words))):
        score = 0
        selected_word = selected_words[i]
        for word in tweet:
            if (word2vec.wv.vocab.get(word)!=None):
                score += word2vec.wv.similarity(word, selected_word)
        single_instance[i] = score
    return single_instance

#### Strategies 2: for each word sum top k similarity of all words in tweet 

In [35]:
def words_to_train_instance_sum_top_k(selected_words, tweet, k):
    single_instance = [0]*len(selected_words)
    for i in list(range(len(selected_words))):
        score = []
        selected_word = selected_words[i]
        for word in tweet:
            if (word2vec.wv.vocab.get(word)!=None):
                score.append(word2vec.wv.similarity(word, selected_word))
        score.sort(reverse=True)
        single_instance[i] = sum(score[:k])
    return single_instance

#### Strategies 3: for each word add > k

In [36]:
def words_to_train_instance_sum_larger_than_k(selected_words, tweet, k):
    single_instance = [0]*len(selected_words)
    for i in list(range(len(selected_words))):
        score = []
        selected_word = selected_words[i]
        for word in tweet:
            if (word2vec.wv.vocab.get(word)!=None):
                if (word2vec.wv.similarity(word, selected_word) > k):
                    score.append(word2vec.wv.similarity(word, selected_word))
        score.sort(reverse=True)
        single_instance[i] = sum(score)
    return single_instance

# Model Training and Evaluation

In [37]:
# train_corpus = preprocess_corpus(train_raw_filepath)
# dev_corpus = preprocess_corpus(dev_raw_filepath)
# test_corpus = preprocess_corpus(test_raw_filepath)

In [38]:
# word2vec = Word2Vec([train_corpus[0],dev_corpus[0], test_corpus[0]] , min_count=2) 
# word2vec.save("word2vec.model")

In [39]:
word2vec = Word2Vec.load("word2vec.model")

In [40]:
len(word2vec.wv.vocab.keys())

51242

In [41]:
top_k_indice, vectoriser = get_top_k_index(train_raw_filepath)
selected_words = get_selected_words(top_k_indice, vectoriser)

In [53]:
train_small_filepath = "./train_raw_small.csv"
dev_small_filepath = "./dev_raw_small.csv"

In [42]:
X_train, y_train = raw_to_train(train_raw_filepath, selected_words)

In [74]:
len(X_train)

103364

## Print percetage of all 0 instances 

In [43]:
print_0_percet(X_train, y_train)

The size of total dataset is 103364
The size of all zero instance in this dataset is 6874
The ratio of all zero instances is 0.066503 


In [66]:
X_dev, y_dev = raw_to_train(dev_small_filepath, selected_words)
print_0_percet(X_dev, y_dev)

The size of total dataset is 2550
The size of all zero instance in this dataset is 118
The ratio of all zero instances is 0.046275 


In [44]:
X_test, y_test = raw_to_train(test_raw_filepath, selected_words)
print_0_percet(X_test, y_test)

The size of total dataset is 108148
The size of all zero instance in this dataset is 6868
The ratio of all zero instances is 0.063506 


In [45]:
models = [DummyClassifier(strategy='most_frequent'),
          KNeighborsClassifier(n_neighbors=2, algorithm='auto', metric = "cosine"),
          LogisticRegression()]
titles = ['Zero-R',
          "K-NN with cosine similarity",
          'Logistic Regression']

In [46]:
# read csv file
# i = 1
# train_X, train_y = load_dataset(train_filepath[i])
# X_dev_no_0, y_dev_no_0 = remove_all_zero_instance(X_dev, y_dev)
# dev_X, dev_y = load_dataset(dev_filepath[i])


# try each model without feature selection
for title, model in zip(titles, models):
    start = time.time()
#     model.fit(X_train, y_train)
#     acc = np.mean(cross_val_score(model, X_dev, y_dev, cv=10))
    acc = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    end = time.time()
    t = end - start
    print(title, acc, 'time:', t)

Zero-R 0.25 time: 10.085243940353394
K-NN with cosine similarity 0.3077960488888623 time: 3946.5098209381104
Logistic Regression 0.29201660568533633 time: 64.40976023674011


## Ensemble Methods 

### Stacking 

In [63]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
import time
import numpy as np

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        #print(yhats.shape)
        assert len(yhats) == len(X)
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


classifiers = [LogisticRegression(),
                KNeighborsClassifier(),
                GaussianNB(),
                MultinomialNB(),
                DecisionTreeClassifier()]

meta_classifier = DecisionTreeClassifier()
stacker = StackingClassifier(classifiers, meta_classifier)

def load_car_data(car_file):
    X = []
    y = []
    with open(car_file, mode='r') as fin:
        for line in fin:
            atts = line.strip().split(",")
            X.append(atts[:-1]) #all atts minus the last one
            y.append(atts[-1])
    onehot = OneHotEncoder()
    X = onehot.fit_transform(X).toarray()
    return X, y
# X, y = load_car_data('car.data')
# print('labels:', set(y))
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# stacker.fit(X_train, y_train)
# print('stacker acc:', stacker.score(X_test, y_test))

In [64]:
stacker.fit(X_train, y_train)

In [67]:
print('stacker acc on train:', stacker.score(X_train, y_train))
cv = 5
print('stacker cross-val acc  on train:', np.mean(cross_val_score(stacker.metaclassifier, X_train, y_train, cv=cv)))

stacker acc on train: 0.9534422981674097
stacker cross-val acc  on train: 0.27487492903356514


In [68]:
print('stacker acc on dev:', stacker.score(X_dev,y_dev))

stacker acc on dev: 0.2411764705882353


## Test Label Output 

In [None]:
model =  LogisticRegression()
model.fit(X_train, y_train)

In [None]:
predict_labels = model.predict(X_test)

In [None]:
def test_prediction_to_submit_file(test_lables, filepath):
    with open(filepath, 'w',  newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows([["Id", "Class"]])
        for i in list(range(len(test_lables))):
            index = '3' + str(i+1)
            writer.writerows([[index, test_lables[i]]])
        writeFile.close()

In [None]:
test_prediction_to_submit_file(predict_labels, "predicted_labels.csv")