In [1]:
import csv
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import time
from gensim.models import Word2Vec
from nltk.corpus import stopwords  
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91260\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91260\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Read raw text 

In [20]:
train_raw_filepath =  "./Short-Text-Location-Prediction/2019S1-proj2-data_dos/train-raw.tsv"
dev_raw_filepath =  "./Short-Text-Location-Prediction/2019S1-proj2-data_dos/dev-raw.tsv"
test_raw_filepath =  "./Short-Text-Location-Prediction/2019S1-proj2-data_dos/test-raw.tsv"

In [3]:
# read tsv file into a 2D array
def read_tsv(filepath):
    label, text = [], []
    with open(filepath) as raw:
        reader = csv.reader(raw, delimiter="\t", quoting = csv.QUOTE_NONE)
        for row in reader:
            label.append(row[1])
            text.append(row[-1])
    return text, label

# Feature Engineering

## Utility

In [4]:
def is_all_zero(instance):
    for attr in instance:
        if int(attr) != 0:
            return False
    return True

def remove_all_zero_instance(X, y):
    new_X, new_y = [], []
    for i in range(len(y)):
        instance = X[i, :].toarray().sum(axis = 0)
        if is_all_zero(instance) is False:
            new_X.append(X[i])
            new_y.append(y[i])
    return new_X, new_y

## Text Pre-filtering  

In [5]:
def remove_URL(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"http\S+", "", texts[i])
        
def remove_metion(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"@\S+", "", texts[i])
        texts[i] = re.sub(r"@", "", texts[i])
        
def remove_hash(texts):
    for i in list(range(len(texts))):
        texts[i] = re.sub(r"#\S+", "", texts[i])
        texts[i] = re.sub(r"#", "", texts[i])

## WHL calculating 

In [6]:
class Word_Counter:
    # Initializer / Instance Attributes
    def __init__(self, size):
        self.total_word_count = [0]*size
        self.Melbourne_word_count = [0]*size
        self.Sydney_word_count = [0]*size 
        self.Perth_word_count = [0]*size        
        self.Brisbane_word_count = [0]*size        
class WLH_lists:
    # Initializer / Instance Attributes
    def __init__(self):
        self.Melbourne_WLH_list = []
        self.Sydney_WLH_list = [] 
        self.Perth_WLH_list = []        
        self.Brisbane_WLH_list = []        

In [7]:
def word_frequency_counter(counts, lables):
    # create a list to store count
    num_words = counts.shape[1]
    counter = Word_Counter(num_words)
    
    for i in list(range(len(lables))):
        instance = counts[i, :]
        if (lables[i] == "Melbourne"):
            counter.Melbourne_word_count += instance.toarray().sum(axis=0)
        elif (lables[i] == "Sydney"):
            counter.Sydney_word_count  += instance.toarray().sum(axis=0)
        elif (lables[i] == "Perth"):
            counter.Perth_word_count += instance.toarray().sum(axis=0)
        elif (lables[i] == "Brisbane"):
            counter.Brisbane_word_count += instance.toarray().sum(axis=0)
    
    counter.total_word_count = counter.Melbourne_word_count + \
                                counter.Sydney_word_count +\
                                counter.Perth_word_count +\
                                counter.Brisbane_word_count
    return counter

In [8]:
def occur_in_n_state(WHL_list):
    count = 0
    for WHL in WHL_list:
        if WHL > 0:
            count += 1
    return count

In [9]:
def calculate_WHL_by_state(word_counter, vectoriser):
    WLH_list_occur_more_than_one_state = []
    WLH_list_one_state = []
    
    #  total number of word used for each word counter
    Melbourne_total_word_num = word_counter.Melbourne_word_count.sum()
    Brisbane_total_word_num = word_counter.Brisbane_word_count.sum()
    Sydney_total_word_num = word_counter.Sydney_word_count.sum()
    Perth_total_word_num = word_counter.Perth_word_count.sum()
    all_state_total_word_num = word_counter.total_word_count.sum()

    for i in list(range(len(word_counter.total_word_count))):    
    # for i in list(range(100)):
        curr_word_list = [] # [WHL, word, frequency,  state]
        curr_word = vectoriser.get_feature_names()[i]

        curr_word_total_prob = word_counter.total_word_count[i]/all_state_total_word_num

        WHL_Mel = (word_counter.Melbourne_word_count[i]/Melbourne_total_word_num)/curr_word_total_prob
        WHL_Syd = (word_counter.Sydney_word_count[i]/Sydney_total_word_num)/curr_word_total_prob
        WHL_Per = (word_counter.Perth_word_count[i]/Perth_total_word_num)/curr_word_total_prob
        WHL_Bri = (word_counter.Brisbane_word_count[i]/Brisbane_total_word_num)/curr_word_total_prob

        WHL_list = [WHL_Mel, WHL_Bri, WHL_Per, WHL_Syd]
        if (occur_in_n_state(WHL_list) > 1):
            max_WHL = max(WHL_list)

            state = [] # (stateName, wordStateCount, wordTotalCount)
            if (WHL_Mel == max_WHL):
                state.append(["Melbourne", word_counter.Melbourne_word_count[i], word_counter.Melbourne_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Bri == max_WHL):
                state.append(["Brisbane", word_counter.Brisbane_word_count[i], word_counter.Brisbane_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Per == max_WHL):
                state.append(["Perth", word_counter.Perth_word_count[i], word_counter.Perth_word_count[i]/word_counter.total_word_count[i]])
            if (WHL_Syd == max_WHL):
                state.append(["Sydney", word_counter.Sydney_word_count[i], word_counter.Sydney_word_count[i]/word_counter.total_word_count[i]])
            curr_word_list.append(max_WHL)
            curr_word_list.append(i)
            curr_word_list.append(curr_word)
            curr_word_list.append(state)
            WLH_list_occur_more_than_one_state.append(curr_word_list)
        else:
            max_WHL = max(WHL_list)

            state = [] # (stateName, wordStateCount, wordTotalCount)
            if (WHL_Mel == max_WHL):
                state.append(["Melbourne", word_counter.Melbourne_word_count[i], word_counter.Melbourne_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Bri == max_WHL):
                state.append(["Brisbane", word_counter.Brisbane_word_count[i], word_counter.Brisbane_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Per == max_WHL):
                state.append(["Perth", word_counter.Perth_word_count[i], word_counter.Perth_word_count[i]/word_counter.total_word_count[i]])
            elif (WHL_Syd == max_WHL):
                state.append(["Sydney", word_counter.Sydney_word_count[i], word_counter.Sydney_word_count[i]/word_counter.total_word_count[i]])
            curr_word_list.append(max_WHL)
            curr_word_list.append(i)
            curr_word_list.append(curr_word)
            curr_word_list.append(state)
            WLH_list_one_state.append(curr_word_list)
    return WLH_list_occur_more_than_one_state, WLH_list_one_state

In [10]:
def groupby_state(WLHs):
    WLH_groupby_state = WLH_lists()
        # Initializer / Instance Attributes

        
    for WLH in WLHs:
        state = WLH[3][0][0]
        if state == "Melbourne":
            WLH_groupby_state.Melbourne_WLH_list.append(WLH)
        elif state == "Sydney":
            WLH_groupby_state.Sydney_WLH_list.append(WLH)
        elif state == "Perth":
            WLH_groupby_state.Perth_WLH_list.append(WLH)            
        elif state == "Brisbane":
            WLH_groupby_state.Brisbane_WLH_list.append(WLH)
    sorted(WLH_groupby_state.Melbourne_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Sydney_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Perth_WLH_list, key=lambda x: x[0], reverse=True)
    sorted(WLH_groupby_state.Brisbane_WLH_list, key=lambda x: x[0], reverse=True)
    return WLH_groupby_state

## Extracting Top k WHL for each state 

In [11]:
def extract_top_k_index(state_WLH_list, k):
    indice = []
    WLH_top_k = (sorted(state_WLH_list, key=lambda x: x[0], reverse=True))[:k]
    
    for WLH in WLH_top_k:
        indice.append(WLH[1])
    return indice

In [12]:
# select cloumn from X_train by top 10 index for each state
def extract_column_by_index(matrix, indice, vectoriser):
    selected_words = []
    for index in indice:
        selected_words.append(vectoriser.get_feature_names()[index])
    
    result = []
    length = matrix.shape[0]
    for i in list(range(length)):
        row = matrix[i, :].toarray().sum(axis = 0)
        extract = []
        for index in indice:
            extract.append(row[index])
        result.append(extract)
    return result, selected_words
# feature engineering done

## Word2Vec 

In [13]:
# This function take the filepath of the text and return a word2vec object
def generate_word2vec(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)

    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)

    # remian only letters and numbers
    processed_tweets = [re.sub('[^a-zA-Z-0-9]', ' ', tweet) for tweet in X_raw]

    # make it into one string and out it into a list => the input format
    processed_tweets_in_one = ""
    for tweet in processed_tweets:
        processed_tweets_in_one += tweet
    processed_tweets_in_one = processed_tweets_in_one.lower()
    processed_tweets_in_one = [processed_tweets_in_one]

    # extract each word and make then into 1 list
    all_words = [nltk.word_tokenize(tweet) for tweet in processed_tweets_in_one]

    # Removing Stop Words
    for i in range(len(all_words)):  
        all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

    # Create word2vec    
    return Word2Vec(all_words, min_count=2)

## Put all together => Feature Engineering Function

In [14]:
def preprocessing_test(filepath, selected_words):
    # read the file into two parts, text and its location
    X_raw, y = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
#     remove_hash(X_raw)
#     remove_metion(X_raw)
    
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", vocabulary=selected_words)
    X_sparse = vectoriser.fit_transform(X_raw)
    
    
    # make sparse matrix into 2D list
    X = []
    for i in list(range(len(y))):
        X.append(X_sparse[i, :].toarray().sum(axis = 0))
    return X, y

In [15]:
#This function takes in 
#@param: filepath => The filepath of the corpus
def preprocessing_train(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", min_df = 15 )
    X_train = vectoriser.fit_transform(X_raw)
    
    # calculating WLH
    word_counter = word_frequency_counter(X_train, y_train)

    # count frequency for each word by state
    WLH_list_occur_more_than_one_state, WLH_list_one_state = calculate_WHL_by_state(word_counter, vectoriser)

    # group WLH by state
    WLH_lists_gourpby_state = groupby_state(WLH_list_occur_more_than_one_state)
    
    # select top k WHL
    num_features = len(vectoriser.get_feature_names())
    percent = 0.14
    k = percent*num_features/4
    k = int(k)
    k = 300
    
    # extract top 10 WLH index for each state
    top_k_indice_for_each_state = []
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Brisbane_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Melbourne_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Perth_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Sydney_WLH_list, k))
    
    # extract columns
    new_train, selected_words = extract_column_by_index(X_train, top_k_indice_for_each_state, vectoriser)
    return new_train, y_train, selected_words

In [16]:
#This function takes in 
#@param: filepath => The filepath of the corpus
def get_top_k_index(filepath):
    # read the file into two parts, text and its location
    X_raw, y_train = read_tsv(filepath)
    
    # remove URL, hash tag and metion
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    
    # initilaze vectoriser
    vectoriser = CountVectorizer(stop_words="english", min_df = 15 )
    X_train = vectoriser.fit_transform(X_raw)
    
    # calculating WLH
    word_counter = word_frequency_counter(X_train, y_train)

    # count frequency for each word by state
    WLH_list_occur_more_than_one_state, WLH_list_one_state = calculate_WHL_by_state(word_counter, vectoriser)

    # group WLH by state
    WLH_lists_gourpby_state = groupby_state(WLH_list_occur_more_than_one_state)
    
    # select top k WHL
    num_features = len(vectoriser.get_feature_names())
    percent = 0.14
    k = percent*num_features/4
    k = int(k)
    k = 20
    
    # extract top 10 WLH index for each state
    top_k_indice_for_each_state = []
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Brisbane_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Melbourne_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Perth_WLH_list, k))
    top_k_indice_for_each_state.extend(extract_top_k_index(WLH_lists_gourpby_state.Sydney_WLH_list, k))
    
    return top_k_indice_for_each_state, vectoriser

In [17]:
# select cloumn from X_train by top 10 index for each state
def get_selected_words(indice, vectoriser):
    return [vectoriser.get_feature_names()[index] for index in indice]

In [18]:
def print_0_percet(X, y):
    num_instance = X.shape[0]
    X_no_zero, y_no_zero = remove_all_zero_instance(X, y)

    num_all_zero_instance = num_instance - len(X_no_zero.shape[0])
    print("The size of total dataset is %d"%num_instance)
    print("The size of all zero instance in this dataset is %d"%num_all_zero_instance)
    print("The ratio of all zero instances is %f "%(num_all_zero_instance/num_instance))

## Execution of feature engineering

In [90]:
word2vec = generate_word2vec(train_raw_filepath)

In [95]:
len(word2vec.wv.vocab.keys())

32159

In [111]:
top_k_indice, vectoriser = get_top_k_index(train_raw_filepath)

In [116]:
selected_words = get_selected_words(top_k_indice, vectoriser)

In [120]:
word2vec.wv.similarity('caesar', 'caesar')

1.0000000000000002

In [119]:
word2vec.wv.most_similar('caesar')

[('convertir', 0.4111023545265198),
 ('crc', 0.38248711824417114),
 ('obscure', 0.3560269773006439),
 ('ages', 0.34935683012008667),
 ('mois', 0.34896236658096313),
 ('verandah', 0.3439221680164337),
 ('udced', 0.3431850075721741),
 ('uc559', 0.3405205309391022),
 ('tame', 0.3397553265094757),
 ('pedir', 0.33727142214775085)]

In [16]:
X_train, y_train, selected_words = preprocessing_train(train_raw_filepath)

In [17]:
print_0_percet(X_train, y_train)

The size of total dataset is 103364
The size of all zero instance in this dataset is 75395
The ratio of all zero instances is 0.729413 


In [18]:
X_dev, y_dev = preprocessing_test(dev_raw_filepath, selected_words)
print_0_percet(X_dev, y_dev)

The size of total dataset is 37316
The size of all zero instance in this dataset is 27187
The ratio of all zero instances is 0.728561 


In [19]:
X_test, y_test = preprocessing_test(test_raw_filepath, selected_words)
print_0_percet(X_test, y_test)

The size of total dataset is 108148
The size of all zero instance in this dataset is 83376
The ratio of all zero instances is 0.770944 


# MI and ChiSq

In [21]:
X_train_raw, y_train = read_tsv(train_raw_filepath)
X_dev_raw, y_dev = read_tsv(dev_raw_filepath)
X_test_raw, y_test = read_tsv(test_raw_filepath)

In [22]:
remove_URL(X_train_raw)
remove_hash(X_train_raw)
remove_metion(X_train_raw)

In [23]:
remove_URL(X_test_raw)
remove_hash(X_test_raw)
remove_metion(X_test_raw)

In [24]:
vectoriser = CountVectorizer(stop_words='english')
X_train = vectoriser.fit_transform(X_train_raw)

In [25]:
X_test = vectoriser.transform(X_test_raw)

In [53]:
X_test.shape

(108148, 69932)

In [54]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(score_func=mutual_info_classif, k=10)
# X_train_mi = mi.fit_transform(X_train,y_train)
# X_test_mi = mi.transform(X_test)

# print(X_test_mi.shape, X_train_mi.shape)

# for feat_num in mi.get_support(indices=True):
#     print(vectoriser.get_feature_names()[feat_num])

In [61]:
X_train_mi = mi.fit_transform(X_train[:10] ,y_train[:10])

In [62]:
X_train_mi.shape

(10, 10)

In [63]:
for feat_num in mi.get_support(indices=True):
    print(vectoriser.get_feature_names()[feat_num])

australia
hoping
joel
time
u2026
ud83d
ude00
welcome
wizard
wonderful


# Model Training & Evaluation

In [20]:
models = [DummyClassifier(strategy='most_frequent'),
          GaussianNB(),
          MultinomialNB(),
          LogisticRegression()]
titles = ['Zero-R',
          'GNB',
          'MNB',
          'Logistic Regression']

In [21]:
# read csv file
# i = 1
# train_X, train_y = load_dataset(train_filepath[i])
X_dev_no_0, y_dev_no_0 = remove_all_zero_instance(X_dev, y_dev)
# dev_X, dev_y = load_dataset(dev_filepath[i])


# try each model without feature selection
for title, model in zip(titles, models):
    start = time.time()
    model.fit(X_train, y_train)
    acc = np.mean(cross_val_score(model, X_dev, y_dev, cv=10))
#     acc = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    end = time.time()
    t = end - start
    print(title, acc, 'time:', t)

Zero-R 0.25 time: 27.401124715805054
GNB 0.2981835844959957 time: 30.704606771469116
MNB 0.32465994139537885 time: 21.044564485549927
Logistic Regression 0.3263209327518871 time: 27.389904022216797


In [22]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
predict_labels = model.predict(X_test)

In [24]:
def test_prediction_to_submit_file(test_lables, filepath):
    with open(filepath, 'w',  newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows([["Id", "Class"]])
        for i in list(range(len(test_lables))):
            index = '3' + str(i+1)
            writer.writerows([[index, test_lables[i]]])
        writeFile.close()

In [25]:
test_prediction_to_submit_file(predict_labels, "predicted_labels.csv")

In [26]:
predict_labels[0]

'Sydney'

## 