In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
import csv
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import time
from gensim.models import Word2Vec
from nltk.corpus import stopwords  
import nltk
from nltk.stem.lancaster import LancasterStemmer
import sys
import unicodedata

In [3]:
# Locations of the raw tab-separated data files, given relative to the
# directory the notebook is run from.
train_raw_filepath =  "./2019S1-proj2-data_dos/train-raw.tsv"
dev_raw_filepath =  "./2019S1-proj2-data_dos/dev-raw.tsv"
test_raw_filepath =  "./2019S1-proj2-data_dos/test-raw.tsv"

In [4]:
def read_tsv(filepath):
    """Load texts and their labels from a tab-separated file.

    Every non-empty row contributes its second column (index 1) as the label
    and its last column as the text. Quote characters are treated as ordinary
    text (``csv.QUOTE_NONE``).

    Args:
        filepath: Path of the TSV file to read.

    Returns:
        A ``(text, label)`` tuple of two parallel lists of strings.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    label, text = [], []
    # newline="" is what the csv module asks for; the explicit encoding keeps
    # decoding independent of the platform's locale default.
    with open(filepath, newline="", encoding="utf-8") as raw:
        reader = csv.reader(raw, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # Blank lines are returned as [] and would crash on row[1].
                continue
            label.append(row[1])
            text.append(row[-1])
    return text, label

# Text Pre-filtering

In [5]:
def remove_URL(texts):
    """Strip URLs from every string of ``texts``, modifying the list in place.

    Each occurrence of ``http`` followed by one or more non-whitespace
    characters is deleted. Nothing is returned.
    """
    for position, current in enumerate(texts):
        texts[position] = re.sub(r"http\S+", "", current)
        
def remove_metion(texts):
    """Delete @-mentions from every string of ``texts``, in place.

    First every ``@`` followed by non-whitespace characters is removed, then
    any ``@`` character that is still left over. Nothing is returned.
    """
    for position, current in enumerate(texts):
        without_handles = re.sub(r"@\S+", "", current)
        texts[position] = re.sub(r"@", "", without_handles)
        
def remove_hash(texts):
    """Delete hashtags from every string of ``texts``, in place.

    First every ``#`` followed by non-whitespace characters is removed, then
    any ``#`` character that is still left over. Nothing is returned.
    """
    for position, current in enumerate(texts):
        without_tags = re.sub(r"#\S+", "", current)
        texts[position] = re.sub(r"#", "", without_tags)
def remove_unicode(texts):
    """Delete literal backslash-u escape sequences from each string, in place.

    A match is a backslash, the letter ``u`` and exactly four ASCII letters or
    digits (any letters, not only hexadecimal ones). Nothing is returned.
    """
    escape_pattern = r"\\u[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]"
    for position, current in enumerate(texts):
        texts[position] = re.sub(escape_pattern, "", current)

# Translation table for str.translate: every code point whose Unicode general
# category starts with "P" (punctuation) maps to None, which deletes it.
tbl = {
    code_point: None
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point)).startswith('P')
}


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``tbl`` removed."""
    stripped = text.translate(tbl)
    return stripped

# Chi-square feature selection

In [6]:
def read_tsv_2(filepath, city_name):
    """Load texts with one-vs-rest labels for ``city_name`` from a TSV file.

    Every non-empty row contributes its last column as the text and a boolean
    telling whether its second column (index 1) equals ``city_name``. Quote
    characters are treated as ordinary text (``csv.QUOTE_NONE``).

    Args:
        filepath: Path of the TSV file to read.
        city_name: Label value that counts as the positive class.

    Returns:
        A ``(text, label)`` tuple: a list of strings and a parallel list of
        booleans.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    label, text = [], []
    # newline="" is what the csv module asks for; the explicit encoding keeps
    # decoding independent of the platform's locale default.
    with open(filepath, newline="", encoding="utf-8") as raw:
        reader = csv.reader(raw, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # Blank lines are returned as [] and would crash on row[1].
                continue
            label.append(row[1] == city_name)
            text.append(row[-1])
    return text, label

In [7]:
def generate_top_k_words_for_each_city(num, city_name):
    """Return the ``num`` words that best separate ``city_name`` by chi-square.

    The training file is read with boolean labels (row belongs to
    ``city_name`` or not). URLs, hashtags, mentions, backslash-u escapes and
    punctuation are stripped, and the remaining text is turned into word
    counts with English stop words removed. ``SelectKBest(chi2)`` then keeps
    the ``num`` highest-scoring vocabulary entries.

    Args:
        num: Number of features to keep (passed to ``SelectKBest`` as ``k``).
        city_name: Label value treated as the positive class.

    Returns:
        List of the selected vocabulary words, ordered by their column index
        in the count matrix.
    """
    X_raw, y_train = read_tsv_2(train_raw_filepath, city_name)
    # The remove_* helpers edit the list in place.
    remove_URL(X_raw)
    remove_hash(X_raw)
    remove_metion(X_raw)
    remove_unicode(X_raw)
    X_raw = [remove_punctuation(text) for text in X_raw]

    # Count word frequencies.
    vectoriser = CountVectorizer(stop_words="english")
    X_train = vectoriser.fit_transform(X_raw)

    # Only the fitted selector is needed, not the reduced matrix.
    selector = SelectKBest(chi2, k=num)
    selector.fit(X_train, y_train)

    # Fetch the vocabulary once (the original rebuilt it for every selected
    # feature). get_feature_names() was removed in scikit-learn 1.2, so use
    # its replacement when it exists.
    if hasattr(vectoriser, "get_feature_names_out"):
        feature_names = vectoriser.get_feature_names_out()
    else:
        feature_names = vectoriser.get_feature_names()

    return [feature_names[index]
            for index in selector.get_support(indices=True)]

In [8]:
# Class labels for which a per-city word list is generated.
cities = ["Melbourne", "Sydney", "Perth", "Brisbane"]
# Accumulator for the per-city word lists; filled by a later cell.
top_k_for_each_city = []

In [0]:
# One list of the 10000 highest chi-square words per city, in the order of
# `cities`. The list is rebuilt from scratch rather than appended to, so
# re-running this cell does not accumulate duplicate entries.
top_k_for_each_city = [
    generate_top_k_words_for_each_city(10000, city) for city in cities
]

In [0]:
# Set union (not difference) of all per-city word lists, however many cities
# there are. Sorting makes the resulting order reproducible between runs.
# NOTE: this rebinds the name from a list of lists to a flat list of words, so
# the per-city lists must be regenerated before this cell is run again.
top_k_for_each_city = sorted(set().union(*top_k_for_each_city))

In [0]:
# Display how many entries top_k_for_each_city currently holds.
len(top_k_for_each_city)

# Training

In [166]:
def preprocess_3(filepath, filepath2):
    """Concatenate the texts of two labelled files and the test file.

    Processing applied to each source:

    * ``filepath``: URLs removed.
    * ``filepath2``: URLs removed, then punctuation removed.
    * ``test_raw_filepath`` (module-level): URLs removed.

    Returns:
        ``(texts, labels)``. ``texts`` holds the rows of all three files in
        the order above; ``labels`` holds the labels of ``filepath`` and
        ``filepath2`` only.

    NOTE(review): ``texts`` is therefore longer than ``labels``, and
    punctuation is stripped from the second file only. Confirm both are
    intended before using the result for supervised training.
    """
    first_texts, labels = read_tsv(filepath)
    remove_URL(first_texts)  # in place

    second_texts, second_labels = read_tsv(filepath2)
    remove_URL(second_texts)  # in place
    second_texts = list(map(remove_punctuation, second_texts))

    first_texts += second_texts
    labels += second_labels

    # Labels of the test file are read but deliberately not returned.
    test_texts, _unused_test_labels = read_tsv(test_raw_filepath)
    remove_URL(test_texts)  # in place

    first_texts += test_texts
    return first_texts, labels

In [264]:
def preprocess_2(filepath, filepath2):
    """Read two labelled files and return their cleaned texts and labels.

    For each file, in the order given, URLs and then punctuation are removed
    from every text. Hashtags, mentions and backslash-u escapes are left
    untouched.

    Returns:
        ``(texts, labels)`` with the rows of ``filepath`` first, followed by
        the rows of ``filepath2``.
    """
    all_texts, all_labels = [], []
    for path in (filepath, filepath2):
        texts, labels = read_tsv(path)
        remove_URL(texts)  # in place
        all_texts.extend([remove_punctuation(text) for text in texts])
        all_labels.extend(labels)
    return all_texts, all_labels

In [265]:
def preprocess(filepath):
    """Read one file and return its cleaned texts together with its labels.

    URLs and then punctuation are removed from every text. Hashtags, mentions
    and backslash-u escapes are left untouched.

    Returns:
        ``(texts, labels)`` as two parallel lists.
    """
    texts, labels = read_tsv(filepath)
    remove_URL(texts)  # in place
    cleaned = list(map(remove_punctuation, texts))
    return cleaned, labels

# Select most common words from test data

# Prediction 

In [321]:
# Bag-of-words vectoriser: English stop words are dropped and a term is kept
# once it occurs in at least one document (min_df=1).
vectoriser = CountVectorizer(stop_words='english', min_df=1)

In [322]:
# Training corpus: the train and dev files combined, with URLs and punctuation
# stripped by preprocess_2.
X_text, y_train = preprocess_2(train_raw_filepath, dev_raw_filepath)

In [323]:
# Display the number of documents in the combined corpus.
len(X_text)

140680

In [324]:
# Learn the vocabulary from the combined corpus and build its document-term
# count matrix.
X_train = vectoriser.fit_transform(X_text)

In [329]:
# Display the matrix shape: (number of documents, vocabulary size).
X_train.shape

(140680, 200424)

In [326]:
from sklearn.ensemble import VotingClassifier

In [345]:
# Hard-voting ensemble over three *different* classifiers. The original list
# contained two identical default BernoulliNB models; they always agree and so
# always held 2 of the 3 votes, which made the ensemble equivalent to
# BernoulliNB alone. The duplicate is replaced by logistic regression so that
# the majority vote can actually differ from any single model.
vs = VotingClassifier(
    estimators=[
        ('MNB', MultinomialNB()),
        ('BNB', BernoulliNB()),
        ('LR', LogisticRegression()),
    ],
    voting='hard',
)

In [346]:
# Train the ensemble on the count matrix; the name vs is rebound to whatever
# fit returns.
vs = vs.fit(X_train, y_train)

In [None]:
# Sanity check of the fitted ensemble on the first few training rows. The
# original called vs.predict(X), but no variable X is defined anywhere in the
# visible notebook, so the cell raised NameError.
vs.predict(X_train[:5])

In [220]:
# Bernoulli naive Bayes with default hyper-parameters.
BNB = BernoulliNB()

In [221]:
# Train the Bernoulli model on the count matrix and training labels.
BNB.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [79]:
# Multinomial naive Bayes with default hyper-parameters.
MNB = MultinomialNB()

In [68]:
# Logistic regression with default hyper-parameters.
LR = LogisticRegression()

In [80]:
# Train the multinomial model on the count matrix and training labels.
MNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [69]:
# Train the logistic-regression model on the count matrix and training labels.
LR.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed so that the
# fitted trees, and therefore the predictions, are reproducible across runs.
rf = RandomForestClassifier(random_state=0)

In [61]:
# Train the random forest on the count matrix and training labels.
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [62]:
def test_prediction_to_submit_file(test_lables, filepath):
    with open(filepath, 'w',  newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows([["Id", "Class"]])
        for i in list(range(len(test_lables))):
            index = '3' + str(i+1)
            writer.writerows([[index, test_lables[i]]])
        writeFile.close()

In [347]:
# Load the test file and clean it the same way as the training corpus (URLs
# and punctuation stripped). y_test holds whatever the file's label column
# contains.
X_test_text, y_test = preprocess(test_raw_filepath)

In [349]:
X_test.shape

(108148, 200424)

In [348]:
X_test = vectoriser.transform(X_test_text)

In [81]:
# Multinomial naive Bayes predictions. Every prediction cell assigns to the
# same name, so this value is replaced by the next prediction cell that runs.
predict_labels = MNB.predict(X_test)

In [70]:
# Logistic-regression predictions; overwrites the previous predict_labels.
predict_labels = LR.predict(X_test)

In [65]:
# Random-forest predictions; overwrites the previous predict_labels.
predict_labels = rf.predict(X_test)

In [144]:
# Bernoulli naive Bayes predictions; overwrites the previous predict_labels.
predict_labels = BNB.predict(X_test)

In [350]:
# Voting-ensemble predictions. This is the last assignment to predict_labels
# in notebook order, so it is the value in effect after a top-to-bottom run.
predict_labels = vs.predict(X_test)

In [351]:
# Output file for the submission, relative to the working directory.
filepath = "predictLabel20424.voting.csv"

In [352]:
# Write the current predict_labels to the submission CSV (overwrites the file).
test_prediction_to_submit_file(predict_labels, filepath)