In [92]:
import pandas as pd
import numpy as np

import nltk
import re

import gensim

import warnings
warnings.filterwarnings('ignore')

import multiprocessing
from tqdm import tqdm

from sklearn import metrics

In [93]:
train_data = pd.read_csv('data/train_full.csv', error_bad_lines=False) 
train_tweetContent = train_data[['tweet']]
train_labels = train_data['region']

dev_data = pd.read_csv('data/dev_full.csv', error_bad_lines=False)
dev_tweetContent = dev_data[['tweet']]
dev_labels = dev_data['region']

test_data = pd.read_csv('data/test_full.csv', error_bad_lines=False) 
test_tweetContent = test_data[['tweet']]

In [94]:
train_data

Unnamed: 0,region,user,tweet
0,NORTHEAST,USER_6197f95d,Watching LOST
1,NORTHEAST,USER_6197f95d,@USER_89a3500b i did
2,NORTHEAST,USER_6197f95d,"Maneuver so that I can put my team on, hopeful..."
3,NORTHEAST,USER_6197f95d,Darko was eating hamburgers in the locker room...
4,NORTHEAST,USER_6197f95d,Girl pack ya bags i'm bout to take you on a ride!
...,...,...,...
133790,SOUTH,USER_294708bb,Wow...completely forgot that it's #FollowFrida...
133791,SOUTH,USER_294708bb,Some groovy Texas blues music is playing in my...
133792,SOUTH,USER_294708bb,@USER_8cac2975 Haven't seen that one in ages. ...
133793,SOUTH,USER_294708bb,RT @USER_6d0753d3: You should all go and follo...


In [95]:
train_tweetContent

Unnamed: 0,tweet
0,Watching LOST
1,@USER_89a3500b i did
2,"Maneuver so that I can put my team on, hopeful..."
3,Darko was eating hamburgers in the locker room...
4,Girl pack ya bags i'm bout to take you on a ride!
...,...
133790,Wow...completely forgot that it's #FollowFrida...
133791,Some groovy Texas blues music is playing in my...
133792,@USER_8cac2975 Haven't seen that one in ages. ...
133793,RT @USER_6d0753d3: You should all go and follo...


In [96]:
train_labels

0         NORTHEAST
1         NORTHEAST
2         NORTHEAST
3         NORTHEAST
4         NORTHEAST
            ...    
133790        SOUTH
133791        SOUTH
133792        SOUTH
133793        SOUTH
133794        SOUTH
Name: region, Length: 133795, dtype: object

In [97]:
assert(len(train_tweetContent)==len(train_labels))
assert(len(dev_tweetContent)==len(dev_labels))

In [98]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = nltk.word_tokenize(text)    
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    
    return text

In [99]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/yunfei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yunfei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yunfei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [100]:
lst_stopwords = nltk.corpus.stopwords.words("english")

In [101]:
train_tweetContent = train_tweetContent["tweet"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=False, 
          lst_stopwords=lst_stopwords))

dev_tweetContent = dev_tweetContent["tweet"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=False, 
          lst_stopwords=lst_stopwords))

test_tweetContent = test_tweetContent["tweet"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=False, 
          lst_stopwords=lst_stopwords))

In [102]:
train_tweetContent

0                                             watching lost
1                                             user_89a3500b
2            maneuver put team hopefully sooner live dreams
3         darko eating hamburgers locker room played kni...
4                       girl pack ya bags im bout take ride
                                ...                        
133790    wowcompletely forgot followfriday ahem great p...
133791    groovy texas blues music playing head right ma...
133792           user_8cac2975 havent seen one ages remeber
133793    rt user_6d0753d3 go follow user_b7a77112 theyr...
133794                        user_4cf8655a congratulations
Name: tweet, Length: 133795, dtype: object

In [103]:
# Train Word2Vec model using all the tweets we have
# Tokenize all the tweets by splitting
train_tokens=[]
for tweet in train_tweetContent:
    train_tokens.append(tweet.split())

In [104]:
train_tokens

[['watching', 'lost'],
 ['user_89a3500b'],
 ['maneuver', 'put', 'team', 'hopefully', 'sooner', 'live', 'dreams'],
 ['darko',
  'eating',
  'hamburgers',
  'locker',
  'room',
  'played',
  'knicks',
  'lol'],
 ['girl', 'pack', 'ya', 'bags', 'im', 'bout', 'take', 'ride'],
 ['user_a9cf8f82', 'lol', 'yeah', 'check', 'bro'],
 ['rt',
  'user_5eae722d',
  'inhighschool',
  'mr',
  'stavisky',
  'dnt',
  'lk',
  'breath',
  'smelled',
  'lk',
  'straight',
  'ass',
  'lmao',
  'use',
  '2',
  'chase',
  'us',
  'dn',
  'hall'],
 ['inhighschool',
  'trenton',
  'high',
  'girls',
  'basketball',
  'team',
  'always',
  'best',
  'record',
  'highschool',
  'sports',
  'teams',
  'nothing',
  'changed'],
 ['cheerleading', 'team', 'need', 'mrs', 'grady', 'back'],
 ['girl', 'ashley', 'hines', 'beast', 'cant', 'stop'],
 ['da',
  'high',
  'always',
  'best',
  'fans',
  'going',
  'way',
  'back',
  'inhighschool'],
 ['rip', 'jack', 'daniels', 'burger'],
 ['user_43aad9dc',
  'oh',
  'shes',
  'bac

In [105]:
dev_tokens = []
for tweet in dev_tweetContent:
    dev_tokens.append(tweet.split())
    
test_tokens = []
for tweet in test_tweetContent:
    test_tokens.append(tweet.split())

In [106]:
# combine train dev and test
all_tokens = np.concatenate((train_tokens, dev_tokens, test_tokens))

In [107]:
len(all_tokens) 

157288

## Word2Vec

In [68]:
cores = multiprocessing.cpu_count()
# change "vecotr_size"
w2v_model = gensim.models.word2vec.Word2Vec(all_tokens, vector_size=100, window=8, min_count=1, sg=1, epochs=30, workers=cores)

In [69]:
w2v_model.wv.most_similar('donald')

[('user_62fa06b6', 0.7600353360176086),
 ('combover', 0.7507393956184387),
 ('lawnwit', 0.7232391834259033),
 ('user_c28d47f2', 0.7122253775596619),
 ('trump', 0.686151385307312),
 ('trumps', 0.6613792777061462),
 ('user_8674f459', 0.6577340364456177),
 ('quacks', 0.6566970944404602),
 ('user_49f50d6f', 0.6531112194061279),
 ('balmgtctfu', 0.6511390805244446)]

In [70]:
w2v_words = w2v_model.wv.index_to_key

In [71]:
w2v_words

['rt',
 'lol',
 'u',
 'im',
 'like',
 'get',
 'got',
 'dont',
 'good',
 'go',
 '2',
 'lmao',
 'na',
 'know',
 'love',
 'shit',
 'n',
 'ur',
 'need',
 'one',
 'see',
 'thats',
 'time',
 'right',
 'smh',
 'ya',
 'da',
 'think',
 'ass',
 'cant',
 'back',
 'wit',
 'day',
 'gon',
 'aint',
 'ff',
 'man',
 'want',
 'damn',
 'bout',
 'make',
 'oh',
 'say',
 'wan',
 'still',
 'nigga',
 'really',
 'yall',
 'ok',
 'going',
 'new',
 'fuck',
 'work',
 'well',
 'come',
 'yea',
 'feel',
 'yo',
 'today',
 'girl',
 'said',
 'way',
 'never',
 'jus',
 'ill',
 'twitter',
 'ta',
 'would',
 '4',
 'whats',
 'haha',
 'wat',
 'take',
 'home',
 'people',
 'hey',
 'tell',
 'yu',
 'b',
 'dat',
 'cuz',
 'even',
 'night',
 'didnt',
 'morning',
 'follow',
 'much',
 'look',
 'better',
 'let',
 'last',
 'always',
 'kno',
 'ppl',
 'bad',
 'yeah',
 'lil',
 'bitch',
 'tho',
 'yes',
 'hate',
 'life',
 'tonight',
 'real',
 'r',
 'goin',
 'hell',
 'sleep',
 'thanks',
 'call',
 'dnt',
 'nowplaying',
 'ima',
 'big',
 'â',
 'w

### wAvg W2V

In [73]:
# average Word2Vec
# compute average word2vec for each tweet
train_vectors = [] # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(train_tokens): # for each tweet
    sent_vec = np.zeros(150) # as word vectors are of zero length 300, 
    cnt_words = 0 # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a tweet
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    train_vectors.append(sent_vec)
print(len(train_vectors))
print(len(train_vectors[0]))

100%|██████████| 133795/133795 [10:37<00:00, 209.77it/s]

133795
150





In [83]:
dev_vectors = [] # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(dev_tokens): # for each tweet
    sent_vec = np.zeros(150) # as word vectors are of zero length 300, 
    cnt_words = 0 # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a tweet
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    dev_vectors.append(sent_vec)
print(len(dev_vectors))
print(len(dev_vectors[0]))

100%|██████████| 11475/11475 [00:50<00:00, 229.46it/s]

11475
150





In [84]:
test_vectors = [] # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(test_tokens): # for each tweet
    sent_vec = np.zeros(150) # as word vectors are of zero length 300, 
    cnt_words = 0 # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a tweet
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    test_vectors.append(sent_vec)
print(len(test_vectors))
print(len(test_vectors[0]))

100%|██████████| 12018/12018 [00:47<00:00, 250.82it/s]

12018
150





In [86]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
train_vectors = min_max_scaler.fit_transform(train_vectors)

In [87]:
train_vectors

array([[0.60767412, 0.50948017, 0.57863748, ..., 0.67414562, 0.36560552,
        0.44187854],
       [0.45954024, 0.60511371, 0.51907914, ..., 0.5504011 , 0.40229383,
        0.56046841],
       [0.49631449, 0.50469462, 0.52757634, ..., 0.38569734, 0.4807791 ,
        0.50534024],
       ...,
       [0.33563074, 0.5193974 , 0.6247576 , ..., 0.37998389, 0.44046799,
        0.521466  ],
       [0.41417634, 0.50626835, 0.49226694, ..., 0.52458889, 0.49011018,
        0.48464071],
       [0.42156916, 0.6184511 , 0.54859902, ..., 0.56158413, 0.5088017 ,
        0.49315003]])

In [91]:
## Zero-R baseline
from collections import Counter

majority_class = ""
zero_r_predictions = []

label_counter = Counter(train_labels)
majority_class = label_counter.most_common()[0][0]

for i in range(len(dev_vectors)):
    zero_r_predictions.append(majority_class)
    
f1 = metrics.f1_score(dev_labels, zero_r_predictions, average='micro')
print('F1：', '%.4f' % f1)

F1： 0.3743


In [51]:
majority_class

'NORTHEAST'

In [112]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=1).fit(train_vectors, train_labels)

predicted_labels = clf.predict(dev_vectors)
n=100
f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print(f'KNN F1 on N={n}： {f1:.4f}')

KNN F1 on N=100： 0.2631


In [88]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(n_jobs=-1, verbose=True).fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


F1： 0.3762
Accuracy： 0.3762


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   12.0s finished


In [33]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

clf = LinearSVC(C=0.12, random_state=42).fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

F1： 0.4715
Accuracy： 0.4715


In [90]:
from sklearn.neural_network import BernoulliRBM
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-06,
              hidden_layer_sizes=(100,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=10000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

mlp.fit(train_vectors, train_labels)
predicted_labels = mlp.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

F1： 0.3248
Accuracy： 0.3248


In [44]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0).fit(train_vectors, train_labels)

predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

F1： 0.2743
Accuracy： 0.3393


In [41]:
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_jobs=-1).fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

F1： 0.1730
Accuracy： 0.3772


In [None]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

In [None]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

In [40]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

f1 = metrics.f1_score(dev_labels, predicted_labels, average='micro')
print('F1：', '%.4f' % f1)

F1： 0.1362
Accuracy： 0.3743


### Write to file

In [141]:
clf = LogisticRegression(max_iter=300, n_jobs=-1, verbose=True).fit(train_vectors, train_labels)
predicted_labels = clf.predict(test_vectors)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   37.8s finished


In [142]:
df = pd.DataFrame({"region": predicted_labels})

In [143]:
df.index.name = "id"
df.index += 1

In [144]:
df

Unnamed: 0_level_0,region
id,Unnamed: 1_level_1
1,NORTHEAST
2,SOUTH
3,SOUTH
4,NORTHEAST
5,SOUTH
...,...
12014,SOUTH
12015,NORTHEAST
12016,NORTHEAST
12017,NORTHEAST


In [145]:
df.to_csv('result.csv', index=['id', 'region'])