In [153]:
import pandas as pd
import numpy as np

import nltk
import re

import gensim

import warnings
warnings.filterwarnings('ignore')

import multiprocessing
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

In [99]:
tweetData = pd.read_csv('data/train_full.csv', error_bad_lines=False) 
train_tweetContent = tweetData[['tweet']]
train_labels = tweetData['region']

tweetData = pd.read_csv('data/dev_full.csv', error_bad_lines=False)
dev_tweetContent = tweetData[['tweet']]
dev_labels = tweetData['region']

tweetData = pd.read_csv('data/test_full.csv', error_bad_lines=False) 
test_tweetContent = tweetData[['tweet']]
test_tweetID = tweetData['region']

In [100]:
train_tweetContent

Unnamed: 0,tweet
0,Watching LOST
1,@USER_89a3500b i did
2,"Maneuver so that I can put my team on, hopeful..."
3,Darko was eating hamburgers in the locker room...
4,Girl pack ya bags i'm bout to take you on a ride!
...,...
133790,Wow...completely forgot that it's #FollowFrida...
133791,Some groovy Texas blues music is playing in my...
133792,@USER_8cac2975 Haven't seen that one in ages. ...
133793,RT @USER_6d0753d3: You should all go and follo...


In [101]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    
    # remove @mentions (@USER)
#     text = re.sub(r'\B@USER\w+', '', text)
    
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = nltk.word_tokenize(text)    
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    
    return text

In [102]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/yunfei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yunfei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yunfei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [103]:
lst_stopwords = nltk.corpus.stopwords.words("english")

In [104]:
train_tweetContent["tweet_clean"] = train_tweetContent["tweet"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))

dev_tweetContent["tweet_clean"] = dev_tweetContent["tweet"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))

test_tweetContent["tweet_clean"] = test_tweetContent["tweet"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))

In [105]:
train_tweetContent

Unnamed: 0,tweet,tweet_clean
0,Watching LOST,watching lost
1,@USER_89a3500b i did,user_89a3500b
2,"Maneuver so that I can put my team on, hopeful...",maneuver put team hopefully sooner live dream
3,Darko was eating hamburgers in the locker room...,darko eating hamburger locker room played knic...
4,Girl pack ya bags i'm bout to take you on a ride!,girl pack ya bag im bout take ride
...,...,...
133790,Wow...completely forgot that it's #FollowFrida...,wowcompletely forgot followfriday ahem great p...
133791,Some groovy Texas blues music is playing in my...,groovy texas blue music playing head right man...
133792,@USER_8cac2975 Haven't seen that one in ages. ...,user_8cac2975 havent seen one age remeber
133793,RT @USER_6d0753d3: You should all go and follo...,rt user_6d0753d3 go follow user_b7a77112 theyr...


In [106]:
train_tweetContent = train_tweetContent['tweet_clean'].astype(str)
dev_tweetContent = dev_tweetContent['tweet_clean'].astype(str)
test_tweetContent = test_tweetContent['tweet_clean'].astype(str)

In [109]:
train_tweetContent

0                                             watching lost
1                                             user_89a3500b
2             maneuver put team hopefully sooner live dream
3         darko eating hamburger locker room played knic...
4                        girl pack ya bag im bout take ride
                                ...                        
133790    wowcompletely forgot followfriday ahem great p...
133791    groovy texas blue music playing head right man...
133792            user_8cac2975 havent seen one age remeber
133793    rt user_6d0753d3 go follow user_b7a77112 theyr...
133794                         user_4cf8655a congratulation
Name: tweet_clean, Length: 133795, dtype: object

In [91]:
word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        ngram_range=(1, 2),
        min_df=10)
word_vectorizer.fit(train_tweetContent)
tfidf_words = word_vectorizer.get_feature_names()

In [92]:
tfidf_words

['00',
 '09',
 '0_o',
 '0n',
 '10',
 '10 min',
 '10 minute',
 '100',
 '1000',
 '10000',
 '101',
 '1030',
 '106',
 '106 park',
 '10am',
 '10th',
 '11',
 '112',
 '1130',
 '11pm',
 '11th',
 '12',
 '12 club',
 '12 hour',
 '120',
 '1230',
 '12th',
 '13',
 '130',
 '13th',
 '14',
 '140',
 '140 character',
 '15',
 '15 min',
 '150',
 '15th',
 '16',
 '16 mangoville',
 '16 pregnant',
 '17',
 '18',
 '188th',
 '188th jamaica',
 '18th',
 '19',
 '19th',
 '1st',
 '1st time',
 '20',
 '20 min',
 '20 minute',
 '200',
 '2000',
 '2009',
 '2010',
 '2011',
 '2012',
 '2020',
 '20th',
 '21',
 '218',
 '218 linden',
 '21st',
 '22',
 '22nd',
 '23',
 '230',
 '24',
 '24 hour',
 '247',
 '25',
 '250',
 '25th',
 '26',
 '26th',
 '27',
 '28',
 '2b',
 '2day',
 '2days',
 '2events1night',
 '2getha',
 '2gether',
 '2hrs',
 '2k',
 '2k10',
 '2mar',
 '2mm',
 '2moro',
 '2morro',
 '2morrow',
 '2mrw',
 '2nd',
 '2night',
 '2nite',
 '2pac',
 '2pm',
 '30',
 '30 min',
 '30 minute',
 '300',
 '3000',
 '31',
 '31 next',
 '313',
 '313 one

In [84]:
f = open('tfidf-words','w')
f.write('tfidf words ' + '\n')
for word in tfidf_words:
    f.write(word + '\n')
f.close()
print('save tfidf words successfully')

save tfidf words successfully


In [93]:
train_features = word_vectorizer.transform(train_tweetContent)
dev_features   = word_vectorizer.transform(dev_tweetContent)
test_features  = word_vectorizer.transform(test_tweetContent)

In [94]:
train_features.get_shape()

(133795, 12422)

In [113]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB().fit(train_features, train_labels)
predicted_labels = clf.predict(dev_features)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4563


In [128]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=5000, n_jobs=-1).fit(train_features, train_labels)
predicted_labels = clf.predict(dev_features)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4476


In [127]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=100, random_state=0, n_jobs=-1).fit(train_features, train_labels)
predicted_labels = clf.predict(dev_features)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4407


## Word2Vec

In [129]:
# Train your own Word2Vec model using your own text corpus
list_of_tweets=[]
for tweet in train_tweetContent:
    list_of_tweets.append(tweet.split())

In [146]:
list_of_tweets

[['watching', 'lost'],
 ['user_89a3500b'],
 ['maneuver', 'put', 'team', 'hopefully', 'sooner', 'live', 'dream'],
 ['darko', 'eating', 'hamburger', 'locker', 'room', 'played', 'knicks', 'lol'],
 ['girl', 'pack', 'ya', 'bag', 'im', 'bout', 'take', 'ride'],
 ['user_a9cf8f82', 'lol', 'yeah', 'check', 'bro'],
 ['rt',
  'user_5eae722d',
  'inhighschool',
  'mr',
  'stavisky',
  'dnt',
  'lk',
  'breath',
  'smelled',
  'lk',
  'straight',
  'as',
  'lmao',
  'use',
  '2',
  'chase',
  'u',
  'dn',
  'hall'],
 ['inhighschool',
  'trenton',
  'high',
  'girl',
  'basketball',
  'team',
  'always',
  'best',
  'record',
  'highschool',
  'sport',
  'team',
  'nothing',
  'changed'],
 ['cheerleading', 'team', 'need', 'mr', 'grady', 'back'],
 ['girl', 'ashley', 'hines', 'beast', 'cant', 'stop'],
 ['da',
  'high',
  'always',
  'best',
  'fan',
  'going',
  'way',
  'back',
  'inhighschool'],
 ['rip', 'jack', 'daniel', 'burger'],
 ['user_43aad9dc',
  'oh',
  'shes',
  'back',
  'look',
  'like',
 

In [162]:
dev_list_of_tweets = []
for tweet in dev_tweetContent:
    dev_list_of_tweets.append(tweet.split())
    
test_list_of_tweets = []
for tweet in test_tweetContent:
    test_list_of_tweets.append(tweet.split())

In [155]:
cores = multiprocessing.cpu_count()
w2v_model = gensim.models.word2vec.Word2Vec(list_of_tweets, vector_size=100, window=8, min_count=1, sg=1, epochs=30, workers=cores)

In [156]:
w2v_model.wv.most_similar('donald')

[('trump', 0.8459528088569641),
 ('user_c28d47f2', 0.8311812877655029),
 ('lawnwit', 0.8258400559425354),
 ('user_62fa06b6', 0.812321662902832),
 ('combover', 0.8016952872276306),
 ('balmgtctfu', 0.7086427211761475),
 ('user_8674f459', 0.7012542486190796),
 ('user_b41c0ce1', 0.6884254217147827),
 ('user_c241c564', 0.6865020990371704),
 ('airbrushed', 0.6844445466995239)]

In [157]:
w2v_words = w2v_model.wv.index_to_key

### wAvg W2V

In [160]:
# average Word2Vec
# compute average word2vec for each tweet
train_vectors = [] # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_tweets): # for each tweet
    sent_vec = np.zeros(100) # as word vectors are of zero length 300, 
    cnt_words = 0 # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a tweet
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    train_vectors.append(sent_vec)
print(len(train_vectors))
print(len(train_vectors[0]))

100%|██████████| 133795/133795 [08:28<00:00, 263.33it/s]

133795
100





In [164]:
dev_vectors = [] # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(dev_list_of_tweets): # for each tweet
    sent_vec = np.zeros(100) # as word vectors are of zero length 300, 
    cnt_words = 0 # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a tweet
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    dev_vectors.append(sent_vec)
print(len(dev_vectors))
print(len(dev_vectors[0]))

100%|██████████| 11475/11475 [01:48<00:00, 105.81it/s]

11475
100





In [167]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(n_jobs=-1).fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4568


In [168]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=100, random_state=0, n_jobs=-1).fit(train_vectors, train_labels)
predicted_labels = clf.predict(dev_vectors)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4342


### TFIDF Weighted

In [169]:
tfidf_dict = dict(zip(word_vectorizer.get_feature_names(), list(word_vectorizer.idf_)))

In [171]:
# TF-IDF weighted Word2Vec
tfidf_feat = word_vectorizer.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = [] # the tfidf-w2v for each sentence/review is stored in this list
row=0
for sent in tqdm(list_of_tweets): # for each review/sentence 
    sent_vec = np.zeros(100) # as word vectors are of zero length
    weight_sum = 0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = tfidf_dict[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

100%|██████████| 133795/133795 [11:54<00:00, 187.21it/s]


In [172]:
# TF-IDF weighted Word2Vec
tfidf_feat = word_vectorizer.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_dev_vectors = [] # the tfidf-w2v for each sentence/review is stored in this list
row=0
for sent in tqdm(dev_list_of_tweets): # for each review/sentence 
    sent_vec = np.zeros(100) # as word vectors are of zero length
    weight_sum = 0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = tfidf_dict[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_dev_vectors.append(sent_vec)
    row += 1

100%|██████████| 11475/11475 [02:04<00:00, 92.25it/s] 


In [173]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(n_jobs=-1).fit(tfidf_sent_vectors, train_labels)
predicted_labels = clf.predict(tfidf_dev_vectors)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4464


In [174]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=100, random_state=0, n_jobs=-1).fit(tfidf_sent_vectors, train_labels)
predicted_labels = clf.predict(tfidf_dev_vectors)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.4355


In [175]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

In [176]:
clf = MLPClassifier(max_iter=200, verbose=True)

print('corss-val acc:', np.mean(cross_val_score(clf, tfidf_sent_vectors, train_labels, cv=5)))
clf.fit(tfidf_sent_vectors, train_labels)

predictions = clf.predict(tfidf_dev_vectors)
accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Iteration 1, loss = 1.21117385
Iteration 2, loss = 1.19125135
Iteration 3, loss = 1.18418050
Iteration 4, loss = 1.17866570
Iteration 5, loss = 1.17384509
Iteration 6, loss = 1.16957640
Iteration 7, loss = 1.16578152
Iteration 8, loss = 1.16231809
Iteration 9, loss = 1.15910689
Iteration 10, loss = 1.15642631
Iteration 11, loss = 1.15348780
Iteration 12, loss = 1.15116327
Iteration 13, loss = 1.14855000
Iteration 14, loss = 1.14658179
Iteration 15, loss = 1.14437434
Iteration 16, loss = 1.14302628
Iteration 17, loss = 1.14116128
Iteration 18, loss = 1.13978567
Iteration 19, loss = 1.13800513
Iteration 20, loss = 1.13668684
Iteration 21, loss = 1.13483711
Iteration 22, loss = 1.13375134
Iteration 23, loss = 1.13274703
Iteration 24, loss = 1.13202294
Iteration 25, loss = 1.13010168
Iteration 26, loss = 1.12881036
Iteration 27, loss = 1.12825547
Iteration 28, loss = 1.12705595
Iteration 29, loss = 1.12641724
Iteration 30, loss = 1.12527864
Iteration 31, loss = 1.12475807
Iteration 32, los

Iteration 55, loss = 1.10959793
Iteration 56, loss = 1.10898270
Iteration 57, loss = 1.10940016
Iteration 58, loss = 1.10853245
Iteration 59, loss = 1.10804566
Iteration 60, loss = 1.10789243
Iteration 61, loss = 1.10757305
Iteration 62, loss = 1.10761329
Iteration 63, loss = 1.10665456
Iteration 64, loss = 1.10593626
Iteration 65, loss = 1.10608072
Iteration 66, loss = 1.10581988
Iteration 67, loss = 1.10514637
Iteration 68, loss = 1.10464144
Iteration 69, loss = 1.10440516
Iteration 70, loss = 1.10423375
Iteration 71, loss = 1.10483018
Iteration 72, loss = 1.10374148
Iteration 73, loss = 1.10383522
Iteration 74, loss = 1.10338107
Iteration 75, loss = 1.10374797
Iteration 76, loss = 1.10268698
Iteration 77, loss = 1.10234057
Iteration 78, loss = 1.10228513
Iteration 79, loss = 1.10192979
Iteration 80, loss = 1.10158446
Iteration 81, loss = 1.10142152
Iteration 82, loss = 1.10162698
Iteration 83, loss = 1.10058993
Iteration 84, loss = 1.10133781
Iteration 85, loss = 1.10062099
Iteratio

Iteration 108, loss = 1.09634519
Iteration 109, loss = 1.09671673
Iteration 110, loss = 1.09591069
Iteration 111, loss = 1.09623711
Iteration 112, loss = 1.09545892
Iteration 113, loss = 1.09580642
Iteration 114, loss = 1.09501672
Iteration 115, loss = 1.09534845
Iteration 116, loss = 1.09523170
Iteration 117, loss = 1.09491138
Iteration 118, loss = 1.09451860
Iteration 119, loss = 1.09501769
Iteration 120, loss = 1.09424595
Iteration 121, loss = 1.09444774
Iteration 122, loss = 1.09389038
Iteration 123, loss = 1.09418660
Iteration 124, loss = 1.09407046
Iteration 125, loss = 1.09344046
Iteration 126, loss = 1.09343385
Iteration 127, loss = 1.09354410
Iteration 128, loss = 1.09356933
Iteration 129, loss = 1.09351828
Iteration 130, loss = 1.09317212
Iteration 131, loss = 1.09307412
Iteration 132, loss = 1.09284683
Iteration 133, loss = 1.09254974
Iteration 134, loss = 1.09274566
Iteration 135, loss = 1.09257867
Iteration 136, loss = 1.09232544
Iteration 137, loss = 1.09268211
Iteration 

Iteration 160, loss = 1.08692029
Iteration 161, loss = 1.08706221
Iteration 162, loss = 1.08723853
Iteration 163, loss = 1.08698372
Iteration 164, loss = 1.08653698
Iteration 165, loss = 1.08646491
Iteration 166, loss = 1.08641510
Iteration 167, loss = 1.08637365
Iteration 168, loss = 1.08614048
Iteration 169, loss = 1.08645757
Iteration 170, loss = 1.08609605
Iteration 171, loss = 1.08629680
Iteration 172, loss = 1.08619011
Iteration 173, loss = 1.08589394
Iteration 174, loss = 1.08590132
Iteration 175, loss = 1.08570346
Iteration 176, loss = 1.08532287
Iteration 177, loss = 1.08557734
Iteration 178, loss = 1.08562319
Iteration 179, loss = 1.08551304
Iteration 180, loss = 1.08514913
Iteration 181, loss = 1.08535695
Iteration 182, loss = 1.08493095
Iteration 183, loss = 1.08555448
Iteration 184, loss = 1.08489438
Iteration 185, loss = 1.08518755
Iteration 186, loss = 1.08474103
Iteration 187, loss = 1.08485851
Iteration 188, loss = 1.08510926
Iteration 189, loss = 1.08456667
Iteration 

Iteration 12, loss = 1.15350080
Iteration 13, loss = 1.15157173
Iteration 14, loss = 1.15022783
Iteration 15, loss = 1.14836635
Iteration 16, loss = 1.14656123
Iteration 17, loss = 1.14562455
Iteration 18, loss = 1.14459451
Iteration 19, loss = 1.14269420
Iteration 20, loss = 1.14195106
Iteration 21, loss = 1.14058477
Iteration 22, loss = 1.13963426
Iteration 23, loss = 1.13863575
Iteration 24, loss = 1.13816297
Iteration 25, loss = 1.13702746
Iteration 26, loss = 1.13625881
Iteration 27, loss = 1.13534838
Iteration 28, loss = 1.13515888
Iteration 29, loss = 1.13355906
Iteration 30, loss = 1.13336096
Iteration 31, loss = 1.13270323
Iteration 32, loss = 1.13212361
Iteration 33, loss = 1.13100085
Iteration 34, loss = 1.13076364
Iteration 35, loss = 1.13002688
Iteration 36, loss = 1.12955167
Iteration 37, loss = 1.12903295
Iteration 38, loss = 1.12866250
Iteration 39, loss = 1.12785624
Iteration 40, loss = 1.12737330
Iteration 41, loss = 1.12716704
Iteration 42, loss = 1.12643784
Iteratio

In [177]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0).fit(tfidf_sent_vectors, train_labels)

predicted_labels = clf.predict(tfidf_dev_vectors)

accuracy = metrics.accuracy_score(dev_labels, predicted_labels)
print('Accuracy：', '%.4f' % accuracy)

Accuracy： 0.3431
