In [1]:
import glob
from multiprocessing import Pool
import sys
from nltk import TweetTokenizer
import nltk
import os
import re
import codecs
import preprocessor as p
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem.snowball import EnglishStemmer
import numpy as np

## Loading Data

In [20]:
with open("en_train/english_train.text", 'r') as f:
    en_texts = [l.strip() for l in f]

In [2]:
with open('en_train/english_train.labels', 'r') as f:
    en_labels = [int(l.strip()) for l in f]

In [43]:
with open('test/english_test.text', 'r') as f:
    en_trial = [l.strip() for l in f]

In [3]:
with open('test/english_test.labels', 'r') as f:
    en_trial_labels = [int(l.strip()) for l in f]

In [37]:
with open("es_train/spanish_train.text", 'r') as f:
    es_texts = [l.strip() for l in f]

In [4]:
with open("es_train/spanish_train.labels", 'r') as f:
    es_labels = [int(l.strip()) for l in f]

In [45]:
with open('test/spanish_test.text', 'r') as f:
    es_trial = [l.strip() for l in f]

In [76]:
with open('test/spanish_test.labels', 'r') as f:
    es_trial_labels = [int(l.strip()) for l in f]

In [7]:
with open("mapping/english_mapping.txt", 'r') as f:
    en_maps = [l.strip().split() for l in f]

In [8]:
with open("mapping/spanish_mapping.txt", 'r') as f:
    es_maps = [l.strip().split() for l in f]

In [2]:
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))','<url>',tweet)
    tweet = re.sub('(\@[^\s]+)','<user>',tweet)
    try:
        tweet = tweet.decode('unicode_escape').encode('ascii','ignore')
    except:
        pass
    return tweet

In [10]:
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
#                 try:
#                     language, id, timestamp, username, tweet = line.strip().split('\t')
#                 except:
#                     print("could not parse line.")
#                     continue
#                 if language != 'en':
#                     continue
                tweet = tknzr.tokenize(line)
#                 if not 6 < len(tweet) < 110:
#                     continue
                tweet = preprocess_tweet(' '.join(tweet))
                out_fs.write(tweet+'\n')

In [18]:
p.set_options(p.OPT.URL, p.OPT.NUMBER, p.OPT.MENTION)
translator = str.maketrans("", "", punctuation)

In [165]:
def preproc_eng(texts):
    clear_texts = []
    for text in texts:
        # delete stop-words
        text = ' '.join([word for word in text.split() if word not in (stopwords.words('english'))])
        print(text)
        # delete punctuation
        text = word_tokenize(text.translate(translator))     
        # stemming
#         text = [stemmer.stem(w) for w in text]
        # preprocessing as tweet
        text = p.clean(' '.join(text))
        clear_texts.append(text)
    return clear_texts

In [49]:
def preproc_es(texts):
    clear_texts = []
    for text in texts:
        # delete stop-words
        text = ' '.join([word for word in text.split() if word not in (stopwords.words('spanish'))])
        # delete punctuation
        text = word_tokenize(text.translate(translator))
        # stemming
#         text = [stemmer.stem(w) for w in text]
        # preprocessing as tweet
        text = p.clean(' '.join(text))
        clear_texts.append(text)
    return clear_texts

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/weihaoran/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/weihaoran/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [166]:
texts_clear = preproc_eng(texts)

In [47]:
trial_clear = preproc_eng(en_trial)

In [50]:
es_text_clear = preproc_es(es_texts)

In [55]:
es_trial_clear = preproc_es(es_trial)

In [5]:
with open("en_tokenized/clear_Eng_train.txt",'r') as f:
    texts_clear = [l.strip() for l in f]

In [6]:
with open("en_tokenized/clear_Eng_test.txt",'r') as f:
    trial_clear = [l.strip() for l in f]

In [7]:
with open("es_tokenized/clear_Es_train.txt",'r') as f:
    es_text_clear = [l.strip() for l in f]

In [8]:
with open("es_tokenized/clear_Es_test.txt",'r') as f:
    es_trial_clear = [l.strip() for l in f]

In [204]:
## get ride of odd symbol
for i in range(len(texts_clear)):
    texts_clear[i] = re.sub('[…·•’”“—➵・]', '', texts_clear[i])
    texts_clear[i] = texts_clear[i].lower()

In [205]:

for i in range(len(trial_clear)):
    trial_clear[i] = re.sub('[…·•’”“—➵・]', '', trial_clear[i]) 
    trial_clear[i] = trial_clear[i].lower()

In [234]:
for i in range(len(es_text_clear)):
    es_text_clear[i] = re.sub('[…·•’”“—➵・]', '', es_text_clear[i]) 
    es_text_clear[i] = es_text_clear[i].lower()

In [235]:
for i in range(len(es_trial_clear)):
    es_trial_clear[i] = re.sub('[…·•’”“—➵・]', '', es_trial_clear[i]) 
    es_trial_clear[i] = es_trial_clear[i].lower()

In [236]:
es_text_clear

['ahaa brevard county florida',
 'sleek black user toronto ontario',
 'sunny  s face asking me mean leaving abuela abuela ️',
 'with amy years ago tbt philadelphia museum art',
 'you know picked good husband brings slurpee blessed',
 'welsh siblings take unc university northern colorado unc',
 'i tv today lakers media day season im the',
 'new vlog chiefobivlogs this one i made jidennas the let out ft',
 'womancrushwednesday woman heart amp soul amp i thankful every moment have',
 'family golf outing user user user',
 'drink coffee make up girl just don  t push off',
 ' mixtape coming soon  user university windsor',
 'puppy love scooby parchita ️ mybabyisinlove love central park manhatan',
 'thanks tatianna coming ️ ️ ️ hair mywork woman blowdry haircut hairstylist',
 'the new omid salon cool shoreh cooler omidsalon shorehshoreh',
 'when highlight day kiss mom ️',
 'last nights memories captured escape fate i prevail',
 'when girl cant reach top shelf got tall guy loudoun county virgin

### Vocabulary Creation

In [10]:
vocab = set()
total_vocab = texts_clear + trial_clear
for line in total_vocab:
    for word in line.split():
        vocab.add(word)

In [19]:
vocab_lst = list(vocab)

In [11]:
es_vocab = set()
total_es = es_text_clear + es_trial_clear
for line in total_es:
    for word in line.split():
        es_vocab.add(word)

In [12]:
en_sentences = [sent.split() for sent in texts_clear + trial_clear]

In [13]:
len(en_sentences)

100000

In [14]:
len(vocab)

103027

In [15]:
len(es_vocab)

38603

In [237]:
with codecs.open("en_tokenized/clear_Eng_train.txt",'w',"utf-8") as out_fs:
    for each in texts_clear:
        out_fs.write(each + "\n")

In [238]:
with codecs.open("es_tokenized/clear_Es_train.txt",'w',"utf-8") as out_fs:
    for each in es_text_clear:
        out_fs.write(each + "\n")

In [242]:
with codecs.open("es_tokenized/clear_Es_test.txt",'w',"utf-8") as out_fs:
    for each in es_trial_clear:
        out_fs.write(each + "\n")

In [240]:
with codecs.open("en_tokenized/clear_Eng_test.txt",'w',"utf-8") as out_fs:
    for each in trial_clear:
        out_fs.write(each + "\n")

## Sentence Vectorization

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

My baseline here is BagOfWords model and Tf-idf model.

### BagOfWords

#### English

In [228]:
cv_ = CountVectorizer(ngram_range=(-2,2), vocabulary=vocab_lst, min_df= 5)

Split train and dev dataset

In [229]:
en_train, en_dev = train_test_split(texts_clear, test_size = 0.2, shuffle=False)
en_y, en_dev_y = train_test_split(en_labels, test_size = 0.2, shuffle=False)

In [225]:
cv_.fit(raw_documents=vocab_lst)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [231]:
en_train_cv = cv_.fit_transform(en_train)
en_dev_cv = cv_.transform(en_dev)

In [232]:
en_train_cv

<72000x103027 sparse matrix of type '<class 'numpy.int64'>'
	with 546600 stored elements in Compressed Sparse Row format>

In [277]:
lr = LogisticRegression(random_state=14, dual = True)

In [278]:
lr.fit(en_train_cv, en_y)



LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=14, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [279]:
en_pred_dev = lr.predict(en_dev_cv)

In [280]:
en_pred_dev.dtype = np.int

In [281]:
np.savetxt('res_cv.txt', en_pred_dev, fmt='%d')
np.savetxt('goldres_cv.txt', np.array(en_dev_y), fmt='%d')

In [282]:
%run scorer_semeval18.py goldres_cv.txt res_cv.txt

Macro F-Score (official): 21.256
-----
Micro F-Score: 31.222
Precision: 31.222
Recall: 31.222


#### Spanish

In [295]:
cv_es = CountVectorizer(ngram_range=(-2,2), vocabulary=es_vocab)

In [296]:
es_train, es_dev = train_test_split(es_text_clear, test_size = 0.2, shuffle=False)
es_y, es_dev_y = train_test_split(es_labels, test_size = 0.2, shuffle=False)

In [297]:
es_train_cv = cv_es.fit_transform(es_train)
es_dev_cv = cv_es.transform(es_dev)

In [310]:
Cs = np.linspace(1,100,11)

In [311]:
for C in Cs:
    print("C: " , C )
    lr = LogisticRegression(C=C, dual= True, random_state=14)
    lr.fit(es_train_tf, es_y)
    es_pred_train = lr.predict(es_dev_tf)
    es_pred_train.dtype = np.int
    np.savetxt('res.txt', es_pred_train, fmt='%d')
    np.savetxt('goldres.txt', np.array(es_dev_y), fmt='%d')
    %run scorer_semeval18.py goldres.txt res.txt
    print()

C:  1.0


  y = column_or_1d(y, warn=True)


Macro F-Score (official): 7.665
-----
Micro F-Score: 28.237
Precision: 28.237
Recall: 28.237

C:  10.9
Macro F-Score (official): 10.62
-----
Micro F-Score: 24.211
Precision: 24.211
Recall: 24.211

C:  20.8




Macro F-Score (official): 10.824
-----
Micro F-Score: 23.368
Precision: 23.368
Recall: 23.368

C:  30.700000000000003
Macro F-Score (official): 10.853
-----
Micro F-Score: 23.026
Precision: 23.026
Recall: 23.026

C:  40.6
Macro F-Score (official): 10.831
-----
Micro F-Score: 22.526
Precision: 22.526
Recall: 22.526

C:  50.5
Macro F-Score (official): 10.69
-----
Micro F-Score: 22.184
Precision: 22.184
Recall: 22.184

C:  60.400000000000006
Macro F-Score (official): 10.791
-----
Micro F-Score: 22.184
Precision: 22.184
Recall: 22.184

C:  70.3
Macro F-Score (official): 10.682
-----
Micro F-Score: 22.053
Precision: 22.053
Recall: 22.053

C:  80.2
Macro F-Score (official): 10.751
-----
Micro F-Score: 22.105
Precision: 22.105
Recall: 22.105

C:  90.10000000000001
Macro F-Score (official): 10.649
-----
Micro F-Score: 21.921
Precision: 21.921
Recall: 21.921

C:  100.0
Macro F-Score (official): 10.637
-----
Micro F-Score: 21.763
Precision: 21.763
Recall: 21.763



In [312]:
lr = LogisticRegression(C=20.0,random_state=14, dual = True)

In [313]:
lr.fit(es_train_cv, es_y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=20.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=14, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [314]:
es_pred_dev = lr.predict(es_dev_cv)

In [315]:
es_pred_dev.dtype = np.int

In [316]:
np.savetxt('res_cv_es.txt', es_pred_dev, fmt='%d')
np.savetxt('goldres_cv.txt', np.array(es_dev_y), fmt='%d')
%run scorer_semeval18.py goldres_cv.txt res_cv_es.txt

Macro F-Score (official): 10.986
-----
Micro F-Score: 22.553
Precision: 22.553
Recall: 22.553


In [317]:
es_texts_cv = cv_es.fit_transform(es_text_clear)
es_test_cv = cv_es.transform(es_trial_clear)

In [318]:
lr = LogisticRegression(C=20.0,random_state=14, dual = True)

In [319]:
lr.fit(es_texts_cv, es_labels)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=20.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=14, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [320]:
es_pred = lr.predict(es_dev_cv)

In [321]:
es_pred.dtype = np.int

In [322]:
np.savetxt('res_cv_es.txt', es_pred_dev, fmt='%d')
np.savetxt('goldres_cv.txt', np.array(es_pred), fmt='%d')
%run scorer_semeval18.py goldres_cv.txt res_cv_es.txt

Macro F-Score (official): 11.573
-----
Micro F-Score: 23.289
Precision: 23.289
Recall: 23.289


### Tf-idf

#### English

In [124]:
tf = TfidfVectorizer()

In [125]:
en_train_tf = tf.fit_transform(en_train)
en_dev_tf = tf.transform(en_dev)

In [141]:
Cs = np.logspace(0,1.5,20)

In [142]:
for C in Cs:
    print("C: " , C )
    lr = LogisticRegression(C=C, dual= True, random_state=14)
    lr.fit(en_train_tf, en_y)
    en_pred_train = lr.predict(en_dev_tf)
    en_pred_train.dtype = np.int
    np.savetxt('res.txt', en_pred_train, fmt='%d')
    np.savetxt('goldres.txt', np.array(en_dev_y), fmt='%d')
    %run scorer_semeval18.py goldres.txt res.txt
    print()

C:  1.0




Macro F-Score (official): 19.725
-----
Micro F-Score: 32.461
Precision: 32.461
Recall: 32.461

C:  1.1993539462092342
Macro F-Score (official): 20.046
-----
Micro F-Score: 32.533
Precision: 32.533
Recall: 32.533

C:  1.4384498882876628
Macro F-Score (official): 20.349
-----
Micro F-Score: 32.483
Precision: 32.483
Recall: 32.483

C:  1.7252105499420405
Macro F-Score (official): 20.83
-----
Micro F-Score: 32.55
Precision: 32.55
Recall: 32.55

C:  2.0691380811147897
Macro F-Score (official): 21.066
-----
Micro F-Score: 32.45
Precision: 32.45
Recall: 32.45

C:  2.481628922836826
Macro F-Score (official): 21.256
-----
Micro F-Score: 32.333
Precision: 32.333
Recall: 32.333

C:  2.976351441631318
Macro F-Score (official): 21.313
-----
Micro F-Score: 31.989
Precision: 31.989
Recall: 31.989

C:  3.5696988468260646
Macro F-Score (official): 21.571
-----
Micro F-Score: 31.778
Precision: 31.778
Recall: 31.778

C:  4.281332398719393
Macro F-Score (official): 21.752
-----
Micro F-Score: 31.533
Preci



Macro F-Score (official): 20.873
-----
Micro F-Score: 28.211
Precision: 28.211
Recall: 28.211

C:  21.983926488622892
Macro F-Score (official): 20.593
-----
Micro F-Score: 27.7
Precision: 27.7
Recall: 27.7

C:  26.366508987303583
Macro F-Score (official): 20.277
-----
Micro F-Score: 27.2
Precision: 27.2
Recall: 27.2

C:  31.622776601683793
Macro F-Score (official): 20.071
-----
Micro F-Score: 26.8
Precision: 26.8
Recall: 26.8



In [143]:
lr = LogisticRegression(C=4, dual= True, random_state=14)

In [144]:
lr.fit(en_train_tf, en_y)



LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=14, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [145]:
en_pred_train = lr.predict(en_dev_tf)
en_pred_train.dtype = np.int
np.savetxt('res.txt', en_pred_train, fmt='%d')
np.savetxt('goldres.txt', np.array(en_dev_y), fmt='%d')

In [152]:
lr = LogisticRegression(C=10.0, dual= True, solver="liblinear", random_state=14)

In [153]:
en_train_clear = tf.fit_transform(texts_clear)
en_test_clear = tf.transform(trial_clear)


In [154]:
lr.fit(en_train_clear, en_labels)



LogisticRegression(C=10.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=14, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [155]:
res_en = lr.predict(en_test_clear)

In [156]:
res_en.dtype = np.int

In [157]:
np.savetxt("res.txt", res_en, fmt="%d")

In [158]:
%run scorer_semeval18.py test/english_test.labels res.txt

Macro F-Score (official): 21.684
-----
Micro F-Score: 30.52
Precision: 30.52
Recall: 30.52


#### Spanish

In [323]:
es_labels = np.array(es_labels).astype(int)

In [324]:
es_train, es_dev = train_test_split(es_text_clear, test_size = 0.2, shuffle=False)
es_y, es_dev_y = train_test_split(es_labels, test_size = 0.2, shuffle=False)

In [325]:
tf_es = TfidfVectorizer(ngram_range=(-2,2))

In [326]:
es_train_tf = tf.fit_transform(es_train)
es_dev_tf = tf.transform(es_dev)

In [327]:
for C in Cs:
    print("C: " , C )
    lr = LogisticRegression(C=C, dual= True, random_state=14)
    lr.fit(es_train_tf, es_y)
    es_pred_train = lr.predict(es_dev_tf)
    es_pred_train.dtype = np.int
    np.savetxt('res_es.txt', es_pred_train.astype(int), fmt='%d')
    np.savetxt('goldres_es.txt', np.array(es_dev_y).astype(int), fmt='%d')
    %run scorer_semeval18.py goldres_es.txt res_es.txt
    print()

C:  1.0


  y = column_or_1d(y, warn=True)


Macro F-Score (official): 7.665
-----
Micro F-Score: 28.237
Precision: 28.237
Recall: 28.237

C:  10.9
Macro F-Score (official): 10.62
-----
Micro F-Score: 24.211
Precision: 24.211
Recall: 24.211

C:  20.8




Macro F-Score (official): 10.824
-----
Micro F-Score: 23.368
Precision: 23.368
Recall: 23.368

C:  30.700000000000003
Macro F-Score (official): 10.853
-----
Micro F-Score: 23.026
Precision: 23.026
Recall: 23.026

C:  40.6
Macro F-Score (official): 10.831
-----
Micro F-Score: 22.526
Precision: 22.526
Recall: 22.526

C:  50.5
Macro F-Score (official): 10.69
-----
Micro F-Score: 22.184
Precision: 22.184
Recall: 22.184

C:  60.400000000000006
Macro F-Score (official): 10.791
-----
Micro F-Score: 22.184
Precision: 22.184
Recall: 22.184

C:  70.3
Macro F-Score (official): 10.682
-----
Micro F-Score: 22.053
Precision: 22.053
Recall: 22.053

C:  80.2
Macro F-Score (official): 10.751
-----
Micro F-Score: 22.105
Precision: 22.105
Recall: 22.105

C:  90.10000000000001
Macro F-Score (official): 10.649
-----
Micro F-Score: 21.921
Precision: 21.921
Recall: 21.921

C:  100.0
Macro F-Score (official): 10.637
-----
Micro F-Score: 21.763
Precision: 21.763
Recall: 21.763



In [328]:
es_texts_tf = tf_es.fit_transform(es_text_clear)
es_test_tf = tf_es.transform(es_trial_clear)

In [329]:
lr = LogisticRegression(C=26.4, dual= True, random_state=14)
lr.fit(es_texts_tf, es_labels)
pred_es = lr.predict(es_test_tf)
pred_es = np.array(pred_es).astype(int)

  y = column_or_1d(y, warn=True)


In [330]:
len(pred_es)

1000

In [331]:
es_test_tf.shape

(1000, 192725)

In [332]:
np.savetxt("res_es.txt", pred_es, fmt="%d")
%run scorer_semeval18.py test/spanish_test.labels res_es.txt

Macro F-Score (official): 12.207
-----
Micro F-Score: 30.2
Precision: 30.2
Recall: 30.2


### Word Embeddings

#### Word2Vec

English

In [10]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import io
from sklearn.preprocessing import StandardScaler

In [11]:
import gensim.models

In [12]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    count = 0
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = [float(x) for x in tokens[1:]]
        count+=1
        if count == 50000:
            break
    return data

In [16]:
fastText = load_vectors("wiki-news-300d-1M.vec")

In [18]:
fastText["second"]

<map at 0x1171bb1d0>

In [26]:
path = get_tmpfile("word2vec.model")
model = Word2Vec(en_sentences, size=100, window=5, min_count=1, sample=1e-3,workers=2)

In [27]:
model.train(en_sentences, total_words=len(vocab_lst), epochs=20)

(15324850, 16578220)

In [54]:
model.save("tweet2vec.w2c")

In [30]:
senV_en = []
for line in texts_clear:
    temp_w2v = np.zeros(100)
    counts = 0 
    for word in line.split():
        temp_w2v += model.wv[word]
        counts += 1
    temp_w2v /= counts
    senV_en.append(temp_w2v)

In [33]:
senV_en

[array([-0.93428818,  0.29945133, -0.00500619,  0.81770235, -1.45392341,
        -0.33635767,  0.39103316,  0.92272874,  0.14694869,  0.49653533,
         0.26332708,  0.76851252, -0.40099237, -0.75203834, -0.76111025,
         0.62939381,  1.20150855, -0.205779  ,  0.08593758,  0.1040345 ,
        -0.51608799, -1.47416687, -0.47720156, -0.39639996, -0.50711154,
         0.18870477,  0.09484901,  1.20087672,  1.77495759,  0.36246818,
         0.19212598, -0.16428066,  0.89555997, -0.50110189,  0.55472901,
        -0.86504888,  0.83543982,  0.14628219, -0.66864462,  0.46082326,
        -0.23160619, -0.57493788,  0.14308518, -0.66977713, -0.17635516,
         0.53488914,  0.73145596, -0.64423217,  0.20511422,  0.91486281,
         1.2850915 ,  0.14839674,  0.11599278, -0.23310335, -0.62873577,
         0.19271264,  0.29692671, -0.45767889, -0.32491353, -2.13947301,
         1.17824134, -0.03923354,  1.12067916, -1.27380618,  0.27961445,
        -0.25950942,  0.02867444,  0.49264376, -0.6

In [31]:
senV_en_trial = []
for line in trial_clear:
    temp_w2v = np.zeros(100)
    counts = 0 
    for word in line.split():
        temp_w2v += model.wv[word]
        counts += 1
    temp_w2v /= counts
    senV_en_trial.append(temp_w2v)

In [43]:
np.savetxt("senV_en_trial.txt", senV_en_trial)

In [41]:
sc = StandardScaler()

In [44]:
sc.fit(senV_en)
senV_en = sc.transform(senV_en)
senV_en_trial = sc.transform(senV_en_trial)

In [45]:
lr = LogisticRegression(C=10, dual= True, random_state=14)
lr.fit(senV_en, en_labels )
pred_en = lr.predict(senV_en_trial)
pred_en = np.array(pred_en).astype(int)



In [46]:
pred_en

array([0, 7, 0, ..., 1, 0, 4])

In [47]:
len(pred_en)

10000

In [48]:
np.savetxt("res_en_senV.txt", pred_es, fmt="%d")
%run scorer_semeval18.py test/english_test.labels res_en_senV.txt

Macro F-Score (official): 14.844
-----
Micro F-Score: 33.03
Precision: 33.03
Recall: 33.03


spanish

In [13]:
es_fast = load_vectors("wiki.es.vec")

In [14]:
len(es_fast['la'])

300

In [15]:
senV_es = []
for line in es_text_clear:
    temp_w2v = np.zeros(300)
    counts = 0 
    for word in line.split():
        if es_fast.get(word) is None:
            continue
        temp_w2v +=es_fast[word]
        counts += 1
    if counts != 0:
        temp_w2v /= counts
    senV_es.append(temp_w2v)

In [17]:
senV_es_trial = []
for line in es_trial_clear:
    temp_w2v = np.zeros(300)
    counts = 0 
    for word in line.split():
        if es_fast.get(word) is None:
            continue
        temp_w2v +=es_fast[word]
        counts += 1
    if counts != 0:
        temp_w2v /= counts
    senV_es_trial.append(temp_w2v)

In [18]:
sc = StandardScaler()

In [19]:
sc.fit(senV_es)
senV_es = sc.transform(senV_es)
senV_es_trial = sc.transform(senV_es_trial)

In [21]:
lr = LogisticRegression(C=10,solver='saga',multi_class='auto', random_state=14)
lr.fit(senV_es, es_labels )
pred_es = lr.predict(senV_es_trial)
pred_es = np.array(pred_es).astype(int)



In [22]:
np.savetxt("res_es_senV.txt", pred_es, fmt="%d")
%run scorer_semeval18.py test/spanish_test.labels res_es_senV.txt

Macro F-Score (official): 9.766
-----
Micro F-Score: 29.7
Precision: 29.7
Recall: 29.7


## Model Selection

SVM

In [85]:
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC

In [86]:
tf = TfidfVectorizer(ngram_range=(-2,2))
en_train_clear = tf.fit_transform(texts_clear)
en_test_clear = tf.transform(trial_clear)

Scaling 

In [87]:
nm = Normalizer().fit(en_train_clear)
svm_en_train = nm.transform(en_train_clear)
svm_en_trial = nm.transform(en_test_clear)

In [88]:
svm = SVC(kernel="linear", C=10,random_state=14)

In [89]:
svm.fit(en_train_clear, en_labels)



SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=14,
  shrinking=True, tol=0.001, verbose=False)

In [90]:
svm_preds = svm.predict(en_test_clear)

In [91]:
np.savetxt('res_svm.txt', svm_preds, fmt='%d')
%run scorer_semeval18.py test/english_test.labels res_svm.txt

Macro F-Score (official): 1.79
-----
Micro F-Score: 21.8
Precision: 21.8
Recall: 21.8


Spanish

In [92]:
tf = TfidfVectorizer(ngram_range=(-2,2))
es_train_clear = tf.fit_transform(es_text_clear)
es_test_clear = tf.transform(es_trial_clear)

In [93]:
nm = Normalizer().fit(es_train_clear)
svm_es_train = nm.transform(es_train_clear)
svm_es_trial = nm.transform(es_test_clear)

In [104]:
svm = SVC(kernel="linear", C=30,random_state=14, gamma='scale')

In [105]:
svm.fit(es_train_clear, es_labels)

SVC(C=30, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
  max_iter=-1, probability=False, random_state=14, shrinking=True,
  tol=0.001, verbose=False)

In [106]:
svm_preds = svm.predict(es_test_clear)

In [107]:
np.savetxt('res_svm.txt', svm_preds, fmt='%d')
%run scorer_semeval18.py test/spanish_test.labels res_svm.txt

Macro F-Score (official): 12.232
-----
Micro F-Score: 30.7
Precision: 30.7
Recall: 30.7
