In [5]:
import pandas as pd
from tqdm import tqdm 
import numpy as np



# Load the training and development splits as raw DataFrames.
# NOTE(review): the training file carries an .xls extension but is parsed as CSV — confirm.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/training.xls', '../data/development.csv')
)

def aggregate_users(df):
    """Collapse a tweet-level frame into one row per user.

    Every distinct combination of ``label`` and the four trait columns
    becomes one output row, in order of first appearance. Each row gets a
    ``tweet`` field holding all tweets whose ``label`` matches, joined with
    ``' [SEP] '``; missing tweets contribute an empty string.

    Args:
        df: DataFrame with the columns ``label``, ``gender``, ``profession``,
            ``ideology_binary``, ``ideology_multiclass`` and ``tweet``.

    Returns:
        A new DataFrame with the five grouping columns plus the merged
        ``tweet`` column. The input frame is not modified.
    """
    columns_to_group_by_user = ['label', 'gender', 'profession', 'ideology_binary', 'ideology_multiclass']
    fields_to_merge = ['tweet']

    # One row per distinct user/trait combination. drop_duplicates keeps
    # first-seen order and keeps rows with missing traits, so no groupby
    # aggregation is needed just to enumerate the users.
    df_users = df[columns_to_group_by_user].drop_duplicates().reset_index(drop=True)

    merged_fields = []

    pbar = tqdm(df_users.iterrows(), total=df_users.shape[0], desc="merging users")

    for _, row in pbar:
        # Tweets are selected by `label` alone (presumably the user identifier).
        df_user = df[df['label'] == row['label']]
        merged = {
            field: ' [SEP] '.join(df_user[field].fillna(''))
            for field in fields_to_merge
        }
        merged_fields.append({**row, **merged})

    return pd.DataFrame(merged_fields)

# Replace both tweet-level frames with their per-user aggregates (train first, then dev).
train_df, test_df = map(aggregate_users, (train_df, test_df))


merging users: 100%|██████████| 314/314 [00:00<00:00, 469.16it/s]
merging users: 100%|██████████| 101/101 [00:00<00:00, 1336.40it/s]


In [1]:
!pip install laserembeddings

Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m859.8/859.8 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-none-any.whl size=883989 sha256=ff7bbb80df

In [3]:
!python -m laserembeddings download-models

Downloading models into /home/amansinha/venv/global_env/lib/python3.8/site-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [4]:
from laserembeddings import Laser

# Shared encoder instance; the embedding cells below reuse this `laser` object.
laser = Laser()

# Smoke test: when all sentences are in the same language, one `lang` code
# covers the whole batch. These sample sentences are English, so they are
# tagged 'en'; the tweet embeddings later in the notebook use lang='es'.
embeddings = laser.embed_sentences(
    ['let your neural network be polyglot',
     'use multilingual embeddings!'],
    lang='en')

embeddings

array([[0.00156426, 0.01067378, 0.00388247, ..., 0.021797  , 0.0062856 ,
        0.01434554],
       [0.01456157, 0.00154333, 0.00090701, ..., 0.01484861, 0.00712931,
        0.0476219 ]], dtype=float32)

In [13]:
import os

# Embedding every training user took about 20 minutes, so the result is cached
# on disk and reused when it is present and has one row per user. Delete the
# file to force a recompute (e.g. after the input data changes).
xtrain_cache_path = 'xtrain_raw_laser.npy'

x_train = np.load(xtrain_cache_path) if os.path.exists(xtrain_cache_path) else None

if x_train is None or x_train.shape[0] != len(train_df):
    x_train = []
    for sen in tqdm(train_df.tweet):
        # One LASER vector per tweet of this user ...
        embeds = laser.embed_sentences(sen.split('[SEP]'), lang='es')
        # ... mean-pooled into a single vector for the user.
        x_train.append(np.mean(embeds, axis=0))
    x_train = np.asarray(x_train)
    np.save(xtrain_cache_path, x_train)

print(x_train.shape)

100%|██████████| 314/314 [20:47<00:00,  3.97s/it]

(314, 1024)





In [17]:
def return_wvecs(train_df, lang='es'):
    """Embed each row's merged tweets with LASER and mean-pool them.

    Relies on the module-level ``laser`` encoder and ``tqdm`` progress bar.

    Args:
        train_df: DataFrame with a ``tweet`` column whose entries are strings
            of tweets separated by ``[SEP]``. Despite the parameter name, any
            split (train or test) can be passed.
        lang: Language code forwarded to ``laser.embed_sentences``. Defaults
            to ``'es'``.

    Returns:
        A numpy array with one row per input row, each being the mean of that
        row's sentence embeddings.
    """
    user_vectors = []
    for sen in tqdm(train_df.tweet):
        # Split the merged string back into tweets and embed them together.
        embeds = laser.embed_sentences(sen.split('[SEP]'), lang=lang)
        user_vectors.append(np.mean(embeds, axis=0))

    return np.asarray(user_vectors)

# Training embeddings are loaded from the cache written by the previous cell
# rather than recomputed with return_wvecs(train_df).
xtrain = np.load('xtrain_raw_laser.npy')
xtest = return_wvecs(test_df)
# Save the test embeddings themselves, not the training array.
np.save('xtest_raw_laser.npy', xtest)

100%|██████████| 101/101 [03:50<00:00,  2.28s/it]


In [18]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [19]:
# Targets for the first experiment: the `gender` column of each split.
ytrain, ytest = train_df['gender'], test_df['gender']

In [20]:
# Class-balanced SVC trained on the training embeddings and scored on the test split.
m = SVC(class_weight='balanced').fit(xtrain, ytrain)
predictions = m.predict(xtest)

cm = confusion_matrix(ytest, predictions)
cr = classification_report(ytest, predictions, output_dict=True, zero_division=0)

# Confusion matrix first, then the weighted-average F1 on its own line.
print(cm, cr['weighted avg']['f1-score'], sep='\n')


[[38 10]
 [17 36]]
0.7323062403327711


In [21]:

# Train one class-balanced SVC per trait on the same embeddings and collect
# the weighted-average F1 of each.
f1s = []
for label in ['gender', 'profession', 'ideology_binary', 'ideology_multiclass']:
    ytrain = train_df[label]
    ytest = test_df[label]

    m = SVC(class_weight='balanced')
    m.fit(xtrain, ytrain)
    predictions = m.predict(xtest)

    # zero_division=0 matches the single-task gender cell: classes that are
    # never predicted score 0 without emitting a warning.
    cr = classification_report(ytest, predictions, zero_division=0, output_dict=True)
    f1s.append(cr['weighted avg']['f1-score'])

# Emit the scores as one markdown table row, in the order of the traits above.
print('| laser | ' + ' | '.join(f'{f1:.4f}' for f1 in f1s) + ' |')


| laser | 0.7323 | 0.8784 | 0.8812 | 0.7371 |


| model | f1-gender | f1-profession | f1-ideology_binary | f1-ideology_multiclass |
|---|---|---|---|---|
| laser | 0.7323 | 0.8784 | 0.8812 | 0.7371 |


In [None]:
References:
    
- https://github.com/yannvgn/laserembeddings