In [1]:
import pandas as pd
from tqdm import tqdm 
import numpy as np



train_df = pd.read_csv('../data/training.xls')
test_df = pd.read_csv('../data/development.csv')

def aggregate_users(df):
    columns_to_group_by_user = ['label', 'gender', 'profession', 'ideology_binary', 'ideology_multiclass']

    group = df.groupby(by = columns_to_group_by_user, dropna = False, observed = True, sort = False)

    # Custom df per user
    df_users = group[columns_to_group_by_user].agg(func = ['count'], as_index = False, observed = True).index.to_frame (index = False)

    merged_fields = []

    pbar = tqdm(df_users.iterrows(), total = df_users.shape[0], desc = "merging users")

    for index, row in pbar:
        df_user = df[(df['label'] == row['label'])]
        merged_fields.append({**row, **{field: ' [SEP] '.join (df_user[field].fillna ('')) for field in ['tweet']}})

    df = pd.DataFrame (merged_fields)
    return df

train_df = aggregate_users(train_df)
test_df = aggregate_users(test_df)


merging users: 100%|██████████| 314/314 [00:00<00:00, 446.37it/s]
merging users: 100%|██████████| 101/101 [00:00<00:00, 1267.77it/s]


In [19]:
corpus = []
for i in train_df.tweet:
    corpus.extend([sen.split(' ') for sen in i.split('[SEP]')])

for i in test_df.tweet:
    corpus.extend([sen.split(' ') for sen in i.split('[SEP]')])
print(len(corpus))

42730


In [60]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [62]:
model.wv['la'].shape

(100,)

In [78]:
def return_wvecs(train_df):
    x_train = []
    for sen in tqdm(train_df.tweet):
        
        sens = []
        for ss in sen.split('[SEP]'):
            sens.extend(ss.split(' '))
        wvs = []
        for w in sens:
            wvs.append(model.wv[w])
        wvs = np.asarray(wvs)
        x_train.append(np.mean(wvs, axis=0))
        
    x_train = np.asarray(x_train)
    return x_train

xtrain = return_wvecs(train_df)
xtest = return_wvecs(test_df)

100%|██████████| 314/314 [00:02<00:00, 153.05it/s]
100%|██████████| 101/101 [00:00<00:00, 349.18it/s]


In [91]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [86]:
ytrain = train_df.gender
ytest = test_df.gender

In [89]:
m = SVC(class_weight='balanced')

m.fit(xtrain, ytrain)
predictions = m.predict(xtest)
cm =confusion_matrix(ytest, predictions)
cr = classification_report(ytest, predictions, zero_division = 0, output_dict=True)
print(cm)
print(cr['weighted avg']['f1-score'])


[[23 25]
 [13 40]]
0.6160189868865822
{'female': {'precision': 0.6388888888888888, 'recall': 0.4791666666666667, 'f1-score': 0.5476190476190476, 'support': 48}, 'male': {'precision': 0.6153846153846154, 'recall': 0.7547169811320755, 'f1-score': 0.6779661016949153, 'support': 53}, 'accuracy': 0.6237623762376238, 'macro avg': {'precision': 0.6271367521367521, 'recall': 0.6169418238993711, 'f1-score': 0.6127925746569814, 'support': 101}, 'weighted avg': {'precision': 0.6265549631886266, 'recall': 0.6237623762376238, 'f1-score': 0.6160189868865822, 'support': 101}}


In [96]:

f1s = []
for label in ['gender', 	'profession' ,	'ideology_binary', 	'ideology_multiclass']:
    ytrain = train_df[label]
    ytest = test_df[label]
    m = SVC(class_weight='balanced')

    m.fit(xtrain, ytrain)
    predictions = m.predict(xtest)
    cm =confusion_matrix(ytest, predictions)
    
    score = f1_score(ytest, predictions, average='micro')
    cr = classification_report(ytest, predictions, output_dict=True)
    f1s.append(cr['weighted avg']['f1-score'])

print(f'| w2v | {f1s[0]:.4f} | {f1s[1]:.4f} | {f1s[2]:.4f} | {f1s[3]:.4f} |')


| w2v | 0.6160 | 0.8516 | 0.7190 | 0.4978 |


|model| f1-gender | f1-profession | f1-ib | f1-im|
|---|---|---|---|---|
| w2v | 0.6160 | 0.8516 | 0.7190 | 0.4978 |


In [None]:
References:
- https://radimrehurek.com/gensim/models/word2vec.html