In [21]:
import pandas as pd
from tqdm import tqdm 
import numpy as np



In [32]:
train_df = pd.read_csv('../data/training.xls')
test_df = pd.read_csv('../data/development.csv')

def aggregate_users(df):
    columns_to_group_by_user = ['label', 'gender', 'profession', 'ideology_binary', 'ideology_multiclass']

    group = df.groupby(by = columns_to_group_by_user, dropna = False, observed = True, sort = False)

    # Custom df per user
    df_users = group[columns_to_group_by_user].agg(func = ['count'], as_index = False, observed = True).index.to_frame (index = False)

    merged_fields = []

    pbar = tqdm(df_users.iterrows(), total = df_users.shape[0], desc = "merging users")

    for index, row in pbar:
        df_user = df[(df['label'] == row['label'])]
        merged_fields.append({**row, **{field: ' [SEP] '.join (df_user[field].fillna ('')) for field in ['tweet']}})

    df = pd.DataFrame (merged_fields)
    return df

train_df = aggregate_users(train_df)
test_df = aggregate_users(test_df)


merging users: 100%|██████████| 314/314 [00:03<00:00, 95.51it/s] 
merging users: 100%|██████████| 101/101 [00:00<00:00, 241.25it/s]


In [18]:
test_df

Unnamed: 0,label,gender,profession,ideology_binary,ideology_multiclass,tweet
0,@user106,male,politician,right,moderate_right,@user Sobre su estado y el gasto telefónico no...
1,@user180,male,journalist,right,moderate_right,Gracias a Dios que sale @user en la foto. En r...
2,@user226,female,politician,right,moderate_right,"No, a ver, q la chica era encantadora. Solo di..."
3,@user23,female,politician,left,moderate_left,El TC alemán pide aprobar una ley para fijar u...
4,@user237,male,journalist,right,moderate_right,He criticado a De Quintos por fichar por [POLI...
5,@user250,male,politician,left,moderate_left,Casado ha dicho que @user es un presidente ile...
6,@user280,male,politician,left,moderate_left,"Hace cuatro años a esta misma hora, sabíamos e..."
7,@user295,male,politician,left,moderate_left,La crispación política del Congreso no es más ...
8,@user332,male,politician,right,right,"[POLITICAL_PARTY], [POLITICAL_PARTY] y C’s fir..."
9,@user334,female,politician,right,right,El trizquierdito [POLITICAL_PARTY]-PODEMOS-[PO...


In [15]:
#!pip install stop-words
from stop_words import get_stop_words
stop_words = get_stop_words('spanish')

In [72]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# create a bag of words for only unigrams
#cv = CountVectorizer(analyzer = 'word',ngram_range=(1,3), stop_words=stop_words)
tfidf = TfidfVectorizer(analyzer = 'word',ngram_range=(1,3), stop_words=stop_words)

X_train_cv = tfidf.fit_transform(train_df.tweet)
X_test_cv = tfidf.transform(test_df.tweet)

In [71]:
tfidf.get_feature_names_out()

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names_out'

In [73]:
# import libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report

# train naive bayes classifier
fname = 'tfidf + 1-3gram'

f1s = []
for label in ['gender', 	'profession' ,	'ideology_binary', 	'ideology_multiclass']:
    y_train = train_df[label]
    y_test = test_df[label]
    clf = MultinomialNB()
    clf.fit(X_train_cv, y_train)
    # create predictions
    y_pred = clf.predict(X_test_cv)
    
    # find f-1 score
    score = f1_score(y_test, y_pred, average='micro')
    #print(f'{label} : F-1 score : {np.round(score,4)}')
    cr = classification_report(y_test, y_pred, output_dict=True)
    f1s.append(cr['weighted avg']['f1-score'])

print(f'{fname} | {f1s[0]:.4f} | {f1s[1]:.4f} | {f1s[2]:.4f} | {f1s[3]:.4f}')

#print('classification report: \n {}')

  _warn_prf(average, modifier, msg_start, len(result))


tfidf + 1-3gram | 0.8142 | 0.7972 | 0.8459 | 0.7568


  _warn_prf(average, modifier, msg_start, len(result))


|model| f1-gender | f1-profession | f1-ib | f1-im|
|---|---|---|---|---|
NB + 1gram | 0.9802 | 0.9902 | 1.0000 | 0.9796
NB + 2gram | 1.0000 | 0.9902 | 1.0000 | 1.0000
NB + 3gram | 1.0000 | 0.9902 | 1.0000 | 1.0000
NB + 1-3gram | 1.0000 | 0.9902 | 1.0000 | 1.0000
tfidf + 1gram | 0.3612 | 0.7972 | 0.4473 | 0.6923
tfidf + 2gram | 1.0000 | 0.7972 | 1.0000 | 0.8193
tfidf + 3gram | 1.0000 | 0.7972 | 1.0000 | 0.8319
tfidf + 1-3gram | 0.8142 | 0.7972 | 0.8459 | 0.7568
