In [62]:
import os
import xml.etree.ElementTree as et
import pandas as pd
import numpy as np
import re

In [70]:
reg_expressions={
    'Emojis':r'[\U0001f600-\U0001f650]',
    'hashtags':r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    'Mentions':r'(?:@[\w_]+)',
    'URLs':r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    'RTs':r"(RT|via)((?:\b\W*@\w+)+)"
}

In [71]:
labels={}
df={}
tag_df={}
for tipo in ['training','test']:
    tweets={}
    labels[tipo]={}
    for doc in os.listdir(tipo):
        if 'xml' in doc:
            with open(tipo+'/'+doc,'r') as xml_doc:
                tweets[doc.strip('.xml')]=' '.join([tweet.replace(']]></document>','') \
                                   for tweet in xml_doc.read().split('<document><![CDATA[')[1:]])\
                .replace('\n\t\t',' || ')
        else:
            with open(tipo+'/'+doc,'r') as txt_doc:
                labels[tipo]=pd.read_csv(txt_doc,sep=':::',header=None,names=['sexo','pais'],engine='python')
    
    
    df[tipo]=pd.DataFrame(list(tweets.values()),index=list(tweets.keys()),columns=['tweets'])
    

    
    tag_df[tipo]={}
    for (key,reg_ex) in reg_expressions.items():
        aux_list=[]
        for index,row in df[tipo].iterrows():
            cuenta = (len(re.findall(reg_ex, row['tweets'])))
            aux_list.append(cuenta)
        aux_list=np.array(aux_list)
        max_aux_list=aux_list.max()
        if max_aux_list>0:
            aux_list=aux_list/max_aux_list
        tag_df[tipo][key]=aux_list
    tag_df[tipo] = pd.DataFrame(tag_df[tipo],index=df[tipo].index)
    labels[tipo]=df[tipo].join(labels[tipo],how='inner')[['sexo','pais']]
    max_len=df['training']['tweets'].str.len().max()
    tag_df[tipo]['longitud']=df[tipo]['tweets'].str.len()/max_len
    print(df[tipo].shape,tag_df[tipo].shape,labels[tipo].shape)

(2800, 1) (2800, 6) (2800, 2)
(1400, 1) (1400, 6) (1400, 2)


In [72]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

models=[
    MultinomialNB(),
    LinearSVC(),
    RandomForestClassifier(100,max_depth=25)
]

In [81]:
y_train={}
y_test={}
X_train={}
X_test={}
vectorizers={
    'sexo':TfidfVectorizer(analyzer="word", stop_words=stopwords.words('spanish'),\
                           ngram_range=(1,2), max_df=0.90, min_df=0,max_features=10000),
    'pais':TfidfVectorizer(analyzer="word",\
                           ngram_range=(1,2), max_df=0.90, min_df=0,max_features=20000)
}

for label in ['sexo','pais']:
    y_train[label]=labels['training'][label].values
    y_test[label]=labels['test'][label].values

    tfidfv=vectorizers[label]

    X_train[label]=tfidfv.fit_transform(df['training']['tweets'].values)
    X_test[label]=tfidfv.transform(df['test']['tweets'].values)

    X_train[label] = np.hstack([X_train[label].toarray(),tag_df['training'].values])
    X_test[label] = np.hstack([X_test[label].toarray(),tag_df['test'].values])
 

In [82]:
for label in ['sexo','pais']:
    for model in models:
        print(label+':')
        model.fit(X_train[label],y_train[label].ravel())
        print('\t'+str(model.__class__).split('.')[-1].split("'")[0],': '\
              ,accuracy_score(y_test[label].ravel(),model.predict(X_test[label])))

sexo:
	MultinomialNB :  0.6871428571428572
sexo:
	LinearSVC :  0.7614285714285715
sexo:
	RandomForestClassifier :  0.73
pais:
	MultinomialNB :  0.8328571428571429
pais:
	LinearSVC :  0.9335714285714286
pais:
	RandomForestClassifier :  0.93
