In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show up to 1000 rows when a DataFrame is displayed.
# The full option name is used: abbreviated names only work through pandas'
# partial matching and can break if a similarly named option is added.
pd.set_option('display.max_rows', 1000)

# Show up to 50 columns when a DataFrame is displayed (a column count, not a width)
pd.set_option('display.max_columns', 50)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

def feature_extraction(dataframe, features, random_state=None):
    """Reduce the feature columns of ``dataframe`` using random-forest importances.

    Every column except 'class', 'id' and 'annotator' is treated as a feature.
    A random forest is fitted on those features against 'class', and
    ``SelectFromModel`` keeps at most ``features`` of them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'class', 'id' and 'annotator'.
    features : int
        Passed to ``SelectFromModel`` as ``max_features``.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. With the default ``None`` the
        forest is unseeded, so the selected columns can differ between runs;
        pass an integer to make the selection repeatable.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns followed by 'id', 'class' and
        'annotator', carrying the same index as ``dataframe``.
    """
    X_esc = dataframe.drop(columns=['class', 'id', 'annotator'])
    Y = dataframe['class']

    # Model training
    predictor = RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=random_state
    )
    predictor.fit(X_esc, Y)

    # Dataset with selected features
    model = SelectFromModel(predictor, prefit=True, max_features=features)
    X_new = model.transform(X_esc)

    feature_idx = model.get_support()
    feature_name = X_esc.columns[feature_idx]

    # Reuse the caller's index: the metadata columns below are assigned by
    # index alignment, so a fresh 0..n-1 index would misalign them (or fill
    # them with NaN) whenever `dataframe` does not have a default index.
    X_filtered = pd.DataFrame(X_new, columns=feature_name, index=dataframe.index)
    X_filtered['id'] = dataframe['id']
    X_filtered['class'] = dataframe['class']
    X_filtered['annotator'] = dataframe['annotator']
    return X_filtered

In [4]:
# Gold-standard table: the path is the base name plus the extension
file_name = 'sentiment_polarity/polarity_gold_lsa_topics'
extension = '.csv'
df_gold = pd.read_csv(f'{file_name}{extension}', sep=',')
df_gold.head()

Unnamed: 0,id,TOPIC0,TOPIC1,TOPIC2,TOPIC3,TOPIC4,TOPIC5,TOPIC6,TOPIC7,TOPIC8,TOPIC9,TOPIC10,TOPIC11,TOPIC12,TOPIC13,TOPIC14,TOPIC15,TOPIC16,TOPIC17,TOPIC18,TOPIC19,TOPIC20,TOPIC21,TOPIC22,TOPIC23,...,TOPIC1176,TOPIC1177,TOPIC1178,TOPIC1179,TOPIC1180,TOPIC1181,TOPIC1182,TOPIC1183,TOPIC1184,TOPIC1185,TOPIC1186,TOPIC1187,TOPIC1188,TOPIC1189,TOPIC1190,TOPIC1191,TOPIC1192,TOPIC1193,TOPIC1194,TOPIC1195,TOPIC1196,TOPIC1197,TOPIC1198,TOPIC1199,class
0,4518,0.047554,-0.000308,0.023648,0.084735,0.000874,0.063103,-0.005063,-0.009235,0.045759,-0.051397,0.102817,-0.01351,0.076356,0.037979,0.05534,0.014945,0.004636,0.037343,0.023181,-0.035382,-0.015663,-0.012293,-0.030427,-0.066387,...,-0.007484,-0.004622,0.01443,0.022647,0.000322,0.007454,0.022939,0.010841,-0.015993,0.003491,0.013125,0.005661,-0.000679,-0.002385,0.002657,-0.000948,0.001994,-0.02017,-0.005735,0.002075,-0.008209,0.009179,-0.00057,0.005638,pos
1,10415,0.057894,-0.002123,0.041045,0.059079,0.003419,0.026197,0.008809,-0.001602,0.037326,-0.037092,0.014471,-0.007263,0.022827,0.000362,-0.035306,-0.028569,-0.012759,-0.048049,0.006494,-0.025306,0.004487,-0.038562,-0.012656,-0.020215,...,0.076282,0.001432,-0.027812,-0.025739,-0.02213,-0.021077,-0.019178,0.077735,0.084677,-0.022941,-0.005463,0.035403,0.055995,-0.017173,0.018196,0.015433,0.001089,-0.024572,0.094193,-0.088402,0.015108,0.007066,-0.042577,-0.058843,neg
2,7098,0.226327,0.041921,0.124921,0.291891,-0.030388,0.082626,-0.037958,-0.291264,0.184703,-0.155528,0.078627,0.249147,0.169286,-0.218392,-0.209313,-0.574267,0.040396,-1.118784,-0.225254,-0.044344,-0.337657,0.143748,-0.004862,0.179439,...,-0.017312,0.032866,-0.062766,-0.023922,-0.057547,0.089804,0.00543,-0.035278,0.02802,0.029487,-0.053324,-0.000339,0.05587,0.073867,-0.076718,0.000163,0.047404,-0.025245,-0.004108,0.027324,-0.038161,-0.021096,-0.018557,0.053247,neg
3,4396,0.011175,-0.001934,0.005822,0.014736,-0.001519,0.000643,0.01077,-0.004277,0.00101,-0.005591,0.011212,-0.006614,0.005003,-0.008331,0.001702,-0.005254,0.000745,0.000625,0.007746,0.004145,0.007582,0.011021,0.0136,-0.009401,...,-0.055257,0.002263,0.020749,-0.019761,0.014488,0.050875,0.037721,0.022402,-0.018292,-0.009705,0.048978,-0.014046,-0.001058,0.003672,-0.069564,0.025773,0.011295,0.048014,-0.02264,-0.048314,0.035679,0.012249,-0.026116,-0.004993,pos
4,2812,0.023912,0.002982,0.008951,0.038452,0.015734,0.018588,0.004741,0.005001,-0.004422,-0.014483,0.027652,0.007508,0.023304,-0.00714,-0.002079,-0.017393,-0.019381,-0.01003,0.023909,0.016616,-0.037448,-0.000804,-0.024296,0.018758,...,0.01808,-0.005512,0.020392,0.008568,-0.040549,-0.013891,0.045739,-0.077711,-0.088563,0.009808,-0.033073,0.024886,-0.069634,-0.018741,0.027686,0.057021,0.079538,-0.105252,0.011196,-0.060068,-0.029165,0.006014,-0.042851,-0.002392,pos


In [5]:
# Multi-annotator (mturk) table: the path is the base name plus the extension
file_name = 'sentiment_polarity/polarity_mturk_lsa_topics'
extension = '.csv'
df_ma = pd.read_csv(f'{file_name}{extension}', sep=',')
df_ma.head()

Unnamed: 0,id,annotator,TOPIC0,TOPIC1,TOPIC2,TOPIC3,TOPIC4,TOPIC5,TOPIC6,TOPIC7,TOPIC8,TOPIC9,TOPIC10,TOPIC11,TOPIC12,TOPIC13,TOPIC14,TOPIC15,TOPIC16,TOPIC17,TOPIC18,TOPIC19,TOPIC20,TOPIC21,TOPIC22,...,TOPIC1176,TOPIC1177,TOPIC1178,TOPIC1179,TOPIC1180,TOPIC1181,TOPIC1182,TOPIC1183,TOPIC1184,TOPIC1185,TOPIC1186,TOPIC1187,TOPIC1188,TOPIC1189,TOPIC1190,TOPIC1191,TOPIC1192,TOPIC1193,TOPIC1194,TOPIC1195,TOPIC1196,TOPIC1197,TOPIC1198,TOPIC1199,class
0,4518,A2HD5XMM48KKJW,0.047554,-0.000308,0.023648,0.084735,0.000874,0.063103,-0.005063,-0.009235,0.045759,-0.051397,0.102817,-0.01351,0.076356,0.037979,0.05534,0.014945,0.004636,0.037343,0.023181,-0.035382,-0.015663,-0.012293,-0.030427,...,-0.007484,-0.004622,0.01443,0.022647,0.000322,0.007454,0.022939,0.010841,-0.015993,0.003491,0.013125,0.005661,-0.000679,-0.002385,0.002657,-0.000948,0.001994,-0.02017,-0.005735,0.002075,-0.008209,0.009179,-0.00057,0.005638,pos
1,10415,A2HD5XMM48KKJW,0.057894,-0.002123,0.041045,0.059079,0.003419,0.026197,0.008809,-0.001602,0.037326,-0.037092,0.014471,-0.007263,0.022827,0.000362,-0.035306,-0.028569,-0.012759,-0.048049,0.006494,-0.025306,0.004487,-0.038562,-0.012656,...,0.076282,0.001432,-0.027812,-0.025739,-0.02213,-0.021077,-0.019178,0.077735,0.084677,-0.022941,-0.005463,0.035403,0.055995,-0.017173,0.018196,0.015433,0.001089,-0.024572,0.094193,-0.088402,0.015108,0.007066,-0.042577,-0.058843,neg
2,7098,A2HD5XMM48KKJW,0.226327,0.041921,0.124921,0.291891,-0.030388,0.082626,-0.037958,-0.291264,0.184703,-0.155528,0.078627,0.249147,0.169286,-0.218392,-0.209313,-0.574267,0.040396,-1.118784,-0.225254,-0.044344,-0.337657,0.143748,-0.004862,...,-0.017312,0.032866,-0.062766,-0.023922,-0.057547,0.089804,0.00543,-0.035278,0.02802,0.029487,-0.053324,-0.000339,0.05587,0.073867,-0.076718,0.000163,0.047404,-0.025245,-0.004108,0.027324,-0.038161,-0.021096,-0.018557,0.053247,pos
3,4396,A2HD5XMM48KKJW,0.011175,-0.001934,0.005822,0.014736,-0.001519,0.000643,0.01077,-0.004277,0.00101,-0.005591,0.011212,-0.006614,0.005003,-0.008331,0.001702,-0.005254,0.000745,0.000625,0.007746,0.004145,0.007582,0.011021,0.0136,...,-0.055257,0.002263,0.020749,-0.019761,0.014488,0.050875,0.037721,0.022402,-0.018292,-0.009705,0.048978,-0.014046,-0.001058,0.003672,-0.069564,0.025773,0.011295,0.048014,-0.02264,-0.048314,0.035679,0.012249,-0.026116,-0.004993,neg
4,2812,A2HD5XMM48KKJW,0.023912,0.002982,0.008951,0.038452,0.015734,0.018588,0.004741,0.005001,-0.004422,-0.014483,0.027652,0.007508,0.023304,-0.00714,-0.002079,-0.017393,-0.019381,-0.01003,0.023909,0.016616,-0.037448,-0.000804,-0.024296,...,0.01808,-0.005512,0.020392,0.008568,-0.040549,-0.013891,0.045739,-0.077711,-0.088563,0.009808,-0.033073,0.024886,-0.069634,-0.018741,0.027686,0.057021,0.079538,-0.105252,0.011196,-0.060068,-0.029165,0.006014,-0.042851,-0.002392,pos


In [6]:
# Report the column count before and after the feature selection (max_features=50)
print('Antes:', df_ma.shape[1])
df_original = feature_extraction(df_ma, 50)
print('Despues:', df_original.shape[1])

Antes: 1203
Despues: 53


In [7]:
# Keep only the rows produced by the annotators listed below.
# One filtered slice is taken per annotator and all slices are concatenated in
# a single call, so rows stay grouped in the order of `new_annotators` and the
# frame is not re-copied on every iteration or seeded with an empty frame.
new_annotators = ['A207OR9LV0PAPY', 'APPTLVXZD0SKE', 'ARX0S1CIDJLOX']
df_annotators = pd.concat(
    [df_original[df_original['annotator'] == annotator] for annotator in new_annotators]
)
df_annotators.head()

Unnamed: 0,TOPIC1,TOPIC4,TOPIC6,TOPIC8,TOPIC10,TOPIC11,TOPIC12,TOPIC13,TOPIC15,TOPIC16,TOPIC17,TOPIC18,TOPIC19,TOPIC21,TOPIC22,TOPIC23,TOPIC24,TOPIC25,TOPIC27,TOPIC28,TOPIC29,TOPIC30,TOPIC36,TOPIC51,TOPIC54,...,TOPIC77,TOPIC96,TOPIC99,TOPIC102,TOPIC104,TOPIC108,TOPIC114,TOPIC117,TOPIC120,TOPIC126,TOPIC130,TOPIC139,TOPIC153,TOPIC156,TOPIC161,TOPIC168,TOPIC182,TOPIC206,TOPIC216,TOPIC256,TOPIC412,TOPIC698,id,class,annotator
29,0.026265,0.005836,-0.058384,0.068099,0.046676,0.068632,0.045164,-0.149704,-0.146824,-0.047272,-0.130747,0.298686,-0.2308,-0.508297,0.010583,0.065435,0.277645,0.199375,0.057364,0.020261,-0.178709,-0.010904,-0.078122,-0.047526,0.008761,...,0.109632,0.056707,0.075165,-0.251063,0.082748,-0.06642,-0.030173,0.09689,-0.309995,-0.253182,0.14344,0.031489,0.079333,0.079727,-0.038151,-0.057568,-0.114274,-0.149631,-0.010173,-0.069484,-0.126252,-0.194524,7861,pos,A207OR9LV0PAPY
30,-0.652935,0.010219,0.005537,-0.02281,-0.015517,0.030431,-0.016695,0.034476,0.016158,0.019823,-0.038026,-0.013199,-0.001055,0.027725,-0.004997,-0.098327,0.037022,0.01049,-0.006502,0.008278,0.044648,-0.058749,-0.010941,0.005625,0.104868,...,0.123959,0.305288,-0.256859,-0.001266,-0.100359,0.057512,-0.037252,-0.030529,-0.032375,-0.147131,0.015536,-0.046338,0.028774,-0.07427,0.077002,0.060906,-0.049207,0.037693,0.145181,-0.043245,-0.051528,-0.132053,5133,pos,A207OR9LV0PAPY
31,0.046977,6.6e-05,-0.006413,0.121875,-0.006966,0.041859,0.060195,-0.097424,-0.042616,0.002102,-0.029026,0.043585,-0.011951,0.073692,0.066541,-0.005782,-0.09624,0.015593,-0.061484,-0.015723,0.126479,-0.029079,-0.114171,-0.18402,-0.280568,...,-0.114054,-0.369638,0.248372,-0.093069,-0.188095,-0.050998,-0.12989,0.331439,-0.179582,0.004077,-0.023437,0.051633,-0.053204,-0.038386,0.089091,0.014311,0.058926,0.004707,-0.006507,-0.025703,-0.055696,-0.037273,8474,pos,A207OR9LV0PAPY
32,0.006452,-0.026846,-0.018023,0.040396,0.02732,0.037453,0.054915,-0.023562,0.003152,0.025596,0.020517,0.053841,0.012158,-0.037929,0.029414,-0.084596,-0.042488,0.050069,-0.024643,-0.024533,0.059961,-0.039426,0.018915,0.055242,-0.036203,...,-0.197155,0.052215,0.085191,-0.044674,-0.081145,0.011444,0.05463,0.095793,0.120825,-0.110421,0.10745,-0.268115,0.210317,-0.061487,-0.063853,-0.010133,-0.072267,0.073333,0.009325,0.046392,-0.108627,0.142114,8774,pos,A207OR9LV0PAPY
33,-0.599462,0.02029,-0.031644,0.042543,0.034092,0.018871,0.139619,-0.044247,-0.041552,-0.090757,-0.044764,0.002698,0.017579,0.017584,0.092124,-0.114673,-0.038205,-0.056928,-0.124534,-0.327542,0.533285,-0.084217,-0.53108,0.083895,-0.520896,...,0.026166,-0.248033,-0.271753,0.002627,0.066152,-0.066389,-0.261868,0.145192,-0.028575,-0.077346,0.145505,0.076858,-0.023733,0.118554,0.16545,-0.17114,-0.078592,0.022612,0.009254,0.358688,-0.131926,-0.031723,9387,pos,A207OR9LV0PAPY


In [10]:
# Number of distinct values in the 'annotator' column (missing values counted too)
print('Total Annotators:', df_original['annotator'].nunique(dropna=False))

Total Annotators: 203


In [209]:
# Keep only the experiments (ids) that have at least 3 rows in df_annotators.
# The per-id row count is computed once and mapped back onto every row, which
# replaces one full-frame scan plus one concat per id.
rows_per_id = df_annotators['id'].map(df_annotators['id'].value_counts())
df = df_annotators[rows_per_id >= 3]

# Ids that survived the filter, in order of first appearance
experiments = df.id.unique()
df = df.sort_values(by=['id'], ascending=True)
df.head()

Unnamed: 0,TOPIC1,TOPIC4,TOPIC5,TOPIC6,TOPIC8,TOPIC9,TOPIC10,TOPIC12,TOPIC13,TOPIC15,TOPIC16,TOPIC17,TOPIC18,TOPIC19,TOPIC22,TOPIC23,TOPIC24,TOPIC25,TOPIC26,TOPIC27,TOPIC28,TOPIC29,TOPIC30,TOPIC36,TOPIC51,...,TOPIC68,TOPIC74,TOPIC77,TOPIC96,TOPIC99,TOPIC108,TOPIC114,TOPIC118,TOPIC120,TOPIC126,TOPIC130,TOPIC139,TOPIC149,TOPIC163,TOPIC180,TOPIC182,TOPIC216,TOPIC310,TOPIC330,TOPIC384,TOPIC478,TOPIC568,id,class,annotator
9947,0.797293,-0.393835,0.010553,-0.765545,-0.067711,0.092816,0.118772,-0.124604,0.031879,0.142809,0.040363,-0.09034,0.063224,-0.045906,-0.043229,-0.028701,-0.069175,0.027071,-0.111294,0.145857,0.036626,0.004328,-0.04177,-0.006973,-0.256675,...,-0.274324,-0.031331,-0.136477,-0.134874,-0.124342,-0.069364,-0.031968,0.079208,0.144827,0.080146,-0.069457,0.106377,-0.015777,-0.121552,0.142521,0.017606,0.032022,-0.014918,0.020784,-0.030515,0.016883,-0.001768,4,pos,ARX0S1CIDJLOX
9932,0.797293,-0.393835,0.010553,-0.765545,-0.067711,0.092816,0.118772,-0.124604,0.031879,0.142809,0.040363,-0.09034,0.063224,-0.045906,-0.043229,-0.028701,-0.069175,0.027071,-0.111294,0.145857,0.036626,0.004328,-0.04177,-0.006973,-0.256675,...,-0.274324,-0.031331,-0.136477,-0.134874,-0.124342,-0.069364,-0.031968,0.079208,0.144827,0.080146,-0.069457,0.106377,-0.015777,-0.121552,0.142521,0.017606,0.032022,-0.014918,0.020784,-0.030515,0.016883,-0.001768,4,pos,APPTLVXZD0SKE
9942,0.797293,-0.393835,0.010553,-0.765545,-0.067711,0.092816,0.118772,-0.124604,0.031879,0.142809,0.040363,-0.09034,0.063224,-0.045906,-0.043229,-0.028701,-0.069175,0.027071,-0.111294,0.145857,0.036626,0.004328,-0.04177,-0.006973,-0.256675,...,-0.274324,-0.031331,-0.136477,-0.134874,-0.124342,-0.069364,-0.031968,0.079208,0.144827,0.080146,-0.069457,0.106377,-0.015777,-0.121552,0.142521,0.017606,0.032022,-0.014918,0.020784,-0.030515,0.016883,-0.001768,4,pos,A207OR9LV0PAPY
8523,-0.002345,-0.017163,0.027796,-0.00252,0.001573,-0.015614,0.017306,0.018168,-0.031097,-0.03302,-0.025358,-0.03425,0.021368,0.013682,-0.006685,0.005905,-0.032409,0.00816,-0.064073,-0.01503,-0.013863,0.071986,-0.054235,-0.023188,0.055033,...,0.083135,-0.135178,-0.034457,-0.014758,0.024719,-0.025759,0.040669,0.097837,0.026123,0.007125,-0.066957,0.054315,-0.008519,-0.035502,-0.060703,0.05176,-0.048917,-0.071083,0.030925,0.036526,0.025783,-0.012964,26,pos,ARX0S1CIDJLOX
8518,-0.002345,-0.017163,0.027796,-0.00252,0.001573,-0.015614,0.017306,0.018168,-0.031097,-0.03302,-0.025358,-0.03425,0.021368,0.013682,-0.006685,0.005905,-0.032409,0.00816,-0.064073,-0.01503,-0.013863,0.071986,-0.054235,-0.023188,0.055033,...,0.083135,-0.135178,-0.034457,-0.014758,0.024719,-0.025759,0.040669,0.097837,0.026123,0.007125,-0.066957,0.054315,-0.008519,-0.035502,-0.060703,0.05176,-0.048917,-0.071083,0.030925,0.036526,0.025783,-0.012964,26,pos,APPTLVXZD0SKE


In [210]:
# Gold-standard (id, class) pairs for the retained experiments, ordered by id
df_experiments = (
    df_gold[df_gold['id'].isin(experiments)]
    .sort_values(by=['id'], ascending=True)
    .loc[:, ['id','class']]
    .reset_index(drop=True)
)
df_experiments.head()

Unnamed: 0,id,class
0,4,pos
1,26,pos
2,53,pos
3,87,pos
4,103,pos


In [211]:
# Sort Final Annotators
sorted_df = df.sort_values(by=['id'], ascending=True)
sorted_df = sorted_df.groupby('id').first()
sorted_df = sorted_df.drop(columns=['annotator','class'])
sorted_df.head()

Unnamed: 0_level_0,TOPIC1,TOPIC4,TOPIC5,TOPIC6,TOPIC8,TOPIC9,TOPIC10,TOPIC12,TOPIC13,TOPIC15,TOPIC16,TOPIC17,TOPIC18,TOPIC19,TOPIC22,TOPIC23,TOPIC24,TOPIC25,TOPIC26,TOPIC27,TOPIC28,TOPIC29,TOPIC30,TOPIC36,TOPIC51,TOPIC54,TOPIC60,TOPIC61,TOPIC68,TOPIC74,TOPIC77,TOPIC96,TOPIC99,TOPIC108,TOPIC114,TOPIC118,TOPIC120,TOPIC126,TOPIC130,TOPIC139,TOPIC149,TOPIC163,TOPIC180,TOPIC182,TOPIC216,TOPIC310,TOPIC330,TOPIC384,TOPIC478,TOPIC568
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
4,0.797293,-0.393835,0.010553,-0.765545,-0.067711,0.092816,0.118772,-0.124604,0.031879,0.142809,0.040363,-0.09034,0.063224,-0.045906,-0.043229,-0.028701,-0.069175,0.027071,-0.111294,0.145857,0.036626,0.004328,-0.04177,-0.006973,-0.256675,-0.049645,0.138644,-0.07052,-0.274324,-0.031331,-0.136477,-0.134874,-0.124342,-0.069364,-0.031968,0.079208,0.144827,0.080146,-0.069457,0.106377,-0.015777,-0.121552,0.142521,0.017606,0.032022,-0.014918,0.020784,-0.030515,0.016883,-0.001768
26,-0.002345,-0.017163,0.027796,-0.00252,0.001573,-0.015614,0.017306,0.018168,-0.031097,-0.03302,-0.025358,-0.03425,0.021368,0.013682,-0.006685,0.005905,-0.032409,0.00816,-0.064073,-0.01503,-0.013863,0.071986,-0.054235,-0.023188,0.055033,0.006182,0.294874,0.176244,0.083135,-0.135178,-0.034457,-0.014758,0.024719,-0.025759,0.040669,0.097837,0.026123,0.007125,-0.066957,0.054315,-0.008519,-0.035502,-0.060703,0.05176,-0.048917,-0.071083,0.030925,0.036526,0.025783,-0.012964
53,0.001302,0.023859,0.046253,0.029121,0.056407,-0.040578,0.031937,0.068591,-0.07775,0.011058,0.022814,-0.027503,0.039172,0.007807,0.043122,0.048125,0.001618,-0.041988,0.045506,-0.005745,-0.010986,0.120074,-0.156283,-0.056757,-0.031636,-0.255593,0.133789,-0.358778,-0.220749,-0.234265,0.143818,0.088597,0.088208,0.010902,0.081143,-0.027483,-0.062268,0.002592,0.041539,0.074529,0.016827,0.044548,0.042335,-0.019066,0.028163,-0.081343,-0.183194,0.068228,0.066103,0.004491
87,0.773365,0.023813,-0.054795,0.075729,-0.007952,0.028159,-0.032129,0.010534,0.00209,-0.043997,-0.014801,-0.060688,0.092124,-0.005874,0.008212,0.006515,-0.102166,-0.149519,-0.23548,-0.083642,0.12967,-0.108762,0.008656,0.130491,-0.049099,0.088159,0.027713,0.006171,0.015313,-0.01063,-0.038219,0.055579,0.025519,-0.066877,-0.006714,0.06289,-0.169486,0.024563,0.143775,0.052771,-0.167482,-0.102534,-0.086773,0.10703,0.008302,-0.056895,0.18612,-0.045655,-0.135266,-0.065343
103,0.010929,0.0195,-0.005192,-0.011546,0.005507,0.000671,0.00809,0.005373,-0.039098,-0.001835,0.032736,-0.034811,0.005949,-0.001719,0.028419,0.032372,-0.004549,-0.022183,0.012779,-0.03975,0.010439,0.02068,0.026465,-0.020348,-0.041684,-0.006877,-0.052164,0.051695,-0.050243,-0.069645,0.092146,0.113016,-0.120556,-0.123408,-0.17726,0.081649,0.051347,-0.010201,-0.046867,0.070147,-0.012927,0.009324,0.04651,0.010164,-0.007048,0.055262,-0.082473,0.056088,0.047905,0.010368


In [212]:
# Get Annotators Responses
annotators = df.pivot(index='id', columns='annotator', values='class')
annotators = annotators.sort_values(by=['id'], ascending=True)
annotators.head()

annotator,A207OR9LV0PAPY,APPTLVXZD0SKE,ARX0S1CIDJLOX
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,pos,pos,pos
26,pos,pos,pos
53,pos,pos,pos
87,pos,pos,pos
103,pos,pos,pos


In [213]:
# Gold class of every experiment, keyed by id. Built once so the loop does not
# rescan df_experiments for each row; if an id repeats, its first entry is used.
gold_class_by_id = df_experiments.drop_duplicates(subset='id').set_index('id')['class']

# One Series per experiment: the annotators' responses plus the gold class under 'y'.
# An id missing from df_experiments raises KeyError.
labels_array = []
for experiment_id, responses in annotators.iterrows():
    # Work on a copy so the row handed out by iterrows() is not modified
    responses = responses.copy()
    responses['y'] = gold_class_by_id.loc[experiment_id]
    labels_array.append(responses)

In [214]:
# Stack the per-experiment Series into one frame; the column order is taken
# from the labels of the first Series
labels = pd.DataFrame(data=labels_array, columns=labels_array[0].index)
labels.head()

annotator,A207OR9LV0PAPY,APPTLVXZD0SKE,ARX0S1CIDJLOX,y
4,pos,pos,pos,pos
26,pos,pos,pos,pos
53,pos,pos,pos,pos
87,pos,pos,pos,pos
103,pos,pos,pos,pos


In [215]:
# Encode every column as booleans ('pos' -> True, anything else -> False) and
# print the percentage of True values per column.
# Columns that are already boolean are left alone, so re-running this cell
# does not turn every label into False.
for label in labels:
    if not pd.api.types.is_bool_dtype(labels[label]):
        labels[label] = labels[label] == 'pos'
    print(label, round(100*labels[label].sum()/len(labels)))
labels.head()

A207OR9LV0PAPY 66.0
APPTLVXZD0SKE 54.0
ARX0S1CIDJLOX 52.0
y 52.0


annotator,A207OR9LV0PAPY,APPTLVXZD0SKE,ARX0S1CIDJLOX,y
4,True,True,True,True
26,True,True,True,True
53,True,True,True,True
87,True,True,True,True
103,True,True,True,True


In [216]:
# Write the feature table and the label table as comma-separated files, without the index.
# NOTE(review): the gold-standard export cell further down writes to these same
# two paths, so whichever cell runs last wins - confirm that is intended.
sorted_df.to_csv(path_or_buf='../Sinteticas/sentiment-polarity/database_attr.csv', index=False, sep=',')
labels.to_csv(path_or_buf='../Sinteticas/sentiment-polarity/database_labels.csv', index=False, sep=',')

# Gold Standard Database

In [14]:
# Split the gold standard into a feature frame (no 'class' / 'id') and a
# single-column label frame whose column is named "y"
df_gold_attr = df_gold.drop(columns=['class', 'id'])
df_gold_labels = pd.DataFrame(df_gold['class']).rename(columns={"class": "y"})

In [15]:
# Encode the gold labels as booleans ('pos' -> True, anything else -> False).
# Columns that are already boolean are skipped, so re-running this cell does
# not turn every label into False.
for column in df_gold_labels.columns:
    if not pd.api.types.is_bool_dtype(df_gold_labels[column]):
        df_gold_labels[column] = df_gold_labels[column] == 'pos'

In [16]:
# Preview the first five rows of the gold-standard feature frame
df_gold_attr.head(n=5)

Unnamed: 0,TOPIC0,TOPIC1,TOPIC2,TOPIC3,TOPIC4,TOPIC5,TOPIC6,TOPIC7,TOPIC8,TOPIC9,TOPIC10,TOPIC11,TOPIC12,TOPIC13,TOPIC14,TOPIC15,TOPIC16,TOPIC17,TOPIC18,TOPIC19,TOPIC20,TOPIC21,TOPIC22,TOPIC23,TOPIC24,...,TOPIC1175,TOPIC1176,TOPIC1177,TOPIC1178,TOPIC1179,TOPIC1180,TOPIC1181,TOPIC1182,TOPIC1183,TOPIC1184,TOPIC1185,TOPIC1186,TOPIC1187,TOPIC1188,TOPIC1189,TOPIC1190,TOPIC1191,TOPIC1192,TOPIC1193,TOPIC1194,TOPIC1195,TOPIC1196,TOPIC1197,TOPIC1198,TOPIC1199
0,0.047554,-0.000308,0.023648,0.084735,0.000874,0.063103,-0.005063,-0.009235,0.045759,-0.051397,0.102817,-0.01351,0.076356,0.037979,0.05534,0.014945,0.004636,0.037343,0.023181,-0.035382,-0.015663,-0.012293,-0.030427,-0.066387,0.001208,...,0.023256,-0.007484,-0.004622,0.01443,0.022647,0.000322,0.007454,0.022939,0.010841,-0.015993,0.003491,0.013125,0.005661,-0.000679,-0.002385,0.002657,-0.000948,0.001994,-0.02017,-0.005735,0.002075,-0.008209,0.009179,-0.00057,0.005638
1,0.057894,-0.002123,0.041045,0.059079,0.003419,0.026197,0.008809,-0.001602,0.037326,-0.037092,0.014471,-0.007263,0.022827,0.000362,-0.035306,-0.028569,-0.012759,-0.048049,0.006494,-0.025306,0.004487,-0.038562,-0.012656,-0.020215,-0.018308,...,0.041451,0.076282,0.001432,-0.027812,-0.025739,-0.02213,-0.021077,-0.019178,0.077735,0.084677,-0.022941,-0.005463,0.035403,0.055995,-0.017173,0.018196,0.015433,0.001089,-0.024572,0.094193,-0.088402,0.015108,0.007066,-0.042577,-0.058843
2,0.226327,0.041921,0.124921,0.291891,-0.030388,0.082626,-0.037958,-0.291264,0.184703,-0.155528,0.078627,0.249147,0.169286,-0.218392,-0.209313,-0.574267,0.040396,-1.118784,-0.225254,-0.044344,-0.337657,0.143748,-0.004862,0.179439,0.027508,...,-0.075799,-0.017312,0.032866,-0.062766,-0.023922,-0.057547,0.089804,0.00543,-0.035278,0.02802,0.029487,-0.053324,-0.000339,0.05587,0.073867,-0.076718,0.000163,0.047404,-0.025245,-0.004108,0.027324,-0.038161,-0.021096,-0.018557,0.053247
3,0.011175,-0.001934,0.005822,0.014736,-0.001519,0.000643,0.01077,-0.004277,0.00101,-0.005591,0.011212,-0.006614,0.005003,-0.008331,0.001702,-0.005254,0.000745,0.000625,0.007746,0.004145,0.007582,0.011021,0.0136,-0.009401,0.011051,...,0.018782,-0.055257,0.002263,0.020749,-0.019761,0.014488,0.050875,0.037721,0.022402,-0.018292,-0.009705,0.048978,-0.014046,-0.001058,0.003672,-0.069564,0.025773,0.011295,0.048014,-0.02264,-0.048314,0.035679,0.012249,-0.026116,-0.004993
4,0.023912,0.002982,0.008951,0.038452,0.015734,0.018588,0.004741,0.005001,-0.004422,-0.014483,0.027652,0.007508,0.023304,-0.00714,-0.002079,-0.017393,-0.019381,-0.01003,0.023909,0.016616,-0.037448,-0.000804,-0.024296,0.018758,-0.003991,...,-0.027748,0.01808,-0.005512,0.020392,0.008568,-0.040549,-0.013891,0.045739,-0.077711,-0.088563,0.009808,-0.033073,0.024886,-0.069634,-0.018741,0.027686,0.057021,0.079538,-0.105252,0.011196,-0.060068,-0.029165,0.006014,-0.042851,-0.002392


In [17]:
# Write the gold-standard features and labels as comma-separated files, without the index.
# NOTE(review): these are the same two paths the annotator export cell above
# writes to, so running the notebook top to bottom replaces that export with
# the gold-standard one - confirm that is intended.
df_gold_attr.to_csv(path_or_buf='../Sinteticas/sentiment-polarity/database_attr.csv', index=False, sep=',')
df_gold_labels.to_csv(path_or_buf='../Sinteticas/sentiment-polarity/database_labels.csv', index=False, sep=',')