In [64]:
import pandas as pd
import xml.etree.ElementTree as ET

def getDF(path):
    """Flatten a review XML file into a DataFrame with one row per opinion.

    The document is read positionally: every child of the root is a review
    carrying a ``rid`` attribute; the review's first child holds the
    sentences; each sentence carries an ``id`` attribute, its first child
    holds the sentence text, and its second child (when present) holds the
    opinion elements, each with ``category`` and ``polarity`` attributes.

    A sentence that has no second child yields a single placeholder row with
    category ``'UNKNOWN#UNKNOWN'`` and polarity ``'UNKNOWN'``. A sentence
    whose opinion container exists but is empty yields no row.

    Parameters
    ----------
    path : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``rid, sid, text, category, polarity, attribute, entity``
        indexed 0..n-1. ``entity`` is the part of ``category`` before the
        first ``'#'`` and ``attribute`` the part after it; a category without
        ``'#'`` gets an empty ``attribute``. An input with no sentences gives
        an empty frame with the same columns.

    Raises
    ------
    KeyError
        If a review lacks ``rid``, a sentence lacks ``id``, or an opinion
        lacks ``category``/``polarity``.
    IndexError
        If a sentence element has no children at all.
    """
    columns = ['rid', 'sid', 'text', 'category', 'polarity', 'attribute', 'entity']
    root = ET.parse(path).getroot()

    # Collect plain tuples and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []
    # Elements are iterated/indexed directly; Element.getchildren() was
    # removed in Python 3.9.
    for review in root:
        if len(review) == 0:
            # A review without a sentences container contributes no rows.
            continue
        rid = review.attrib['rid']
        for sentence in review[0]:
            sid = sentence.attrib['id']
            text = sentence[0].text
            if len(sentence) > 1:
                labels = [(opinion.attrib['category'], opinion.attrib['polarity'])
                          for opinion in sentence[1]]
            else:
                labels = [('UNKNOWN#UNKNOWN', 'UNKNOWN')]
            for category, polarity in labels:
                # category is "<entity>#<attribute>"; split here so the
                # derived columns also work for empty input.
                entity, _, attribute = category.partition('#')
                records.append((rid, sid, text, category, polarity, attribute, entity))

    return pd.DataFrame(records, columns=columns)


In [65]:
# Parse the training and test XML files with getDF; both paths are relative
# to the notebook's working directory.
trainDF = getDF('ABSA-15_Laptops_Train_Data.xml')
testDF = getDF('ABSA15_Laptops_Test.xml')

In [81]:
# Non-null value count for each column of the training frame.
trainDF.count()

rid          2314
sid          2314
text         2314
category     2314
polarity     2314
attribute    2314
entity       2314
dtype: int64

In [82]:
# Non-null value count for each column of the test frame.
testDF.count()

rid          1066
sid          1066
text         1066
category     1066
polarity     1066
attribute    1066
entity       1066
dtype: int64

In [66]:
# First five training rows carrying the placeholder category getDF assigns
# to sentences that have no opinion element.
trainDF.loc[trainDF['category'] == 'UNKNOWN#UNKNOWN'].head()

Unnamed: 0,rid,sid,text,category,polarity,attribute,entity
0,79,79:0,Being a PC user my whole life....,UNKNOWN#UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
9,79,79:7,MACS ARE AMAZING!!!,UNKNOWN#UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
17,264,264:0,"As a lifelong Windows user, I was extremely pl...",UNKNOWN#UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
38,24,24:10,I did contact HP and share how unhappy I am.,UNKNOWN#UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
71,348,348:2,"Yes, I have it on the highest available setting.",UNKNOWN#UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN


In [67]:
# Preview the first five rows of the training frame.
trainDF.head()

Unnamed: 0,rid,sid,text,category,polarity,attribute,entity
0,79,79:0,Being a PC user my whole life....,UNKNOWN#UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
1,79,79:1,This computer is absolutely AMAZING!!!,LAPTOP#GENERAL,positive,GENERAL,LAPTOP
2,79,79:2,10 plus hours of battery...,BATTERY#OPERATION_PERFORMANCE,positive,OPERATION_PERFORMANCE,BATTERY
3,79,79:3,super fast processor and really nice graphics ...,CPU#OPERATION_PERFORMANCE,positive,OPERATION_PERFORMANCE,CPU
4,79,79:3,super fast processor and really nice graphics ...,GRAPHICS#GENERAL,positive,GENERAL,GRAPHICS


In [68]:
# Distinct category labels in the training frame, in order of first appearance.
trainDF.category.unique()

array(['UNKNOWN#UNKNOWN', 'LAPTOP#GENERAL',
       'BATTERY#OPERATION_PERFORMANCE', 'CPU#OPERATION_PERFORMANCE',
       'GRAPHICS#GENERAL', 'HARD_DISC#DESIGN_FEATURES',
       'LAPTOP#OPERATION_PERFORMANCE', 'LAPTOP#USABILITY',
       'LAPTOP#PORTABILITY', 'LAPTOP#PRICE', 'LAPTOP#QUALITY',
       'LAPTOP#DESIGN_FEATURES', 'LAPTOP#MISCELLANEOUS',
       'OS#DESIGN_FEATURES', 'BATTERY#QUALITY', 'SUPPORT#QUALITY',
       'COMPANY#GENERAL', 'DISPLAY#USABILITY', 'DISPLAY#DESIGN_FEATURES',
       'OS#GENERAL', 'LAPTOP#CONNECTIVITY', 'DISPLAY#QUALITY',
       'OS#USABILITY', 'MOUSE#DESIGN_FEATURES', 'SOFTWARE#MISCELLANEOUS',
       'KEYBOARD#DESIGN_FEATURES', 'KEYBOARD#GENERAL', 'SOFTWARE#GENERAL',
       'OPTICAL_DRIVES#QUALITY', 'GRAPHICS#QUALITY', 'HARD_DISC#QUALITY',
       'WARRANTY#GENERAL', 'MOUSE#QUALITY', 'MULTIMEDIA_DEVICES#USABILITY',
       'MULTIMEDIA_DEVICES#QUALITY', 'PORTS#QUALITY', 'DISPLAY#GENERAL',
       'POWER_SUPPLY#QUALITY', 'POWER_SUPPLY#OPERATION_PERFORMANCE',
       

In [70]:
# Number of training rows for each (category, polarity) combination.
trainDF.groupby(["category", "polarity"]).size()

category                        polarity
BATTERY#GENERAL                 negative      3
                                positive      1
BATTERY#MISCELLANEOUS           negative      4
BATTERY#OPERATION_PERFORMANCE   negative     25
                                neutral       3
                                positive     32
BATTERY#QUALITY                 negative      7
                                neutral       1
                                positive      1
COMPANY#GENERAL                 negative     38
                                neutral       3
                                positive     28
CPU#MISCELLANEOUS               negative      1
                                neutral       1
CPU#OPERATION_PERFORMANCE       negative      1
                                positive      6
CPU#QUALITY                     negative      1
                                positive      1
DISPLAY#DESIGN_FEATURES         negative      4
                                neutral       3

In [71]:
# Number of training rows for each (entity, polarity) combination.
trainDF.groupby(["entity", "polarity"]).size()

entity              polarity
BATTERY             negative     39
                    neutral       4
                    positive     34
COMPANY             negative     38
                    neutral       3
                    positive     28
CPU                 negative      3
                    neutral       1
                    positive      7
DISPLAY             negative     29
                    neutral       7
                    positive     42
FANS_COOLING        negative      3
                    positive      1
GRAPHICS            negative      4
                    neutral       2
                    positive     18
HARDWARE            negative      3
HARD_DISC           negative     12
                    positive      9
KEYBOARD            negative     27
                    neutral       2
                    positive     31
LAPTOP              negative    349
                    neutral      64
                    positive    818
MEMORY              negative      1

In [76]:
# Number of training rows for each (entity, attribute) combination.
trainDF.groupby(["entity", "attribute"]).size()

entity        attribute            
BATTERY       GENERAL                    4
              MISCELLANEOUS              4
              OPERATION_PERFORMANCE     60
              QUALITY                    9
COMPANY       GENERAL                   69
CPU           MISCELLANEOUS              2
              OPERATION_PERFORMANCE      7
              QUALITY                    2
DISPLAY       DESIGN_FEATURES           19
              GENERAL                   13
              MISCELLANEOUS              1
              OPERATION_PERFORMANCE     10
              QUALITY                   31
              USABILITY                  4
FANS_COOLING  DESIGN_FEATURES            1
              OPERATION_PERFORMANCE      1
              QUALITY                    2
GRAPHICS      DESIGN_FEATURES            2
              GENERAL                   16
              MISCELLANEOUS              2
              QUALITY                    4
HARDWARE      QUALITY                    3
HARD_DISC     DESI

In [77]:
# How many distinct category labels occur in the training frame
# (the 'UNKNOWN#UNKNOWN' placeholder counts as one of them).
len(trainDF.category.unique())

82

In [78]:
# How many distinct attribute labels occur in the training frame.
len(trainDF.attribute.unique())

10

In [74]:
# How many distinct entity labels occur in the training frame.
len(trainDF.entity.unique())

23

In [79]:
# Write the training frame to CSV in the working directory. With to_csv's
# defaults the row index is written as an unnamed first column.
trainDF.to_csv('trainDF_by_category.csv')

In [80]:
# Write the test frame to CSV in the working directory. With to_csv's
# defaults the row index is written as an unnamed first column.
testDF.to_csv('testDF_by_category.csv')

In [86]:
# Distinct entity labels in the training frame, in order of first appearance.
trainDF.entity.unique()

array(['UNKNOWN', 'LAPTOP', 'BATTERY', 'CPU', 'GRAPHICS', 'HARD_DISC',
       'OS', 'SUPPORT', 'COMPANY', 'DISPLAY', 'MOUSE', 'SOFTWARE',
       'KEYBOARD', 'OPTICAL_DRIVES', 'WARRANTY', 'MULTIMEDIA_DEVICES',
       'PORTS', 'POWER_SUPPLY', 'HARDWARE', 'SHIPPING', 'MEMORY',
       'MOTHERBOARD', 'FANS_COOLING'], dtype=object)

In [87]:
# Distinct attribute labels in the training frame, in order of first appearance.
trainDF.attribute.unique()

array(['UNKNOWN', 'GENERAL', 'OPERATION_PERFORMANCE', 'DESIGN_FEATURES',
       'USABILITY', 'PORTABILITY', 'PRICE', 'QUALITY', 'MISCELLANEOUS',
       'CONNECTIVITY'], dtype=object)

In [88]:
# Distinct polarity labels in the training frame, in order of first appearance.
trainDF.polarity.unique()

array(['UNKNOWN', 'positive', 'negative', 'neutral'], dtype=object)

In [118]:
# All training rows whose entity is FANS_COOLING.
# NOTE(review): the output saved with this cell is a Series named `text`,
# which a whole-frame filter would not produce -- the cell was probably
# edited after its last run (e.g. a trailing `.text` removed). Re-run to confirm.
trainDF[(trainDF.entity=='FANS_COOLING')]

1462    Fan vents to the side, so no cooling pad neede...
1803    The processor went on me, the fan went and the...
2204    It had a cooling system malfunction after 10 m...
2205    It had a cooling system malfunction after 10 m...
Name: text, dtype: object