In [140]:
import pandas as pd
import numpy as np

In [141]:
# Data source (Kaggle, nltkdata/movie-review):
# https://www.kaggle.com/datasets/nltkdata/movie-review
df = pd.read_csv(
    filepath_or_buffer='movie_review.csv',
    encoding='utf-8',
)
# Print (rows, columns), then preview the first five rows.
print(df.shape)
df.head(5)

(64720, 6)


Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [142]:
# Number of missing values in each column.
df.isnull().sum()

fold_id    0
cv_tag     0
html_id    0
sent_id    0
text       0
tag        0
dtype: int64

In [143]:
# Which distinct labels occur in the `tag` column?
print(df.loc[:, 'tag'].unique())

['pos' 'neg']


In [144]:
import re
# Characters treated as "suspicious": & # < > { } [ ] and the backslash.
RE_BAD_SYMBOLS = re.compile(r'[&#<>{}\[\]\\]')


def impurity(text, min_len=10):
    """Return the share of suspicious characters in ``text``.

    Args:
        text: The string to score. ``None`` and any other non-string value
            (for example a pandas ``NaN`` cell) score 0.
        min_len: Texts shorter than this many characters score 0.

    Returns:
        ``0`` for missing, empty or too-short input; otherwise the number of
        characters matched by ``RE_BAD_SYMBOLS`` divided by ``len(text)``.
    """
    # isinstance also covers None; an identity/type check avoids `== None`,
    # which can be overridden by the compared object.
    if not isinstance(text, str):
        return 0
    length = len(text)
    # The explicit zero check prevents a ZeroDivisionError when min_len <= 0.
    if length == 0 or length < min_len:
        return 0
    return len(RE_BAD_SYMBOLS.findall(text)) / length

# Score every raw sentence and keep the result next to the text.
df['impurity'] = df['text'].map(impurity)

In [145]:
# Ten rows with the highest impurity score, highest first.
df.sort_values('impurity', ascending=False).iloc[:10]

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,impurity
45017,3,cv381,21673,37,1 john 4 : 1 [kjv],neg,0.111111
56231,7,cv732,13092,83,&nbsp ; 3 . &nbsp ;,neg,0.105263
56224,7,cv732,13092,76,&nbsp ; 2 . &nbsp ;,neg,0.105263
28111,8,cv855,20661,8,"[katarina] does . """,pos,0.105263
4539,1,cv142,22516,29,proverbs 3 : 5 [kjv],pos,0.1
51191,5,cv575,22598,27,isaiah 43 : 25 [kjv],neg,0.1
15271,4,cv477,22479,30,matthew 19 : 5 [kjv],pos,0.1
8930,2,cv289,6463,2,-moderator],pos,0.090909
20753,6,cv637,1250,5,[expectations - medium],pos,0.086957
56215,7,cv732,13092,67,yes . &nbsp ;,neg,0.076923


In [146]:
# Show the score and the raw text of row 45017.
print('impurity : ', impurity(df.at[45017, 'text']))
print(df.at[45017, 'text'])

impurity :  0.1111111111111111
1 john 4 : 1 [kjv]


In [165]:
import html
def clean(text):
    """Normalise one raw review sentence into plain, space-separated words.

    The substitutions run in a fixed order, each on the output of the
    previous one: HTML entities and tags, bracketed/markdown fragments,
    free-standing runs of special characters, placeholder substitution
    (e-mail, URL, number, @user), punctuation removal, ``&`` -> ``and`` and
    whitespace collapsing.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The cleaned sentence with leading/trailing whitespace stripped.

    Notes:
        * The punctuation-removal step also deletes ``_``, so the
          placeholders appear in the result as ``EMAIL``, ``URL``,
          ``NUMBER`` and ``USER<name>``.
        * A free-standing ``&`` (surrounded by whitespace) is removed by the
          standalone-specials step; only an ``&`` attached to other
          characters reaches the ``and`` conversion.
    """
    # convert HTML escapes like &amp; to characters
    text = html.unescape(text)
    # tags like <table> and <a href> are not needed
    text = re.sub(r'<[^>]*>', '', text)
    # markdown links like [Some text](http://...): keep only the text
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # any remaining bracketed text like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # free-standing runs of specials: matches &# but not #cool
    # (the trailing group must be whitespace or end of string)
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # free-standing runs of hyphens, equals or plus signs like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text)
    # strip leading/trailing runs of -, [ or ]
    text = re.sub(r'^[\[\]-]+|[\[\]-]+$', '', text)
    # e-mail addresses written with a spaced-out dot (user@host . com);
    # the dot is escaped so that only a literal '.' matches
    text = re.sub(r'\w+@\w+ \. com', '_EMAIL_', text)
    # URLs become a placeholder
    text = re.sub(r'(?:http|ftp)s?://\S+', '_URL_', text)
    # runs of digits become a placeholder
    text = re.sub(r'\d+', '_NUMBER_', text)
    # @mentions become a placeholder followed by the name
    text = re.sub(r'@(\w+)', r'_USER_\1', text)
    # remove remaining punctuation (this includes the placeholder underscores)
    text = re.sub(r'[\[\]{}()#*+\\|`^~\-_=;:\'"/.,<>?]', '', text)
    # word&word is split into "word and word"
    text = re.sub(r'(\w)&(\w)', r'\1 and \2', text)
    # any other & becomes "and"
    text = re.sub(r'&', 'and', text)
    # collapse runs of whitespace again
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [166]:
# Clean every sentence, then re-score the cleaned text.
df['clean_text'] = df['text'].map(clean)
df['clean_impurity'] = df['clean_text'].map(impurity)
# Ten rows with the highest remaining impurity, highest first.
df.sort_values('clean_impurity', ascending=False).iloc[:10]

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,impurity,clean_text,clean_impurity,tokens
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos,0.0,films adapted from comic books have had plenty...,0.0,"[films, adapted, from, comic, books, have, had..."
43152,3,cv322,21820,17,thankfully there are a few factors that i part...,neg,0.0,thankfully there are a few factors that i part...,0.0,"[thankfully, there, are, a, few, factors, that..."
43139,3,cv322,21820,4,"while the stories are similar , the new versio...",neg,0.0,while the stories are similar the new version ...,0.0,"[while, the, stories, are, similar, the, new, ..."
43140,3,cv322,21820,5,ichabod crane ( johnny depp ) is now apparentl...,neg,0.0,ichabod crane johnny depp is now apparently a ...,0.0,"[ichabod, crane, johnny, depp, is, now, appare..."
43141,3,cv322,21820,6,"the victims are headless , and no traces of ev...",neg,0.0,the victims are headless and no traces of evid...,0.0,"[the, victims, are, headless, and, no, traces,..."
43142,3,cv322,21820,7,now with the help of christina ricci and an or...,neg,0.0,now with the help of christina ricci and an or...,0.0,"[now, with, the, help, of, christina, ricci, a..."
43143,3,cv322,21820,8,sure the headless horseman is an intriguing ch...,neg,0.0,sure the headless horseman is an intriguing ch...,0.0,"[sure, the, headless, horseman, is, an, intrig..."
43144,3,cv322,21820,9,"in the original , the headless horseman was me...",neg,0.0,in the original the headless horseman was mean...,0.0,"[in, the, original, the, headless, horseman, w..."
43145,3,cv322,21820,10,"no one could stop him , not even ichabod .",neg,0.0,no one could stop him not even ichabod,0.0,"[no, one, could, stop, him, not, even, ichabod]"
43146,3,cv322,21820,11,"by the end of the film , it had audiences in d...",neg,0.0,by the end of the film it had audiences in dee...,0.0,"[by, the, end, of, the, film, it, had, audienc..."


In [167]:
# Full cleaned text of the first row.
df.at[0, 'clean_text']

'films adapted from comic books have had plenty of success whether theyre about superheroes batman superman spawn or geared toward kids casper or the arthouse crowd ghost world but theres never really been a comic book like from hell before'

In [168]:
# Shape of the subset whose clean_impurity is still non-zero.
df.loc[df['clean_impurity'].ne(0)].shape

(0, 10)

In [169]:
# tokenize the clean_text
from nltk.tokenize import word_tokenize
# One list of word tokens per cleaned sentence.
df['tokens'] = df['clean_text'].map(word_tokenize)
# Preview the first ten rows, including the new column.
df.iloc[:10]

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,impurity,clean_text,clean_impurity,tokens
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos,0.0,films adapted from comic books have had plenty...,0.0,"[films, adapted, from, comic, books, have, had..."
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos,0.0,for starters it was created by alan moore and ...,0.0,"[for, starters, it, was, created, by, alan, mo..."
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos,0.0,to say moore and campbell thoroughly researche...,0.0,"[to, say, moore, and, campbell, thoroughly, re..."
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos,0.0,the book or graphic novel if you will is over ...,0.0,"[the, book, or, graphic, novel, if, you, will,..."
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos,0.0,in other words dont dismiss this film because ...,0.0,"[in, other, words, dont, dismiss, this, film, ..."
5,0,cv000,29590,5,if you can get past the whole comic book thing...,pos,0.0,if you can get past the whole comic book thing...,0.0,"[if, you, can, get, past, the, whole, comic, b..."
6,0,cv000,29590,6,getting the hughes brothers to direct this see...,pos,0.0,getting the hughes brothers to direct this see...,0.0,"[getting, the, hughes, brothers, to, direct, t..."
7,0,cv000,29590,7,"the ghetto in question is , of course , whitec...",pos,0.0,the ghetto in question is of course whitechape...,0.0,"[the, ghetto, in, question, is, of, course, wh..."
8,0,cv000,29590,8,"it's a filthy , sooty place where the whores (...",pos,0.0,its a filthy sooty place where the whores call...,0.0,"[its, a, filthy, sooty, place, where, the, who..."
9,0,cv000,29590,9,"when the first stiff turns up , copper peter g...",pos,0.0,when the first stiff turns up copper peter god...,0.0,"[when, the, first, stiff, turns, up, copper, p..."
