In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
ids = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/id.Dennis+Schwartz.txt", sep="[\r\n]+", header=None, names=["id"])
label_3class = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/label.3class.Dennis+Schwartz.txt", sep="[\r\n]+", names=["3class_label"])
label_4class = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/label.4class.Dennis+Schwartz.txt", sep="[\r\n]+",names=["4class_label"])
rating = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/rating.Dennis+Schwartz.txt", sep="[\r\n]+",names=["rating"])
subj = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/subj.Dennis+Schwartz.txt", sep="[\r\n]+",names=["subj_extraction"])

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [3]:
dennis = pd.concat([ids,label_3class, label_4class, rating, subj], axis = 1)

In [4]:
dennis.head(10)

Unnamed: 0,id,3class_label,4class_label,rating,subj_extraction
0,29420,0,0,0.1,"in my opinion , a movie reviewer's most import..."
1,17219,0,0,0.2,"you can watch this movie , that is based on a ..."
2,18406,0,0,0.2,"this is asking a lot to believe , and though i..."
3,18648,0,0,0.2,no heroes and no story are the main attributes...
4,20021,0,0,0.2,"this is not an art movie , yet i saw it an art..."
5,20454,0,0,0.2,a satirical film where warren beatty tries to ...
6,20473,0,0,0.2,this sci-fi'er left a lot to be desired . it d...
7,20538,0,0,0.2,"this is a dull , unfunny film , lacking an edg..."
8,21002,0,0,0.2,instead he makes him an offer he can't very we...
9,21739,0,0,0.2,vibrant culture . great people . thinking of h...


In [5]:
dennis["strongly neg"]=dennis.apply(lambda x:x["4class_label"]==0, axis=1)
dennis["neg"]=dennis.apply(lambda x:x["4class_label"]==1, axis=1)
dennis["pos"]=dennis.apply(lambda x:x["4class_label"]==2, axis=1)
dennis["strongly pos"]=dennis.apply(lambda x:x["4class_label"]==3, axis=1)

In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dennis, test_size=0.2)

In [7]:
train.head()

Unnamed: 0,id,3class_label,4class_label,rating,subj_extraction,strongly neg,neg,pos,strongly pos
946,21199,2,3,0.8,"the film critic , jonathan rosenbaum , has wri...",False,False,False,True
843,24082,2,2,0.7,everyone knows how bad it is to be called a ch...,False,False,True,False
85,24840,0,0,0.3,"a modern day comedy taking place in havana , c...",True,False,False,False
314,28308,0,1,0.4,this is an obscure bela lugosi film and it's n...,False,True,False,False
308,28013,0,1,0.4,"a sci-fi film , originally started as a studen...",False,True,False,False


In [8]:
train["subj_extraction"][8]

'instead he makes him an offer he can\'t very well refuse , stay with me and i\'ll give you everything you want . it all turns out to be a tedious game to see who is in control : brains or brawn . the kid plays his part like he\'s a lump of coal and the sex is so uninviting , making it seem just as boring as watching paint dry on a wall . and like all the characters in this film , is not developed . there is no rhyme or reason for all this sleaze to be taking place , except for making the audience into voyeurs , like the invalid . what is quickly resolved , is that glover isn\'t queer , just manipulaive and mentally sick . that explanation of him doesn\'t ring true , anyway , as he always has a blank and uninteresting look on his face , and nothing he does throughout the film is taking him to anything but a dead-end . so it becomes hard to feel anything for him , but pity and contempt . the sadistic games are better delivered now with the presence of helen , but the menage a trois does

In [9]:
train.shape

(821, 9)

In [10]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()


In [11]:
tfidf = TfidfVectorizer(ngram_range=(1,3), tokenizer=tokenize, stop_words="english", strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)

In [12]:
x = tfidf.fit_transform(train["subj_extraction"])
test_vec = tfidf.transform(test["subj_extraction"])

In [66]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [67]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [76]:
preds = np.zeros((test.shape[0], 4))

for i, j in enumerate(["strongly neg","neg","pos","strongly pos"]):
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_vec.multiply(r))[:,1]

In [77]:
preds

array([[ 0.10931476,  0.41027306,  0.28207832,  0.06892893],
       [ 0.13770649,  0.3932625 ,  0.32138913,  0.05511698],
       [ 0.10271261,  0.41561532,  0.30533183,  0.04377082],
       [ 0.08285673,  0.42706208,  0.33575238,  0.03985275],
       [ 0.09364587,  0.42968696,  0.2487385 ,  0.04073993],
       [ 0.13576145,  0.40772985,  0.34452329,  0.09776425],
       [ 0.14411228,  0.4134449 ,  0.26157067,  0.04708683],
       [ 0.0910279 ,  0.41251933,  0.29620693,  0.0384327 ],
       [ 0.08219978,  0.40684527,  0.26019244,  0.05302534],
       [ 0.11039718,  0.39571967,  0.29160962,  0.04770335],
       [ 0.09435631,  0.43722036,  0.32407648,  0.07167701],
       [ 0.08082152,  0.40783737,  0.30475508,  0.07144997],
       [ 0.05988391,  0.47655855,  0.18629143,  0.02654458],
       [ 0.14215829,  0.42006043,  0.29977753,  0.04892364],
       [ 0.08379988,  0.40572724,  0.34059374,  0.05765979],
       [ 0.06958399,  0.39160217,  0.36593131,  0.05228533],
       [ 0.08801907,  0.

In [79]:
test.head(100)

Unnamed: 0,id,3class_label,4class_label,rating,subj_extraction,strongly neg,neg,pos,strongly pos
431,22908,1,1,0.5,"a minor noir film , originally made for radio ...",False,True,False,False
890,28342,2,2,0.7,love's a bitch is an intricate film noir tale ...,False,False,True,False
505,26675,1,1,0.5,"inspired by a pushkin novel , which it conside...",False,True,False,False
847,24597,2,2,0.7,"taxman is a wonderfully played cynical , offbe...",False,False,True,False
542,27872,1,1,0.5,it plays more like a theatrical ghost story or...,False,True,False,False
927,18307,2,3,0.8,"this is a come to your own conclusion movie , ...",False,False,False,True
128,28199,0,0,0.3,a film that could have been an interesting stu...,True,False,False,False
511,26830,1,1,0.5,director-writer andrew l . stone keeps the ter...,False,True,False,False
670,23614,1,2,0.6,a more-than-acceptable variation of jacques to...,False,False,True,False
746,28301,1,2,0.6,a road film about trying to reach somewhere . ...,False,False,True,False
