In [19]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [15]:
ids = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/id.Dennis+Schwartz.txt", sep="[\r\n]+", header=None, names=["id"])
label_3class = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/label.3class.Dennis+Schwartz.txt", sep="[\r\n]+", names=["3class_label"])
label_4class = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/label.4class.Dennis+Schwartz.txt", sep="[\r\n]+",names=["4class_label"])
rating = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/rating.Dennis+Schwartz.txt", sep="[\r\n]+",names=["rating"])
subj = pd.read_csv("scale_data/scaledata/Dennis+Schwartz/subj.Dennis+Schwartz.txt", sep="[\r\n]+",names=["subj_extraction"])

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [17]:
dennis = pd.concat([ids,label_3class, label_4class, rating, subj], axis = 1)

In [18]:
dennis.head(10)

Unnamed: 0,id,3class_label,4class_label,rating,subj_extraction
0,29420,0,0,0.1,"in my opinion , a movie reviewer's most import..."
1,17219,0,0,0.2,"you can watch this movie , that is based on a ..."
2,18406,0,0,0.2,"this is asking a lot to believe , and though i..."
3,18648,0,0,0.2,no heroes and no story are the main attributes...
4,20021,0,0,0.2,"this is not an art movie , yet i saw it an art..."
5,20454,0,0,0.2,a satirical film where warren beatty tries to ...
6,20473,0,0,0.2,this sci-fi'er left a lot to be desired . it d...
7,20538,0,0,0.2,"this is a dull , unfunny film , lacking an edg..."
8,21002,0,0,0.2,instead he makes him an offer he can't very we...
9,21739,0,0,0.2,vibrant culture . great people . thinking of h...


In [20]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dennis, test_size=0.2)

In [22]:
train.head()

Unnamed: 0,id,3class_label,4class_label,rating,subj_extraction
139,28597,0,0,0.3,"the film is filled with violence , vulgar dial..."
702,26463,1,2,0.6,there are also some interesting casting choice...
498,26063,1,1,0.5,these misguided adventures are things a kid ne...
130,28306,0,0,0.3,"a good spoof on the church , but it is ruined ..."
614,17192,1,2,0.6,"keitel , if you stretch your gullability a bit..."


In [37]:
train["subj_extraction"][8]

'instead he makes him an offer he can\'t very well refuse , stay with me and i\'ll give you everything you want . it all turns out to be a tedious game to see who is in control : brains or brawn . the kid plays his part like he\'s a lump of coal and the sex is so uninviting , making it seem just as boring as watching paint dry on a wall . and like all the characters in this film , is not developed . there is no rhyme or reason for all this sleaze to be taking place , except for making the audience into voyeurs , like the invalid . what is quickly resolved , is that glover isn\'t queer , just manipulaive and mentally sick . that explanation of him doesn\'t ring true , anyway , as he always has a blank and uninteresting look on his face , and nothing he does throughout the film is taking him to anything but a dead-end . so it becomes hard to feel anything for him , but pity and contempt . the sadistic games are better delivered now with the presence of helen , but the menage a trois does

In [38]:
train.shape

(821, 5)

In [40]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()


In [42]:
tfidf = TfidfVectorizer(ngram_range=(1,3), tokenizer=tokenize, stop_words="english", strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)

In [43]:
train_vec = tfidf.fit_transform(train["subj_extraction"])
test_vec = tfidf.transform(test["subj_extraction"])

In [45]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [46]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r


In [None]:
preds = np.zeros(test.shape(0), 4))

for i, j in enumerate(["strongly neg","neg","pos","strongly pos"]):
    m,r = get_mdl(train["4class_label"])
    preds[:,i] = m.predict_proba(test_vec.multiply(r))[:,1]