In [2]:
from collections import defaultdict
from nltk.tokenize import WordPunctTokenizer        # splits all punctuations into separate tokens 
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Shared NLTK helpers, created once and reused by the functions in this cell.
word_punct_tokenizer = WordPunctTokenizer()   # punctuation is split off into separate tokens
wnl = WordNetLemmatizer()                     # WordNet-based lemmatizer

# Penn Treebank tag prefix -> WordNet POS code accepted by wnl.lemmatize.
# Adjective tags are JJ/JJR/JJS, so the prefix to test is 'J' (no Penn tag
# starts with 'A', which made the old adjective branch unreachable).
_PENN_PREFIX_TO_WORDNET = {'J': 'a', 'N': 'n', 'V': 'v'}


def _lemmatize_tagged(tokens):
    """Return the lemma of every token, in order, using its POS tag.

    Tokens tagged as adjective, noun or verb are lemmatized with that part of
    speech; every other token falls back to the lemmatizer's default.
    """
    # Materialise first so pos_tag always receives a real list (a Python 3
    # ``map`` object is a one-shot iterator).
    tokens = list(tokens)
    lemmas = []
    for word, tag in pos_tag(tokens):
        wordnet_pos = _PENN_PREFIX_TO_WORDNET.get(tag[:1].upper())
        if wordnet_pos is None:
            lemmas.append(wnl.lemmatize(word))
        else:
            lemmas.append(wnl.lemmatize(word, wordnet_pos))
    return lemmas


def bow_movie_nltk(root,start,end):
    """Build one bag-of-words per movie for ``root[start:end]``.

    Parameters
    ----------
    root : indexable XML element; ``root[i]`` is one movie.
    start, end : int
        Half-open range of movie indices to process.

    Returns
    -------
    list of defaultdict(float)
        One mapping ``lemma -> count`` per movie, in index order.
    """
    bow_per_movie = []

    for i in range(start, end):
        movie = root[i]
        # Child 0 is skipped; the text of the remaining children is
        # concatenated as-is. ``.text`` is None for an empty element, so
        # treat that as an empty string instead of crashing.
        text = "".join(movie[j].text or "" for j in range(1, len(movie)))

        tokens = [tok.lower() for tok in word_punct_tokenizer.tokenize(text)]

        bow = defaultdict(float)
        for lemma in _lemmatize_tagged(tokens):
            bow[lemma] += 1.0
        bow_per_movie.append(bow)
    return bow_per_movie

## lemmatizing positive-negative words

def pos_tagging_for_list(l):
    """Return the set of lemmas of the words in ``l``.

    Uses the same tagging and lemmatization as ``bow_movie_nltk`` so that
    lexicon entries and document tokens end up in the same form.
    """
    return set(_lemmatize_tagged(l))
    

In [3]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
import scipy
import numpy as np
import numpy as np

def train(train_x,train_y,test_x,max_depth=10,random_state=None):
    """Fit a decision-tree regressor on the training data and predict ``test_x``.

    Parameters
    ----------
    train_x, train_y : training features and targets passed to ``fit``.
    test_x : features to predict on.
    max_depth : int, optional
        Maximum tree depth (default 10, the previously hard-coded value).
    random_state : int or None, optional
        Seed forwarded to the estimator; pass an int for repeatable results.

    Returns
    -------
    The predictions returned by ``DecisionTreeRegressor.predict``.
    """
    clf = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    clf.fit(train_x, train_y)
    return clf.predict(test_x)

def cal_mae(y_hat,y):
    """Return the mean absolute error between ``y_hat`` and ``y``.

    Both arguments may be any array-like (list, tuple, ndarray). They are
    converted to float arrays first so that plain lists work and so that
    unsigned-integer inputs cannot wrap around during the subtraction.
    """
    y_hat = np.asarray(y_hat, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.mean(np.abs(y_hat - y))

# Collect the target variable (revenue) of every movie in root[start:end].
def true_rev(start,end,root):
    """Return the ``yvalue`` attribute of the first child of each movie as floats.

    Parameters
    ----------
    start, end : int
        Half-open range of movie indices to read.
    root : indexable XML element; ``root[i][0].attrib['yvalue']`` holds the target.

    Returns
    -------
    numpy.ndarray of float64 with ``end - start`` entries.
    """
    rev = [root[i][0].attrib['yvalue'] for i in range(start, end)]
    # The builtin float is used as dtype: the ``np.float`` alias no longer
    # exists in current NumPy releases.
    return np.array(rev, dtype=float)


def mpl(train_x,train_y,test_x,hidden_layer_sizes=(300,300,300,300,300),alpha=0.01,random_state=None):
    """Fit a multi-layer perceptron regressor and predict ``test_x``.

    Parameters
    ----------
    train_x, train_y : training features and targets passed to ``fit``.
    test_x : features to predict on.
    hidden_layer_sizes : tuple of int, optional
        Width of each hidden layer (default: five layers of 300 units, the
        previously hard-coded architecture).
    alpha : float, optional
        Regularization strength forwarded to the estimator (default 0.01).
    random_state : int or None, optional
        Seed forwarded to the estimator; pass an int for repeatable results.

    Returns
    -------
    The predictions returned by ``MLPRegressor.predict``.
    """
    clf = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes,
                       alpha=alpha,
                       random_state=random_state)
    clf.fit(train_x, train_y)
    return clf.predict(test_x)


In [14]:
import os
import xml.etree.ElementTree as ET

# Build the path from its components so it resolves on every OS; the former
# backslash-separated literal only worked on Windows.
traindev_path = os.path.join('dataset', 'movies-data-v1.0', 'movies-data-v1.0',
                             '7domains-train-dev.tl.xml')
tree = ET.parse(traindev_path)
root_traindev_to = tree.getroot()

In [None]:
# Movies [0, 1147) form the training split, movies [1147, 1464) the dev split.
bow_per_movie_train = bow_movie_nltk(root=root_traindev_to, start=0, end=1147)
bow_per_movie_dev = bow_movie_nltk(root=root_traindev_to, start=1147, end=1464)

In [24]:
## uploading the positive/ negative words
def _read_word_list(path):
    """Return the entries of a one-word-per-line lexicon file.

    Surrounding whitespace (including '\\r' from Windows line endings) is
    stripped, and blank lines are dropped so they cannot become features.
    """
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        # NOTE(review): lines starting with ';' are skipped on the assumption
        # that they are header comments, as in the opinion-lexicon files these
        # names suggest; confirm against the actual files.
        return [word for word in stripped if word and not word.startswith(';')]


pos_list = _read_word_list('positive-words.txt')
neg_list = _read_word_list('negative-words.txt')

## lemmatizing it
positive = pos_tagging_for_list(pos_list)
negative = pos_tagging_for_list(neg_list)

In [78]:
## create the feature vector based on the absence and presence of pos-neg words

# One row per training movie: a 1/0 flag for every positive lemma followed by
# a -1/0 flag for every negative lemma. Membership is tested on the dict
# itself (``x in bow``) rather than on ``bow.keys()``, which is a linear list
# scan on Python 2. ``positive`` and ``negative`` are iterated in their
# natural order so the columns line up with the dev features.
feature = [
    [1 if pos in bow else 0 for pos in positive]
    + [-1 if neg in bow else 0 for neg in negative]
    for bow in bow_per_movie_train
]

In [79]:
# One row per dev movie: a 1/0 flag for every positive lemma followed by a
# -1/0 flag for every negative lemma. Membership is tested on the dict itself
# (``x in bow``) rather than on ``bow.keys()``, which is a linear list scan on
# Python 2. ``positive`` and ``negative`` are iterated in their natural order
# so the columns line up with the training features.
feature_dev = [
    [1 if pos in bow else 0 for pos in positive]
    + [-1 if neg in bow else 0 for neg in negative]
    for bow in bow_per_movie_dev
]

In [152]:
import pickle  # imported here: in file order no earlier cell provides it

# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by this project.
# NOTE(review): the files are still opened in text mode ('r') because the code
# that wrote them is not visible here and the mode must match the writer;
# under Python 3 pickle requires 'rb'.
with open('train_y_to.txt', 'r') as f:
    train_y = pickle.load(f)

with open('dev_y_to.txt', 'r') as f:
    test_y = pickle.load(f)

In [1]:
# Fit the MLP on the training features and predict the dev set.
y_hat = mpl(feature, train_y, feature_dev)
# %-formatting keeps this line valid both as a Python 2 print statement and as
# a Python 3 print call, with the same output as before (the double space
# reproduces the separator the old statement inserted).
# Arguments follow cal_mae's (y_hat, y) order; the metric is symmetric.
print("MAE is  %s" % (cal_mae(y_hat, test_y),))

NameError: name 'mpl' is not defined

In [113]:
import pickle

# Persist the training feature matrix. The with-block guarantees the file is
# flushed and closed; binary mode is what pickle expects for writing.
with open('train_pos_feat.txt', 'wb') as f:
    pickle.dump(feature, f)

In [114]:
# Persist the dev feature matrix. The with-block guarantees the file is
# flushed and closed (previously the handle stayed open, so buffered data
# might never reach disk); binary mode is what pickle expects for writing.
with open('dev_pos_feat.txt', 'wb') as f:
    pickle.dump(feature_dev, f)