In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import csv
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import pickle


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/diskane/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
nltk.download('stopwords')

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


stemmer = SnowballStemmer("english")


In [14]:
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

In [15]:
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
        stemSentence = stemSentence.strip()
        return stemSentence

In [16]:
metatsv = "dataset/movie.metadata.tsv"
plot_summaries = "dataset/plot_summaries.txt"

meta = pd.read_csv(metatsv, sep = '\t', header = None)
meta.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]
genres = meta[["movie_id","movie_name","genre"]]
plots = pd.read_csv(plot_summaries, sep = '\t', header = None)
plots.columns = ["movie_id", "plot"]
genres['movie_id'] = genres['movie_id'].astype(str)
plots['movie_id'] = plots['movie_id'].astype(str)
movies = pd.merge(plots, genres, on = 'movie_id')
genres_lists = []

for i in movies['genre']:
    genres_lists.append(list(json.loads(i).values()))

movies['genre'] = genres_lists
movies['plot'] = movies['plot'].apply(clean_text)
movies['plot'] = movies['plot'].apply(stemming)
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit_transform(movies['genre'])
 # transform target variable
y = multilabel_binarizer.transform(movies['genre'])

for idx, genre in enumerate(multilabel_binarizer.classes_):
    movies[genre] = y[:,idx]
    
moviesCSV='dataset/movies.csv'
movies.to_csv(moviesCSV)
movies_new = pd.read_csv(moviesCSV)
movies = movies_new
movies.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0.1,Unnamed: 0,movie_id,plot,movie_name,genre,Absurdism,Acid western,Action,Action Comedy,Action Thrillers,...,Werewolf fiction,Western,Whodunit,Women in prison films,Workplace Comedy,World History,World cinema,Wuxia,Z movie,Zombie Film
0,0,23890098,shlykov,Taxi Blues,"['Drama', 'World cinema']",0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,31186339,nation,The Hunger Games,"['Action/Adventure', 'Science Fiction', 'Actio...",0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,20663735,pooval,Narasimham,"['Musical', 'Action', 'Drama', 'Bollywood']",0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,2231378,lemon,The Lemon Drop Kid,"['Screwball comedy', 'Comedy']",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,595909,seventhday,A Cry in the Dark,"['Crime Fiction', 'Drama', 'Docudrama', 'World...",0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,17060199,clair,Expired,"['Romantic comedy', 'Indie', 'Comedy-drama', '...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,96,7481126,film,Classmates,"['Thriller', 'Mystery', 'Musical', 'Drama', 'C...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,97,9031450,young,Spring Bears Love,"['Romance Film', 'Comedy']",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,98,1520023,ninja,Ninja Resurrection,"['Horror', 'World cinema', 'Animation', 'Anime...",0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [18]:
train, test = train_test_split(movies, random_state=42, test_size=0.30, shuffle=True)
train_text = train['plot'].values.astype('U')
test_text = test['plot'].values.astype('U')

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', max_features = 10000)
vectorizer.fit(train_text)
vectorizer.fit(test_text)
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['movie_id', 'movie_name', 'plot', 'genre', 'Unnamed: 0'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['movie_id', 'movie_name', 'plot', 'genre', 'Unnamed: 0'], axis=1)

In [23]:
x_train

<29542x4698 sparse matrix of type '<class 'numpy.float64'>'
	with 21701 stored elements in Compressed Sparse Row format>

In [24]:
#Binary Relevance
br_classifier.fit(x_train, y_train)
br_predictions = br_classifier.predict(x_test)
print("Accuracy = ",accuracy_score(y_test,br_predictions.toarray()))
print("F1 score = ",F1_score(y_test,br_predictions, average="micro"))
print("Hamming loss = ",hamming_loss(y_test,br_predictions))

NameError: name 'br_classifier' is not defined

In [None]:
#Label Powerset
lp_classifier = LabelPowerset(LogisticRegression())
lp_classifier.fit(x_train, y_train)
lp_predictions = lp_classifier.predict(x_test)
print("Accuracy = ",accuracy_score(y_test,lp_predictions))
print("F1 score = ",f1_score(y_test,lp_predictions, average="micro"))
print("Hamming loss = ",hamming_loss(y_test,lp_predictions))



In [None]:
#MLkNN
ml_classifier = MLkNN(k=10)
# to prevent errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
ml_classifier.fit(x_train, y_train)
# predict
ml_predictions = ml_classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,ml_predictions))