In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
import string

In [2]:
train_data = pd.read_csv('data_train.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,id,plot_synopsis,tags
0,0,tt0057603,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric"
1,1,tt1733125,"Two thousand years ago, Nhagruul the Foul, a s...",violence
2,3,tt0113862,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good"
3,6,tt0249380,Baise-moi tells the story of Nadine and Manu w...,"gothic, cruelty, violence, cult, revenge, sadist"
4,7,tt0408790,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback"


In [3]:
final_test_data = pd.read_csv('data_test_all.csv')
final_test_data.head()

Unnamed: 0.1,Unnamed: 0,id,plot_synopsis
0,2,tt0033045,"Matuschek's, a gift store in Budapest, is the ..."
1,4,tt0086250,"In May 1980, a Cuban man named Tony Montana (A..."
2,5,tt1315981,George Falconer (Colin Firth) approaches a car...
3,15,tt1937113,Hours after the end of the previous game and t...
4,18,tt1619029,The film begins with a close-up of Jackie Kenn...


In [4]:
train_data['splitted_tags'] = train_data.tags.str.split(', ')
genres = pd.get_dummies(train_data['splitted_tags'].apply(pd.Series).stack(), prefix='is').sum(level=0)
genres.head()

Unnamed: 0,is_absurd,is_action,is_adult comedy,is_allegory,is_alternate history,is_alternate reality,is_anti war,is_atmospheric,is_autobiographical,is_avant garde,...,is_sentimental,is_storytelling,is_stupid,is_suicidal,is_suspenseful,is_thought-provoking,is_tragedy,is_violence,is_western,is_whimsical
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
train_processed = pd.DataFrame()
train_processed['is_murder'] = genres.is_murder
train_processed['is_romantic'] = genres.is_romantic
train_processed['is_comedy'] = genres.is_comedy
train_processed['is_fantasy'] = genres.is_fantasy
train_processed['is_flashback'] = genres.is_flashback

train_processed['plot_synopsis'] = train_data.plot_synopsis
train_processed['id'] = train_data.id

train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id
0,1,0,0,0,0,Note: this synopsis is for the orginal Italian...,tt0057603
1,0,0,0,0,0,"Two thousand years ago, Nhagruul the Foul, a s...",tt1733125
2,0,1,0,0,0,"Glenn Holland, not a morning person by anyone'...",tt0113862
3,0,0,0,0,0,Baise-moi tells the story of Nadine and Manu w...,tt0249380
4,1,0,0,0,1,Kyle Pratt (Jodie Foster) is a propulsion engi...,tt0408790


In [6]:
train_processed['plot_synopsis'] = train_processed.plot_synopsis.str.lower()
train_processed['plot_synopsis'] = train_processed.plot_synopsis.str.translate(str.maketrans('', '', string.punctuation))
train_processed['plot_synopsis'] = train_processed.plot_synopsis.str.split()

train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id
0,1,0,0,0,0,"[note, this, synopsis, is, for, the, orginal, ...",tt0057603
1,0,0,0,0,0,"[two, thousand, years, ago, nhagruul, the, fou...",tt1733125
2,0,1,0,0,0,"[glenn, holland, not, a, morning, person, by, ...",tt0113862
3,0,0,0,0,0,"[baisemoi, tells, the, story, of, nadine, and,...",tt0249380
4,1,0,0,0,1,"[kyle, pratt, jodie, foster, is, a, propulsion...",tt0408790


In [7]:
test_processed = pd.DataFrame()
test_processed['plot_synopsis'] = final_test_data.plot_synopsis
test_processed['id'] = final_test_data.id

test_processed.head()

Unnamed: 0,plot_synopsis,id
0,"Matuschek's, a gift store in Budapest, is the ...",tt0033045
1,"In May 1980, a Cuban man named Tony Montana (A...",tt0086250
2,George Falconer (Colin Firth) approaches a car...,tt1315981
3,Hours after the end of the previous game and t...,tt1937113
4,The film begins with a close-up of Jackie Kenn...,tt1619029


In [8]:
test_processed['plot_synopsis'] = test_processed.plot_synopsis.str.lower()
test_processed['plot_synopsis'] = test_processed.plot_synopsis.str.translate(str.maketrans('', '', string.punctuation))
test_processed['plot_synopsis'] = test_processed.plot_synopsis.str.split()

test_processed.head()

Unnamed: 0,plot_synopsis,id
0,"[matuscheks, a, gift, store, in, budapest, is,...",tt0033045
1,"[in, may, 1980, a, cuban, man, named, tony, mo...",tt0086250
2,"[george, falconer, colin, firth, approaches, a...",tt1315981
3,"[hours, after, the, end, of, the, previous, ga...",tt1937113
4,"[the, film, begins, with, a, closeup, of, jack...",tt1619029


In [9]:
eng_stops = nltk.corpus.stopwords.words('english')

In [10]:
train_processed['plot_synopsis'] = train_processed['plot_synopsis'].apply(lambda l: [word for word in l if word not in eng_stops])
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id
0,1,0,0,0,0,"[note, synopsis, orginal, italian, release, se...",tt0057603
1,0,0,0,0,0,"[two, thousand, years, ago, nhagruul, foul, so...",tt1733125
2,0,1,0,0,0,"[glenn, holland, morning, person, anyones, sta...",tt0113862
3,0,0,0,0,0,"[baisemoi, tells, story, nadine, manu, go, vio...",tt0249380
4,1,0,0,0,1,"[kyle, pratt, jodie, foster, propulsion, engin...",tt0408790


In [11]:
test_processed['plot_synopsis'] = test_processed['plot_synopsis'].apply(lambda l: [word for word in l if word not in eng_stops])
test_processed.head()

Unnamed: 0,plot_synopsis,id
0,"[matuscheks, gift, store, budapest, workplace,...",tt0033045
1,"[may, 1980, cuban, man, named, tony, montana, ...",tt0086250
2,"[george, falconer, colin, firth, approaches, c...",tt1315981
3,"[hours, end, previous, game, death, traitorous...",tt1937113
4,"[film, begins, closeup, jackie, kennedy, natal...",tt1619029


In [12]:
stemmer = nltk.stem.snowball.SnowballStemmer("english")

In [13]:
train_processed['plot_synopsis'] = train_processed['plot_synopsis'].apply(lambda l: [stemmer.stem(word) for word in l])
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790


In [14]:
test_processed['plot_synopsis'] = test_processed['plot_synopsis'].apply(lambda l: [stemmer.stem(word) for word in l])
test_processed.head()

Unnamed: 0,plot_synopsis,id
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029


In [15]:
lemmatizer = nltk.stem.WordNetLemmatizer()

In [16]:
train_processed['plot_synopsis'] = train_processed['plot_synopsis'].apply(lambda l: [lemmatizer.lemmatize(word) for word in l])
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790


In [17]:
test_processed['plot_synopsis'] = test_processed['plot_synopsis'].apply(lambda l: [lemmatizer.lemmatize(word) for word in l])
test_processed.head()

Unnamed: 0,plot_synopsis,id
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029


In [18]:
from nltk import pos_tag

In [19]:
def verbs(l):
    tag = ['VB', 'VBD', 'VBG', 'VBN']
    tagged_list = pos_tag(l)
    #print(tagged_list)
    tagged_words = []
    for word, pos in tagged_list:
        if pos in tag:
            tagged_words.append(word)
    return ' '.join(tagged_words)


train_processed['verbs'] = train_processed['plot_synopsis'].apply(verbs)
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id,verbs
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603,known beset alfonsi estrang mari frank frank p...
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125,dismay sold surviv blood rose knight gave knig...
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862,taken find timeconsum princip wrote helen got ...
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380,feel get manu detach act detach happen get tel...
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790,marlen design find begin seen hurt crew crew f...


In [20]:
test_processed['verbs'] = test_processed['plot_synopsis'].apply(verbs)
test_processed.head()

Unnamed: 0,plot_synopsis,id,verbs
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045,sent meet frank best lost find chosen hed hurt...
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250,left steven kill kill hit set crew met set ang...
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981,snowwhit kiss ring told feel tell given given ...
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113,forc fled safetymeanwhil led william take figh...
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029,begin told arriv impress broadcasterw go done ...


In [21]:
def nouns(l):
    tag = ['NN', 'NNP', 'NNS']
    tagged_list = pos_tag(l)
    #print(tagged_list)
    tagged_words = []
    for word, pos in tagged_list:
        if pos in tag:
            tagged_words.append(word)
    return ' '.join(tagged_words)


train_processed['nouns'] = train_processed['plot_synopsis'].apply(nouns)
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id,verbs,nouns
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603,known beset alfonsi estrang mari frank frank p...,note synopsi orgin releas segment karloff intr...
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125,dismay sold surviv blood rose knight gave knig...,year foul sorcer revel innoc spread despair da...
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862,taken find timeconsum princip wrote helen got ...,glenn person standard wife iri septemb morn gl...
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380,feel get manu detach act detach happen get tel...,baisemoi tell nadin manu spree societi margin ...
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790,marlen design find begin seen hurt crew crew f...,pratt jodi foster base berlin germani husband ...


In [22]:
test_processed['nouns'] = test_processed['plot_synopsis'].apply(nouns)
test_processed.head()

Unnamed: 0,plot_synopsis,id,verbs,nouns
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045,sent meet frank best lost find chosen hed hurt...,matuschek gift store workplac alfr kralik jame...
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250,left steven kill kill hit set crew met set ang...,cuban man name montana al pacino claim asylum ...
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981,snowwhit kiss ring told feel tell given given ...,falcon colin firth approach car accid middl bl...
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113,forc fled safetymeanwhil led william take figh...,hour end game death traitor shepard remnant ta...
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029,begin told arriv impress broadcasterw go done ...,film closeup jacki kennedi natali portman port...


In [23]:
def adjectives(l):
    tag = ['JJ', 'JJR', 'JJS']
    tagged_list = pos_tag(l)
    #print(tagged_list)
    tagged_words = []
    for word, pos in tagged_list:
        if pos in tag:
            tagged_words.append(word)
    return ' '.join(tagged_words)


train_processed['adjectives'] = train_processed['plot_synopsis'].apply(adjectives)
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id,verbs,nouns,adjectives
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603,known beset alfonsi estrang mari frank frank p...,note synopsi orgin releas segment karloff intr...,italian certain orderbori telephonerosi attrac...
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125,dismay sold surviv blood rose knight gave knig...,year foul sorcer revel innoc spread despair da...,nhagruul end mortal consum excruci ritual beca...
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862,taken find timeconsum princip wrote helen got ...,glenn person standard wife iri septemb morn gl...,morn anyon bright newli high musician free fir...
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380,feel get manu detach act detach happen get tel...,baisemoi tell nadin manu spree societi margin ...,violent parttim small southern friend troubl g...
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790,marlen design find begin seen hurt crew crew f...,pratt jodi foster base berlin germani husband ...,kyle engin build yearold lawston buri kyle kyl...


In [24]:
test_processed['adjectives'] = test_processed['plot_synopsis'].apply(adjectives)
test_processed.head()

Unnamed: 0,plot_synopsis,id,verbs,nouns,adjectives
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045,sent meet frank best lost find chosen hed hurt...,matuschek gift store workplac alfr kralik jame...,budapest stewart novak margaret constant secre...
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250,left steven kill kill hit set crew met set ang...,cuban man name montana al pacino claim asylum ...,toni usa american offici notic arm black ident...
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981,snowwhit kiss ring told feel tell given given ...,falcon colin firth approach car accid middl bl...,georg sceneri wake good fate fatal homophobia ...
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113,forc fled safetymeanwhil led william take figh...,hour end game death traitor shepard remnant ta...,previous general john mactavish afghanistan sa...
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029,begin told arriv impress broadcasterw go done ...,film closeup jacki kennedi natali portman port...,hyanni novemb temporarili live sorri realli ho...


In [25]:
def numeral(l):
    tag = ['CD']
    tagged_list = pos_tag(l)
    #print(tagged_list)
    tagged_words = []
    for word, pos in tagged_list:
        if pos in tag:
            tagged_words.append(word)
    return ' '.join(tagged_words)


train_processed['numeral'] = train_processed['plot_synopsis'].apply(numeral)
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id,verbs,nouns,adjectives,numeral
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603,known beset alfonsi estrang mari frank frank p...,note synopsi orgin releas segment karloff intr...,italian certain orderbori telephonerosi attrac...,three three two one one one two 19th one one f...
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125,dismay sold surviv blood rose knight gave knig...,year foul sorcer revel innoc spread despair da...,nhagruul end mortal consum excruci ritual beca...,two thousand hundr three
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862,taken find timeconsum princip wrote helen got ...,glenn person standard wife iri septemb morn gl...,morn anyon bright newli high musician free fir...,one 1964 four one 1965 one 60 70 three one 197...
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380,feel get manu detach act detach happen get tel...,baisemoi tell nadin manu spree societi margin ...,violent parttim small southern friend troubl g...,one three two one
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790,marlen design find begin seen hurt crew crew f...,pratt jodi foster base berlin germani husband ...,kyle engin build yearold lawston buri kyle kyl...,six 474 one one two one 50000000 one


In [26]:
test_processed['numeral'] = test_processed['plot_synopsis'].apply(numeral)
test_processed.head()

Unnamed: 0,plot_synopsis,id,verbs,nouns,adjectives,numeral
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045,sent meet frank best lost find chosen hed hurt...,matuschek gift store workplac alfr kralik jame...,budapest stewart novak margaret constant secre...,one 1928
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250,left steven kill kill hit set crew met set ang...,cuban man name montana al pacino claim asylum ...,toni usa american offici notic arm black ident...,1980 1980 three 30 500 1000 two 25000 5000 two...
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981,snowwhit kiss ring told feel tell given given ...,falcon colin firth approach car accid middl bl...,georg sceneri wake good fate fatal homophobia ...,16 30 1962 one one 1946
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113,forc fled safetymeanwhil led william take figh...,hour end game death traitor shepard remnant ta...,previous general john mactavish afghanistan sa...,141 one two one 911 one thousand zone zakhaev ...
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029,begin told arriv impress broadcasterw go done ...,film closeup jacki kennedi natali portman port...,hyanni novemb temporarili live sorri realli ho...,1963 one 1962 56 million 60 one one one three ...


In [27]:
train_processed['plot'] = train_processed['plot_synopsis'].apply(lambda l: ' '.join(l))
train_processed.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback,plot_synopsis,id,verbs,nouns,adjectives,numeral,plot
0,1,0,0,0,0,"[note, synopsi, orgin, italian, releas, segmen...",tt0057603,known beset alfonsi estrang mari frank frank p...,note synopsi orgin releas segment karloff intr...,italian certain orderbori telephonerosi attrac...,three three two one one one two 19th one one f...,note synopsi orgin italian releas segment cert...
1,0,0,0,0,0,"[two, thousand, year, ago, nhagruul, foul, sor...",tt1733125,dismay sold surviv blood rose knight gave knig...,year foul sorcer revel innoc spread despair da...,nhagruul end mortal consum excruci ritual beca...,two thousand hundr three,two thousand year ago nhagruul foul sorcer rev...
2,0,1,0,0,0,"[glenn, holland, morn, person, anyon, standard...",tt0113862,taken find timeconsum princip wrote helen got ...,glenn person standard wife iri septemb morn gl...,morn anyon bright newli high musician free fir...,one 1964 four one 1965 one 60 70 three one 197...,glenn holland morn person anyon standard woken...
3,0,0,0,0,0,"[baisemoi, tell, stori, nadin, manu, go, viole...",tt0249380,feel get manu detach act detach happen get tel...,baisemoi tell nadin manu spree societi margin ...,violent parttim small southern friend troubl g...,one three two one,baisemoi tell stori nadin manu go violent spre...
4,1,0,0,0,1,"[kyle, pratt, jodi, foster, propuls, engin, ba...",tt0408790,marlen design find begin seen hurt crew crew f...,pratt jodi foster base berlin germani husband ...,kyle engin build yearold lawston buri kyle kyl...,six 474 one one two one 50000000 one,kyle pratt jodi foster propuls engin base berl...


In [28]:
test_processed['plot'] = test_processed['plot_synopsis'].apply(lambda l: ' '.join(l))
test_processed.head()

Unnamed: 0,plot_synopsis,id,verbs,nouns,adjectives,numeral,plot
0,"[matuschek, gift, store, budapest, workplac, a...",tt0033045,sent meet frank best lost find chosen hed hurt...,matuschek gift store workplac alfr kralik jame...,budapest stewart novak margaret constant secre...,one 1928,matuschek gift store budapest workplac alfr kr...
1,"[may, 1980, cuban, man, name, toni, montana, a...",tt0086250,left steven kill kill hit set crew met set ang...,cuban man name montana al pacino claim asylum ...,toni usa american offici notic arm black ident...,1980 1980 three 30 500 1000 two 25000 5000 two...,may 1980 cuban man name toni montana al pacino...
2,"[georg, falcon, colin, firth, approach, car, a...",tt1315981,snowwhit kiss ring told feel tell given given ...,falcon colin firth approach car accid middl bl...,georg sceneri wake good fate fatal homophobia ...,16 30 1962 one one 1946,georg falcon colin firth approach car accid mi...
3,"[hour, end, previous, game, death, traitor, ge...",tt1937113,forc fled safetymeanwhil led william take figh...,hour end game death traitor shepard remnant ta...,previous general john mactavish afghanistan sa...,141 one two one 911 one thousand zone zakhaev ...,hour end previous game death traitor general s...
4,"[film, begin, closeup, jacki, kennedi, natali,...",tt1619029,begin told arriv impress broadcasterw go done ...,film closeup jacki kennedi natali portman port...,hyanni novemb temporarili live sorri realli ho...,1963 one 1962 56 million 60 one one one three ...,film begin closeup jacki kennedi natali portma...


In [29]:
from nltk.tokenize import RegexpTokenizer

In [30]:
def tokenizer(s):
    t = RegexpTokenizer(r'\w+')
    return t.tokenize(s)

In [31]:
train_feat = train_processed.copy()
train_feat = train_feat.drop(['is_murder','is_romantic','is_comedy','is_fantasy', 'is_flashback', 'id', 'plot_synopsis'],axis=1)

train_feat.head()

Unnamed: 0,verbs,nouns,adjectives,numeral,plot
0,known beset alfonsi estrang mari frank frank p...,note synopsi orgin releas segment karloff intr...,italian certain orderbori telephonerosi attrac...,three three two one one one two 19th one one f...,note synopsi orgin italian releas segment cert...
1,dismay sold surviv blood rose knight gave knig...,year foul sorcer revel innoc spread despair da...,nhagruul end mortal consum excruci ritual beca...,two thousand hundr three,two thousand year ago nhagruul foul sorcer rev...
2,taken find timeconsum princip wrote helen got ...,glenn person standard wife iri septemb morn gl...,morn anyon bright newli high musician free fir...,one 1964 four one 1965 one 60 70 three one 197...,glenn holland morn person anyon standard woken...
3,feel get manu detach act detach happen get tel...,baisemoi tell nadin manu spree societi margin ...,violent parttim small southern friend troubl g...,one three two one,baisemoi tell stori nadin manu go violent spre...
4,marlen design find begin seen hurt crew crew f...,pratt jodi foster base berlin germani husband ...,kyle engin build yearold lawston buri kyle kyl...,six 474 one one two one 50000000 one,kyle pratt jodi foster propuls engin base berl...


In [32]:
train_val = train_processed.copy()
train_val = train_val.drop(['plot_synopsis', 'id', 'verbs', 'nouns', 'adjectives', 'numeral', 'plot'],axis=1)
train_val.head()

Unnamed: 0,is_murder,is_romantic,is_comedy,is_fantasy,is_flashback
0,1,0,0,0,0
1,0,0,0,0,0
2,0,1,0,0,0
3,0,0,0,0,0
4,1,0,0,0,1


In [33]:
train_id = train_processed['id']
train_id.head()

0    tt0057603
1    tt1733125
2    tt0113862
3    tt0249380
4    tt0408790
Name: id, dtype: object

In [34]:
test_feat = test_processed.copy()
test_feat = test_feat.drop(['id', 'plot_synopsis'],axis=1)
test_feat.head()

Unnamed: 0,verbs,nouns,adjectives,numeral,plot
0,sent meet frank best lost find chosen hed hurt...,matuschek gift store workplac alfr kralik jame...,budapest stewart novak margaret constant secre...,one 1928,matuschek gift store budapest workplac alfr kr...
1,left steven kill kill hit set crew met set ang...,cuban man name montana al pacino claim asylum ...,toni usa american offici notic arm black ident...,1980 1980 three 30 500 1000 two 25000 5000 two...,may 1980 cuban man name toni montana al pacino...
2,snowwhit kiss ring told feel tell given given ...,falcon colin firth approach car accid middl bl...,georg sceneri wake good fate fatal homophobia ...,16 30 1962 one one 1946,georg falcon colin firth approach car accid mi...
3,forc fled safetymeanwhil led william take figh...,hour end game death traitor shepard remnant ta...,previous general john mactavish afghanistan sa...,141 one two one 911 one thousand zone zakhaev ...,hour end previous game death traitor general s...
4,begin told arriv impress broadcasterw go done ...,film closeup jacki kennedi natali portman port...,hyanni novemb temporarili live sorri realli ho...,1963 one 1962 56 million 60 one one one three ...,film begin closeup jacki kennedi natali portma...


In [35]:
test_id = test_processed['id']
test_id.head()

0    tt0033045
1    tt0086250
2    tt1315981
3    tt1937113
4    tt1619029
Name: id, dtype: object

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
'''

train_feat - all train features
train_val - all train genres (validation)
train_id - train id

X_train - 0.7 train features
X_test - 0.3 train features for estimation 
y_train - 0.7 train valid
y_test - 0.3 train valid for estimation

test_feat - final test features
test_id - final test ids

'''

'\n\ntrain_feat - all train features\ntrain_val - all train genres (validation)\ntrain_id - train id\n\nX_train - 0.7 train features\nX_test - 0.3 train features for estimation \ny_train - 0.7 train valid\ny_test - 0.3 train valid for estimation\n\ntest_feat - final test features\ntest_id - final test ids\n\n'

In [38]:
X_train, X_test, y_train_y, y_test_y = train_test_split(train_feat, train_val, test_size=0.3)

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [40]:
cv = CountVectorizer(
    max_features=20000,
    tokenizer=tokenizer
)

cv_ngram_binary = CountVectorizer(
    ngram_range=[1, 2],
    max_features=40000,
    tokenizer=tokenizer,
    binary=True
)

tfidf = TfidfVectorizer(
    max_features=30000
)

tfidf_maxdf = TfidfVectorizer(
    max_df=0.8,
    max_features=10000
)


tfidf_bigr = TfidfVectorizer(
    max_df=0.8,
    max_features=30000,
    tokenizer=tokenizer,
    ngram_range=[1, 2]
)

In [41]:
X_train_vector = {}
X_test_vector = {}
X_train_all_global = {}
X_test_all_vector = {}

In [42]:
for feature in ['plot', 'verbs', 'nouns', 'adjectives', 'numeral']:
    
    print(feature + ' for cv vectorizer')
    cv.fit(X_train[feature])
    X_train_vector[f'cv_{feature}'] = cv.transform(X_train[feature])
    X_test_vector[f'cv_{feature}'] = cv.transform(X_test[feature])
    X_train_all_global[f'cv_{feature}'] = cv.transform(train_feat[feature])
    X_test_all_vector[f'cv_{feature}'] = cv.transform(test_feat[feature])
    
    print(feature + ' for cv_ngram_binary vectorizer')
    cv_ngram_binary.fit(X_train[feature])
    X_train_vector[f'cv_ngram_binary_{feature}'] = cv_ngram_binary.transform(X_train[feature])
    X_test_vector[f'cv_ngram_binary_{feature}'] = cv_ngram_binary.transform(X_test[feature])
    X_train_all_global[f'cv_ngram_binary_{feature}'] = cv_ngram_binary.transform(train_feat[feature])
    X_test_all_vector[f'cv_ngram_binary_{feature}'] = cv_ngram_binary.transform(test_feat[feature])

    print(feature + ' for tfidf vectorizer')
    tfidf.fit(X_train[feature])
    X_train_vector[f'tfidf_{feature}'] = tfidf.transform(X_train[feature])
    X_test_vector[f'tfidf_{feature}'] = tfidf.transform(X_test[feature])
    X_train_all_global[f'tfidf_{feature}'] = tfidf.transform(train_feat[feature])
    X_test_all_vector[f'tfidf_{feature}'] = tfidf.transform(test_feat[feature])
    
    print(feature + ' for tfidf_maxdf vectorizer')
    tfidf_maxdf.fit(X_train[feature])
    X_train_vector[f'tfidf_maxdf_{feature}'] = tfidf_maxdf.transform(X_train[feature])
    X_test_vector[f'tfidf_maxdf_{feature}'] = tfidf_maxdf.transform(X_test[feature])
    X_train_all_global[f'tfidf_maxdf_{feature}'] = tfidf_maxdf.transform(train_feat[feature])
    X_test_all_vector[f'tfidf_maxdf_{feature}'] = tfidf_maxdf.transform(test_feat[feature])
    
    print(feature + ' for tfidf_bigr vectorizer')
    tfidf_bigr.fit(X_train[feature])
    X_train_vector[f'tfidf_bigr_{feature}'] = tfidf_bigr.transform(X_train[feature])
    X_test_vector[f'tfidf_bigr_{feature}'] = tfidf_bigr.transform(X_test[feature])
    X_train_all_global[f'tfidf_bigr_{feature}'] = tfidf_bigr.transform(train_feat[feature])
    X_test_all_vector[f'tfidf_bigr_{feature}'] = tfidf_bigr.transform(test_feat[feature])

plot for cv vectorizer




plot for cv_ngram_binary vectorizer
plot for tfidf vectorizer
plot for tfidf_maxdf vectorizer
plot for tfidf_bigr vectorizer
verbs for cv vectorizer
verbs for cv_ngram_binary vectorizer
verbs for tfidf vectorizer
verbs for tfidf_maxdf vectorizer
verbs for tfidf_bigr vectorizer
nouns for cv vectorizer
nouns for cv_ngram_binary vectorizer
nouns for tfidf vectorizer
nouns for tfidf_maxdf vectorizer
nouns for tfidf_bigr vectorizer
adjectives for cv vectorizer
adjectives for cv_ngram_binary vectorizer
adjectives for tfidf vectorizer
adjectives for tfidf_maxdf vectorizer
adjectives for tfidf_bigr vectorizer
numeral for cv vectorizer
numeral for cv_ngram_binary vectorizer
numeral for tfidf vectorizer
numeral for tfidf_maxdf vectorizer
numeral for tfidf_bigr vectorizer


In [43]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report 



In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [45]:
def fit_for_genre(genre, vector, feature, classifier):
    y_train = y_train_y[genre].values  # check train
    y_val = y_test_y[genre].values  # check result
    y_train_all = train_val[genre].values  # all train
    
    X_tr = X_train_vector[f'{vector}_{feature}']  # check train
    X_v = X_test_vector[f'{vector}_{feature}']  # check result
    X_train_all = X_train_all_global[f'{vector}_{feature}']  # all train
    X_test_all = X_test_all_vector[f'{vector}_{feature}']  # ansver
    
    clf.fit(X_tr, y_train)
    y_val_pred = clf.predict(X_v)
    
    clf.fit(X_train_all, y_train_all)
    y_test_pred = clf.predict(X_test_all)  # ans
    
    return y_train, y_val, y_val_pred, y_test_pred

'''

X_train_vector = {}
X_test_vector = {}
X_train_all_global = {}
X_test_all_vector = {}

train_feat - all train features
train_val - all train genres (validation)
train_id - train id

X_train - 0.7 train features
X_test - 0.3 train features for estimation 
y_train - 0.7 train valid
y_test - 0.3 train valid for estimation

test_feat - final test features
test_id - final test ids

'''

'\n\nX_train_vector = {}\nX_test_vector = {}\nX_train_all_global = {}\nX_test_all_vector = {}\n\ntrain_feat - all train features\ntrain_val - all train genres (validation)\ntrain_id - train id\n\nX_train - 0.7 train features\nX_test - 0.3 train features for estimation \ny_train - 0.7 train valid\ny_test - 0.3 train valid for estimation\n\ntest_feat - final test features\ntest_id - final test ids\n\n'

In [46]:
from copy import deepcopy

In [52]:
def train_with_settings(genres, vector_stra, feature_stra, classifier):
    output = {}
    
    for genre in genres:
        for vect_strategy in vect_strategies:
            for feature in feature_strategies:
                print(f'{genre} TRAIN FOR {vect_strategy} and {feature}')

                y_train, y_val, y_val_pred, y_test_pred = fit_for_genre(genre, vect_strategy, feature, clf)

                print('Test: ', f1_score(y_val, y_val_pred))
                print('Test: ', confusion_matrix(y_val, y_val_pred))
                print('===========================================')
                
                output[genre+'_'+vect_strategy+'_'+feature+'_'+str(type(clf))] = {
                    'y_train': y_train,
                    'y_val': y_val,
                    'y_val_pred': y_val_pred,
                    'y_test_all_pred': y_test_pred,
                    'f1_test': f1_score(y_val, y_val_pred),
                    'cm_test': confusion_matrix(y_val, y_val_pred),
                    'clf': deepcopy(clf),
                    'key': genre+'_'+vect_strategy+'_'+feature+'_'+str(type(clf))
                }
    return output

In [53]:
genres = ['is_murder', 'is_romantic', 'is_comedy', 'is_fantasy', 'is_flashback']
vect_strategies = ['cv_ngram_binary', 'cv', 'tfidf', 'tfidf_maxdf', 'tfidf_bigr']
feature_strategies = ['plot', 'verbs', 'nouns', 'adjectives', 'numeral']

In [54]:
clf = DecisionTreeClassifier(criterion='gini', max_depth = 39, random_state=1)
output_dec_trees = train_with_settings(genres, vect_strategies, feature_strategies, clf)

is_murder TRAIN FOR cv_ngram_binary and plot
Test:  0.5903508771929825
Test:  [[1240  470]
 [ 464  673]]
is_murder TRAIN FOR cv_ngram_binary and verbs
Test:  0.44658119658119655
Test:  [[1393  317]
 [ 719  418]]
is_murder TRAIN FOR cv_ngram_binary and nouns
Test:  0.5716886663632226
Test:  [[1278  432]
 [ 509  628]]
is_murder TRAIN FOR cv_ngram_binary and adjectives
Test:  0.4911749873928391
Test:  [[1351  359]
 [ 650  487]]
is_murder TRAIN FOR cv_ngram_binary and numeral
Test:  0.24657534246575344
Test:  [[1503  207]
 [ 948  189]]
is_murder TRAIN FOR cv and plot
Test:  0.5919642857142857
Test:  [[1270  440]
 [ 474  663]]
is_murder TRAIN FOR cv and verbs
Test:  0.5084581923634606
Test:  [[1304  406]
 [ 611  526]]
is_murder TRAIN FOR cv and nouns
Test:  0.5851254480286738
Test:  [[1268  442]
 [ 484  653]]
is_murder TRAIN FOR cv and adjectives
Test:  0.493621197252208
Test:  [[1312  398]
 [ 634  503]]
is_murder TRAIN FOR cv and numeral
Test:  0.251621271076524
Test:  [[1499  211]
 [ 943 

In [55]:
clf = MultinomialNB()
output_mnb = train_with_settings(genres, vect_strategies, feature_strategies, clf)

is_murder TRAIN FOR cv_ngram_binary and plot
Test:  0.6597582037996547
Test:  [[1295  415]
 [ 373  764]]
is_murder TRAIN FOR cv_ngram_binary and verbs
Test:  0.4967837704106878
Test:  [[1328  382]
 [ 635  502]]
is_murder TRAIN FOR cv_ngram_binary and nouns
Test:  0.6507177033492823
Test:  [[1296  414]
 [ 389  748]]
is_murder TRAIN FOR cv_ngram_binary and adjectives
Test:  0.5765091249415067
Test:  [[1326  384]
 [ 521  616]]
is_murder TRAIN FOR cv_ngram_binary and numeral
Test:  0.18931710615280595
Test:  [[1508  202]
 [ 997  140]]
is_murder TRAIN FOR cv and plot
Test:  0.6357446808510638
Test:  [[1244  466]
 [ 390  747]]
is_murder TRAIN FOR cv and verbs
Test:  0.502906976744186
Test:  [[1302  408]
 [ 618  519]]
is_murder TRAIN FOR cv and nouns
Test:  0.6309523809523809
Test:  [[1237  473]
 [ 395  742]]
is_murder TRAIN FOR cv and adjectives
Test:  0.5885439440314824
Test:  [[1233  477]
 [ 464  673]]
is_murder TRAIN FOR cv and numeral
Test:  0.21914191419141915
Test:  [[1498  212]
 [ 971

In [56]:
clf = KNeighborsClassifier(n_neighbors=7)
output_knn = train_with_settings(genres, vect_strategies, feature_strategies, clf)

is_murder TRAIN FOR cv_ngram_binary and plot
Test:  0.20067340067340064
Test:  [[1511  199]
 [ 988  149]]
is_murder TRAIN FOR cv_ngram_binary and verbs
Test:  0.05382674516400337
Test:  [[1690   20]
 [1105   32]]
is_murder TRAIN FOR cv_ngram_binary and nouns
Test:  0.11018867924528301
Test:  [[1595  115]
 [1064   73]]
is_murder TRAIN FOR cv_ngram_binary and adjectives
Test:  0.49605411499436314
Test:  [[846 864]
 [477 660]]
is_murder TRAIN FOR cv_ngram_binary and numeral
Test:  0.3317925012840267
Test:  [[1223  487]
 [ 814  323]]
is_murder TRAIN FOR cv and plot
Test:  0.4667492771581991
Test:  [[991 719]
 [572 565]]
is_murder TRAIN FOR cv and verbs
Test:  0.417448923246825
Test:  [[1414  296]
 [ 759  378]]
is_murder TRAIN FOR cv and nouns
Test:  0.3402777777777778
Test:  [[1413  297]
 [ 843  294]]
is_murder TRAIN FOR cv and adjectives
Test:  0.4584864070536371
Test:  [[749 961]
 [513 624]]
is_murder TRAIN FOR cv and numeral
Test:  0.2825059101654846
Test:  [[1394  316]
 [ 898  239]]
is

In [57]:
clf = RandomForestClassifier()
output_rfc = train_with_settings(genres, vect_strategies, feature_strategies, clf)

is_murder TRAIN FOR cv_ngram_binary and plot
Test:  0.5867082035306335
Test:  [[1486  224]
 [ 572  565]]
is_murder TRAIN FOR cv_ngram_binary and verbs
Test:  0.41428571428571426
Test:  [[1515  195]
 [ 789  348]]
is_murder TRAIN FOR cv_ngram_binary and nouns
Test:  0.5697612732095491
Test:  [[1499  211]
 [ 600  537]]
is_murder TRAIN FOR cv_ngram_binary and adjectives
Test:  0.47944412275622467
Test:  [[1534  176]
 [ 723  414]]
is_murder TRAIN FOR cv_ngram_binary and numeral
Test:  0.27726432532347506
Test:  [[1449  261]
 [ 912  225]]
is_murder TRAIN FOR cv and plot
Test:  0.5938954992240042
Test:  [[1488  222]
 [ 563  574]]
is_murder TRAIN FOR cv and verbs
Test:  0.4444444444444444
Test:  [[1489  221]
 [ 749  388]]
is_murder TRAIN FOR cv and nouns
Test:  0.6038696537678209
Test:  [[1476  234]
 [ 544  593]]
is_murder TRAIN FOR cv and adjectives
Test:  0.5073115860517435
Test:  [[1520  190]
 [ 686  451]]
is_murder TRAIN FOR cv and numeral
Test:  0.3223981900452489
Test:  [[1364  346]
 [ 8

In [58]:
clf = GradientBoostingClassifier()
output_gbc = train_with_settings(genres, vect_strategies, feature_strategies, clf)


is_murder TRAIN FOR cv_ngram_binary and plot
Test:  0.6774641369736233
Test:  [[1418  292]
 [ 405  732]]
is_murder TRAIN FOR cv_ngram_binary and verbs
Test:  0.5347804637395165
Test:  [[1362  348]
 [ 595  542]]
is_murder TRAIN FOR cv_ngram_binary and nouns
Test:  0.6707089552238805
Test:  [[1422  288]
 [ 418  719]]
is_murder TRAIN FOR cv_ngram_binary and adjectives
Test:  0.5294748124330116
Test:  [[1475  235]
 [ 643  494]]
is_murder TRAIN FOR cv_ngram_binary and numeral
Test:  0.1081081081081081
Test:  [[1657   53]
 [1069   68]]
is_murder TRAIN FOR cv and plot
Test:  0.6710097719869706
Test:  [[1419  291]
 [ 416  721]]
is_murder TRAIN FOR cv and verbs
Test:  0.5377215189873419
Test:  [[1403  307]
 [ 606  531]]
is_murder TRAIN FOR cv and nouns
Test:  0.6713814238566714
Test:  [[1438  272]
 [ 425  712]]
is_murder TRAIN FOR cv and adjectives
Test:  0.5345474022495983
Test:  [[1479  231]
 [ 638  499]]
is_murder TRAIN FOR cv and numeral
Test:  0.0856911883589329
Test:  [[1663   47]
 [1084 

In [59]:
def select_best_clf(dict_of_res):
    all_ = [v for k, v in dict_of_res.items() if genre in k]
    all_metrics_test = [x['f1_test'] for x in all_]
    best_for_genre = max(all_metrics_test)
    index_mnb = all_metrics_test.index(best_for_genre)
    return all_[index_mnb]

In [60]:
best_clfs = {}
for genre in genres:
    best_dec_trees = select_best_clf(output_dec_trees)
    best_mnb = select_best_clf(output_mnb)
    best_knn = select_best_clf(output_knn)
    best_rfc = select_best_clf(output_rfc)
    best_gbc = select_best_clf(output_gbc)
    
    print(f'Best test scores for genre {genre}: ')
    print(f'Decision Tree: {best_dec_trees["f1_test"]}')
    print(f'Multinomial NB: {best_mnb["f1_test"]}')
    print(f'KNN: {best_knn["f1_test"]}')
    print(f'Random Forest: {best_rfc["f1_test"]}')
    print(f'Gradient Boosting: {best_gbc["f1_test"]}')
    
    all_ = [best_dec_trees, best_mnb, best_knn, best_rfc, best_gbc]
    all_m = [x['f1_test'] for x in all_]
    best_for_genre = max(all_m)
    index_mnb = all_m.index(best_for_genre)
    best_clfs[genre] = all_[index_mnb]

Best test scores for genre is_murder: 
Decision Tree: 0.6113342257920571
Multinomial NB: 0.6597582037996547
KNN: 0.49605411499436314
Random Forest: 0.6201705970898144
Gradient Boosting: 0.6878980891719746
Best test scores for genre is_romantic: 
Decision Tree: 0.3374880153403644
Multinomial NB: 0.46972176759410805
KNN: 0.14222222222222222
Random Forest: 0.09024745269286755
Gradient Boosting: 0.20974889217134413
Best test scores for genre is_comedy: 
Decision Tree: 0.18524332810047095
Multinomial NB: 0.2874806800618238
KNN: 0.05025125628140704
Random Forest: 0.04145077720207253
Gradient Boosting: 0.06532663316582914
Best test scores for genre is_fantasy: 
Decision Tree: 0.4716981132075472
Multinomial NB: 0.5303030303030303
KNN: 0.29333333333333333
Random Forest: 0.3466666666666667
Gradient Boosting: 0.49756097560975604
Best test scores for genre is_flashback: 
Decision Tree: 0.31205673758865243
Multinomial NB: 0.3028229255774166
KNN: 0.10212765957446811
Random Forest: 0.1042944785276073

In [90]:
y_true_test_overall = np.concatenate([
    best_clfs['is_murder']['y_val_pred'].reshape(-1, 1),
    best_clfs['is_romantic']['y_val_pred'].reshape(-1, 1),
    best_clfs['is_comedy']['y_val_pred'].reshape(-1, 1),
    best_clfs['is_fantasy']['y_val_pred'].reshape(-1, 1),
    best_clfs['is_flashback']['y_val_pred'].reshape(-1, 1)
], axis = 1)

y_pred_test_overall = np.concatenate([
    best_clfs['is_murder']['y_test_all_pred'].reshape(-1, 1),
    best_clfs['is_romantic']['y_test_all_pred'].reshape(-1, 1),
    best_clfs['is_comedy']['y_test_all_pred'].reshape(-1, 1),
    best_clfs['is_fantasy']['y_test_all_pred'].reshape(-1, 1),
    best_clfs['is_flashback']['y_test_all_pred'].reshape(-1, 1)
], axis = 1)

In [91]:
y_pred_test_overall

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0],
       ...,
       [1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0]], dtype=uint8)

In [92]:
ans_df = pd.DataFrame(y_pred_test_overall)
ans_df

Unnamed: 0,0,1,2,3,4
0,0,1,0,0,0
1,1,0,1,0,0
2,0,1,1,0,0
3,1,0,0,0,0
4,0,1,1,0,1
...,...,...,...,...,...
5334,1,0,0,0,0
5335,0,1,0,0,0
5336,1,0,1,0,0
5337,1,0,0,0,0


In [110]:
tags_list = ["murder", "romantic", "comedy", "fantasy", "flashback"]
temp_arr = []

for i in y_pred_test_overall:
    temp_s = []
    for j in range(5):
        if i[j] == 1:
            temp_s.append(tags_list[j])
    temp_arr.append(', '.join(temp_s))


temp_arr

['romantic',
 'murder, comedy',
 'romantic, comedy',
 'murder',
 'romantic, comedy, flashback',
 '',
 'murder',
 'murder',
 'romantic',
 'romantic, comedy',
 'murder',
 '',
 'murder',
 'romantic',
 'flashback',
 'murder, romantic',
 'flashback',
 'murder',
 'murder, flashback',
 '',
 'murder',
 '',
 'comedy',
 '',
 'murder',
 '',
 'murder, comedy, flashback',
 'comedy, flashback',
 'murder, comedy',
 'murder, comedy',
 'murder, comedy',
 'romantic',
 'romantic',
 'murder',
 'murder',
 'comedy',
 'murder, comedy, flashback',
 'comedy',
 'murder, flashback',
 '',
 'murder',
 'murder, comedy',
 'murder, comedy',
 'murder, flashback',
 '',
 'romantic',
 'murder',
 'fantasy',
 'murder, comedy',
 'murder, comedy',
 'murder, comedy',
 'murder',
 'fantasy',
 'murder',
 'murder',
 'murder, flashback',
 '',
 '',
 'fantasy, flashback',
 'romantic',
 'romantic',
 'romantic',
 'romantic',
 'romantic',
 'romantic',
 'romantic, comedy',
 'romantic',
 'romantic',
 '',
 'fantasy',
 'murder',
 '',
 'mur

In [111]:
answer = pd.DataFrame()
answer['id'] = test_id

answer['tags'] = pd.DataFrame(temp_arr)
answer

Unnamed: 0,id,tags
0,tt0033045,romantic
1,tt0086250,"murder, comedy"
2,tt1315981,"romantic, comedy"
3,tt1937113,murder
4,tt1619029,"romantic, comedy, flashback"
...,...,...
5334,tt1869716,murder
5335,tt0025601,romantic
5336,tt0219952,"murder, comedy"
5337,tt0039464,murder


In [115]:
answer.to_csv('lab_1_Filistovich.csv', index=False)  