In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import RegexpTokenizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer


In [2]:
df = pd.read_csv("cleanCMURevenueAdjusted.tsv", sep="\t")
df.head()

Unnamed: 0,WikiID,Name,Revenue,ReleaseDate,Languages,Countries,Genres,Summary,Year,AdjustedRevenue
0,975900,Ghosts of Mars,14010832.0,2001-08-24,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",2001.0,20224680.0
1,9363483,White Of The Eye,,1987,['English Language'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,1987.0,
2,261236,A Woman in Flames,,1983,['German Language'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",1983.0,
3,18998739,The Sorcerer's Apprentice,,2002,['English Language'],['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...","Every hundred years, the evil Morgana returns...",2002.0,
4,6631279,Little city,,1997-04-04,['English Language'],['United States of America'],"['Romantic comedy', 'Ensemble Film', 'Comedy-d...","Adam, a San Francisco-based artist who works a...",1997.0,


In [3]:

df["Decade"] = 10 * np.round(df["Year"]/10)
df.head()

Unnamed: 0,WikiID,Name,Revenue,ReleaseDate,Languages,Countries,Genres,Summary,Year,AdjustedRevenue,Decade
0,975900,Ghosts of Mars,14010832.0,2001-08-24,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",2001.0,20224680.0,2000.0
1,9363483,White Of The Eye,,1987,['English Language'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,1987.0,,1990.0
2,261236,A Woman in Flames,,1983,['German Language'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",1983.0,,1980.0
3,18998739,The Sorcerer's Apprentice,,2002,['English Language'],['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...","Every hundred years, the evil Morgana returns...",2002.0,,2000.0
4,6631279,Little city,,1997-04-04,['English Language'],['United States of America'],"['Romantic comedy', 'Ensemble Film', 'Comedy-d...","Adam, a San Francisco-based artist who works a...",1997.0,,2000.0


In [4]:
df.Decade.unique()

array([2000., 1990., 1980.,   nan, 1960., 1970., 2010., 1940., 1950.,
       1910., 1930., 1920., 1900., 1890.])

In [5]:
df.Decade.dropna()

0        2000.0
1        1990.0
2        1980.0
3        2000.0
4        2000.0
5        1990.0
6        1990.0
8        1960.0
9        1970.0
10       2010.0
11       2010.0
12       2000.0
13       1980.0
14       1990.0
15       1990.0
16       2010.0
17       2000.0
18       2010.0
19       1990.0
20       2000.0
21       2000.0
22       1980.0
23       2010.0
24       1990.0
25       2010.0
26       1990.0
27       1990.0
28       1940.0
29       2010.0
30       1980.0
          ...  
42173    1990.0
42174    2000.0
42175    1940.0
42176    2000.0
42177    2000.0
42178    1940.0
42179    1970.0
42181    2000.0
42182    1990.0
42183    1980.0
42184    1930.0
42185    2010.0
42186    1960.0
42187    2010.0
42188    2000.0
42189    1940.0
42190    1970.0
42191    1990.0
42192    1950.0
42193    1960.0
42194    2000.0
42195    1990.0
42196    2000.0
42197    1990.0
42198    2010.0
42199    1940.0
42200    2010.0
42201    2010.0
42202    1990.0
42203    2000.0
Name: Decade, Length: 39

In [6]:
d={}
a=[]
maxLabel=0

for decade in df["Decade"]:
    try:
        a.append(d[decade])
    except KeyError:
        a.append(maxLabel)
        d[decade]=maxLabel
        maxLabel +=1
df["Decade"] = a

In [7]:
df.head()

Unnamed: 0,WikiID,Name,Revenue,ReleaseDate,Languages,Countries,Genres,Summary,Year,AdjustedRevenue,Decade
0,975900,Ghosts of Mars,14010832.0,2001-08-24,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",2001.0,20224680.0,0
1,9363483,White Of The Eye,,1987,['English Language'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,1987.0,,1
2,261236,A Woman in Flames,,1983,['German Language'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",1983.0,,2
3,18998739,The Sorcerer's Apprentice,,2002,['English Language'],['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...","Every hundred years, the evil Morgana returns...",2002.0,,0
4,6631279,Little city,,1997-04-04,['English Language'],['United States of America'],"['Romantic comedy', 'Ensemble Film', 'Comedy-d...","Adam, a San Francisco-based artist who works a...",1997.0,,0


In [8]:
train, test = train_test_split(df, test_size=0.3)
train = train.dropna()
test = test.dropna()

In [9]:
train.head()

Unnamed: 0,WikiID,Name,Revenue,ReleaseDate,Languages,Countries,Genres,Summary,Year,AdjustedRevenue,Decade
7609,8737659,Speed Racer,93945766.0,2008-04-28,['English Language'],"['United States of America', 'Australia', 'Ger...","['Adventure', 'Chase Movie', 'Sports', 'Advent...",Speed Racer is an 18-year-old whose life and...,2008.0,111512900.0,6
35802,200694,Clerks.,3151130.0,1994-01,['English Language'],['United States of America'],"['Buddy film', 'Indie', 'Black-and-white', 'Cu...","On April 10, 1993, Dante Hicks , 22, a retail ...",1994.0,5430784.0,1
39013,24593335,The Tree,2219182.0,2010-08-11,['English Language'],"['France', 'Australia']","['Drama', 'Indie', 'World cinema']",Dawn and Peter O’Neil live together with their...,2010.0,2600922.0,6
17117,27602393,Monsters,4242978.0,2010-09-30,"['English Language', 'Spanish Language']",['United Kingdom'],"['Thriller', 'Drama', 'Romance Film', 'Horror'...",After a NASA deep-space probe crash lands in M...,2010.0,4972848.0,6
28791,2436406,48 Hrs.,78868508.0,1982-12-08,['English Language'],['United States of America'],"['Thriller', 'Crime Fiction', 'Buddy film', 'A...",Convicted robber Albert Ganz is working as pa...,1982.0,208817700.0,2


In [10]:
train_y = train["Decade"]
# train_X = train.drop(["Decade", "Year", "ReleaseDate"], axis=1)
train_X = train[["Name", "Revenue", "Languages", "Countries", "Genres", "Summary", "AdjustedRevenue"]]

test_y = test["Decade"]
# train_X = train.drop(["Decade", "Year", "ReleaseDate"], axis=1)
test_X = test[["Name", "Revenue", "Languages", "Countries", "Genres", "Summary", "AdjustedRevenue"]]

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

titleVectorizer = CountVectorizer()
titleX = titleVectorizer.fit_transform(train["Name"])
df["tokenizedTitle"] = titleX

In [37]:
features = train[["Name", "Year", "Languages", "Countries", "Genres", "Summary"]]
label = train['Decade']

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(titleX.toarray(), label.values)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
testTitles = titleVectorizer.transform(test["Name"]).toarray()
y_pred3 = knn.predict(testTitles)
sum(y_pred3 == test['Decade'])/len(y_pred3)

0.21968365553602812

In [16]:
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()
gb.fit(titleX.toarray(), label.values)
testTitles = titleVectorizer.transform(test["Name"]).toarray()
y_pred3 = gb.predict(testTitles)
sum(y_pred3 == test['Decade'])/len(y_pred3)

0.15019937970757644

In [19]:
plotTokenizer = CountVectorizer()
trainPlots = plotTokenizer.fit_transform(train_X["Summary"])
testPlots = plotTokenizer.transform(test_X["Summary"])

In [21]:
gb = GaussianNB()
gb.fit(TrainPlotsVec, train_y)


GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
predict = gb.predict(testPlots.toarray())
sum(predict == test_y)/len(predict)

0.3141338059370846

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(trainPlots.toarray(), train_y)
predict = knn.predict(testPlots.toarray())
sum(predict == test_y)/len(predict)

In [41]:
ada = AdaBoostClassifier(n_estimators=250, learning_rate=1.0)
m=ada.fit(trainPlots.toarray(), train_y)

predict = m.predict(testPlots.toarray())
sum(predict == test_y)/len(predict)

KeyboardInterrupt: 

In [25]:
mnb =MultinomialNB (alpha=0.00, class_prior=None, fit_prior=True)
mnb.fit(trainPlots.toarray(), train_y)
predict = mnb.predict(testPlots.toarray())
sum(predict == test_y)/len(predict)

  'setting alpha = %.1e' % _ALPHA_MIN)


0.3349324029655473

In [30]:
bnb =BernoulliNB (alpha=0.5, binarize=0.0, class_prior=None, fit_prior=True)
bnb.fit(trainPlots.toarray(), train_y)
predict = bnb.predict(testPlots.toarray())
sum(predict == test_y)/len(predict)

0.3597906672481465