In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
%matplotlib inline

In [3]:
import nltk
import re
import os
import pandas as pd
import numpy as np
import scipy as sp
import string
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, hstack

In [4]:
df = pd.read_csv('test.csv')
print(df.head())

      Id                                       Page content
0  27643  <html><head><div class="article-info"><span cl...
1  27644  <html><head><div class="article-info"><span cl...
2  27645  <html><head><div class="article-info"><span cl...
3  27646  <html><head><div class="article-info"><span cl...
4  27647  <html><head><div class="article-info"><span cl...


In [5]:
# print(np.unique(df['Popularity']))

In [6]:
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text
# print(preprocessor(df.loc[0,'Page content']))

In [7]:
def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]
# print(tokenizer_stem(preprocessor(df.loc[0,'Page content'])))

In [8]:
# nltk.download('stopwords')
stop = stopwords.words('english')
def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]
print(tokenizer_stem_nostop(preprocessor(df.loc[0,'Page content'])))

['sam', 'laird2013', 'utcsocc', 'star', 'get', 'twitter', 'death', 'threat', 'tackl', 'one', 'direct', 'member', 'note', 'human', 'one', 'direct', 'fandom', 'nothin', 'mess', 'british', 'soccer', 'star', 'learn', 'hard', 'way', 'weekend', 'pop', 'band', 'fanboy', 'fangirl', 'brought', 'rucku', 'twitter', 'feed', 'tweet', 'fusillad', 'death', 'threat', 'follow', 'incident', 'run', 'sunday', 'chariti', 'soccer', 'match', 'see', 'also', 'gq', 'get', 'twitter', 'death', 'threat', 'one', 'direct', 'magazin', 'cover', 'gabriel', 'agbonlahor', 'time', 'lead', 'goal', 'scorer', 'english', 'premier', 'leagu', 'side', 'aston', 'villa', 'particip', 'chariti', 'match', 'honor', 'former', 'player', 'battl', 'leukemia', 'also', 'play', 'loui', 'tomlinson', 'one', 'fifth', 'british', 'irish', 'boy', 'band', 'one', 'direct', 'tomlinson', 'dribbl', 'ball', 'midway', 'sunday', 'match', 'pro', 'striker', 'agbonlahor', 'swoop', 'knock', 'away', 'tini', 'amateur', 'clean', 'play', 'sure', 'incident', 'cont

In [9]:
count = CountVectorizer(ngram_range=(1, 1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)
count.fit([df.loc[0,'Page content']])
# dictionary is stored in vocabulary_
BoW = count.vocabulary_
print('[vocabulary]\n{}'.format(BoW))


[vocabulary]
{'sam': 185, 'laird2013': 115, 'utcsocc': 228, 'star': 200, 'get': 80, 'twitter': 223, 'death': 38, 'threat': 212, 'tackl': 209, 'one': 156, 'direct': 43, 'member': 135, 'note': 148, 'human': 97, 'fandom': 60, 'nothin': 149, 'mess': 136, 'british': 21, 'soccer': 195, 'learn': 121, 'hard': 90, 'way': 232, 'weekend': 233, 'pop': 166, 'band': 13, 'fanboy': 59, 'fangirl': 61, 'brought': 22, 'rucku': 182, 'feed': 63, 'tweet': 222, 'fusillad': 75, 'follow': 70, 'incident': 103, 'run': 183, 'sunday': 205, 'chariti': 27, 'match': 134, 'see': 188, 'also': 4, 'gq': 85, 'magazin': 132, 'cover': 34, 'gabriel': 77, 'agbonlahor': 1, 'time': 214, 'lead': 119, 'goal': 82, 'scorer': 187, 'english': 50, 'premier': 170, 'leagu': 120, 'side': 193, 'aston': 8, 'villa': 230, 'particip': 158, 'honor': 94, 'former': 71, 'player': 165, 'battl': 15, 'leukemia': 124, 'play': 164, 'loui': 130, 'tomlinson': 217, 'fifth': 65, 'irish': 105, 'boy': 20, 'dribbl': 45, 'ball': 12, 'midway': 137, 'pro': 171,

In [10]:
doc_bag = count.transform([df.loc[0,'Page content']])
print('(did, vid)\ttf')
print(doc_bag)

print('\nIs document-term matrix a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))
print(doc_bag.shape)

(did, vid)	tf
  (0, 0)	1
  (0, 1)	4
  (0, 2)	1
  (0, 3)	1
  (0, 4)	2
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	2
  (0, 14)	1
  (0, 15)	2
  (0, 16)	1
  (0, 17)	1
  (0, 18)	4
  (0, 19)	1
  (0, 20)	1
  (0, 21)	2
  (0, 22)	1
  (0, 23)	1
  (0, 24)	8
  :	:
  (0, 216)	1
  (0, 217)	4
  (0, 218)	1
  (0, 219)	1
  (0, 220)	1
  (0, 221)	1
  (0, 222)	1
  (0, 223)	5
  (0, 224)	2
  (0, 225)	1
  (0, 226)	1
  (0, 227)	1
  (0, 228)	1
  (0, 229)	6
  (0, 230)	1
  (0, 231)	1
  (0, 232)	3
  (0, 233)	1
  (0, 234)	1
  (0, 235)	1
  (0, 236)	1
  (0, 237)	1
  (0, 238)	1
  (0, 239)	6
  (0, 240)	1

Is document-term matrix a scipy.sparse matrix? True
(1, 241)


In [11]:
doc_bag = doc_bag.toarray()
# print(doc_bag)
# print('\nAfter calling .toarray(), is it a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))

In [12]:
doc = df['Page content'].iloc[:3]
doc_bag = count.fit_transform(doc).toarray()

print("[most frequent vocabularies]")
bag_cnts = np.sum(doc_bag, axis=0)
top = 10
# [::-1] reverses a list since sort is in ascending order
for tok, v in zip(count.inverse_transform(np.ones((1, bag_cnts.shape[0])))[0][bag_cnts.argsort()[::-1][:top]], np.sort(bag_cnts)[::-1][:top]):
    print('{}: {}'.format(tok, v))

[most frequent vocabularies]
glass: 11
one: 10
googl: 8
direct: 8
bulli: 8
loui: 7
video: 6
officialofficedepot: 6
youtub: 6
septemb: 5


In [13]:
tfidf = TfidfVectorizer(ngram_range=(1,1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)
tfidf.fit(doc)

top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names()[sorted_idx[i]], idf[sorted_idx[i]]))

doc_tfidf = tfidf.transform(doc).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf.inverse_transform(np.ones((1, tfidf_sum.shape[0])))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))

[vocabularies with smallest idf scores]
imag: 1.00
topic: 1.00
also: 1.00
see: 1.00
launch: 1.29
u: 1.29
get: 1.29
howev: 1.29
current: 1.29
note: 1.29

[vocabularies with highest tf-idf scores]
glass: 0.6515168147554372
googl: 0.47383041073122706
consol: 0.351517515437782
ouya: 0.351517515437782
amazon: 0.351517515437782
one: 0.3400420551415553
game: 0.2812140123502256
kickstart: 0.2812140123502256
bulli: 0.2720336441132442
direct: 0.2720336441132442


In [14]:
# # hash words to 1024 buckets
# hashvec = HashingVectorizer(n_features=2**10,
#                             preprocessor=preprocessor,
#                             tokenizer=tokenizer_stem_nostop)

# # no .fit needed for HashingVectorizer, since it's defined by the hash function

# # transform sentences to vectors of dimension 1024
# doc_hash = hashvec.transform(doc)
# print(doc_hash.shape)
# print(doc_hash)

# Start Training

In [15]:
def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

# print(next(get_stream(path='train_2.csv', size=10)))

In [16]:
# hashvec = HashingVectorizer(n_features=2**12, 
#                             preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
# # loss='log' gives logistic regression
# clf = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
# # clf = PassiveAggressiveClassifier(loss='log', max_iter=100, tol=1e-3)
# batch_size = 1000
# # stream = get_stream(path='train_2.csv', size=batch_size)
# classes = np.array([0, 1])
# train_auc, val_auc = [], []
# # we use one batch for training and another for validation in each iteration
# iters = int((26000+batch_size-1)/(batch_size))

# stream = get_stream(path='train_2.csv', size=batch_size)
# for i in range(iters):
#     if(i<iters*0.7):
#         batch = next(stream)
#         X_train, y_train = batch['Page content'], batch['Popularity']
#         if X_train is None:
#             break
#         X_train = hashvec.transform(X_train)
#         clf.partial_fit(X_train, y_train, classes=classes)
#         score = roc_auc_score(y_train, clf.predict_proba(X_train)[:,1])
#         train_auc.append(score)
#         print('train: [{}/{}] {}'.format((i+1)*(batch_size), 26000, score))

#     # validate
#     else:
#         batch = next(stream)
#         X_val, y_val = batch['Page content'], batch['Popularity']
#         score = roc_auc_score(y_val, clf.predict_proba(hashvec.transform(X_val))[:,1])
#         val_auc.append(score)
#         print('valid: [{}/{}] {}'.format((i+1)*(batch_size), 26000, score))

In [17]:
# plt.plot(val_auc)

In [18]:
# plt.plot(range(1, len(train_auc)+1), train_auc, color='blue', label='Train auc')
# plt.plot(range(1, len(train_auc)+1), val_auc, color='red', label='Val auc')
# plt.legend(loc="best")
# plt.xlabel('#Batches')
# plt.ylabel('Auc')
# plt.show()

In [None]:
df = pd.read_csv("train_2.csv")
X_train = df['Page content']
Y_train = df['Popularity']
# hashvec = HashingVectorizer(n_features=2**12, 
#                             preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
# tfidf = TfidfVectorizer(ngram_range=(1,1),
#                         preprocessor=preprocessor,
#                         tokenizer=tokenizer_stem_nostop)
# tfidf.fit(data)
# data = tfidf.transform(data)

# tfidf = TfidfVectorizer(ngram_range=(1,1),
#                         max_features=10000,
#                         preprocessor=preprocessor,
#                         tokenizer=tokenizer_stem_nostop)

vect = TfidfVectorizer(min_df=5,
                       ngram_range=(2,4),
                       max_features=10000,
                      ).fit(X_train)
X_train_vectorized = vect.transform(X_train)

# tfidf.fit(X_train)


# clf = RandomForestClassifier(max_depth=6, n_estimators=1000, random_state=0)
# clf.fit(X_train, y_train)
# print("acc: ",clf.score(X_val, y_val))

In [None]:
print(X_train_vectorized.shape)

In [237]:
def get_key_words(data_X, data_y):
    num = 0
    score = 0
    ham=0
    spam=0
    for i in range(len(data_X)):
#         if "fuck" in data_X[i] and "shit" in data_X[i]:
        count = data_X[i].count(r'\d')
        if data_y[i]==1: #ham
            ham+=count
        else:
            spam+=count
    print(ham, spam)
    print(ham/spam)
get_key_words(X_train, Y_train)

0 0


ZeroDivisionError: division by zero

In [None]:
add_length=X_train.str.len()
add_digits=X_train.str.count(r'\d')
add_dollars=X_train.str.count(r'\$')
add_keyword1=X_train.str.count("killer")
add_keyword2=X_train.str.count("massacre")
add_keyword3=X_train.str.count("scandal")
add_keyword4=X_train.str.count("billion")
add_keyword4=X_train.str.count("fail")

In [None]:
print(np.sum(add_length))
print(np.sum(add_digits))
print(np.sum(add_dollars))
print(np.sum(add_keyword1))
print(np.sum(add_keyword2))
print(np.sum(add_keyword3))
print(np.sum(add_keyword4))

In [None]:
def add_feature(X, feature_to_add):    
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [None]:
X_train_transformed = add_feature(X_train_vectorized , [add_length, add_digits, add_dollars, add_keyword1, add_keyword2, add_keyword3, add_keyword4])

In [None]:
print(X_train_transformed.shape)

In [None]:
x_train, X_val, y_train, y_val = train_test_split(X_train_transformed, Y_train, test_size=0.2, shuffle=True, random_state=18)

In [None]:
print(x_train.shape)

In [None]:
clf = RandomForestClassifier(max_depth=8, n_estimators=600)
clf.fit(x_train, y_train)
print("acc: ",clf.score(x_train, y_train))
print("acc: ",clf.score(X_val, y_val))

# Make Prediction

In [234]:
# batch_size = 1
# stream = get_stream(path='test.csv', size=batch_size)
# classes = np.array([0, 1])
# pred=[]
# index=[]
# for i in range(11847):
#     batch = next(stream)
#     X_id, X_test = batch['Id'].item(), batch['Page content']
#     output = clf.predict_proba(hashvec.transform(X_test))[:,1]
#     index.append(X_id)
#     pred.append(output[0])

NameError: name 'get_stream' is not defined

In [236]:
clf.predict(X_test)

ValueError: could not convert string to float: '<html><head><div class="article-info"> <span class="byline basic">Clara Moskowitz</span> for <a href="/publishers/space-com/">Space.com</a> <time datetime="Wed, 19 Jun 2013 15:04:30 +0000">2013-06-19 15:04:30 UTC</time> </div></head><body><h1 class="title">NASA\'s Grand Challenge: Stop Asteroids From Destroying Earth</h1><figure class="article-image"><img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg" data-micro="1" data-url="http://mashable.com/2013/06/19/nasa-grand-challenge-asteroid/" src="http://i.amz.mshcdn.com/I7b9cUsPSztew7r1WT6_iBLjflo=/950x534/2013%2F06%2F19%2Ffe%2FDactyl.44419.jpg"/></figure><article data-channel="world"><section class="article-content"> <p>There may be killer asteroids headed for Earth, and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet.</p> <p>The new mission builds on projects already underway at NASA, including a plan to <a href="http://www.space.com/20591-nasa-asteroid-capture-mission-feasibility.html" target="_blank">capture an asteroid</a>, pull it in toward the moon and send astronauts to visit it. As part of the Grand Challenge, the agency issued a "request for information" today — aiming to solicit ideas from industry, academia and the public on how to improve the asteroid mission plan.</p> <p>"We\'re asking for you to think about concepts and different approaches for what we\'ve described here," William Gerstenmaier, NASA\'s associate administrator for human explorations and operations, said yesterday during a NASA event announcing the initiative. "We want you to think about other ways of enhancing this to get the most out of it."</p> <p><divclass><strong>SEE ALSO: <a href="http://www.space.com/20606-nasa-asteroid-capture-mission-images.html" target="_blank">How It Works: NASA Asteroid-Capture</a></strong><br><br>Responses to the request for information, which also seeks ideas for detecting and mitigating asteroid threats, are due July 18.<br><br>The asteroid-retrieval mission, designed to provide the first deep-space mission for astronauts flying on NASA\'s Space Launch System rocket and Orion space capsule under development, has come under fire from lawmakers who would prefer that NASA return to the moon.<br><br>A <a href="http://www.space.com/21609-nasa-asteroid-capture-mission-congress.html" target="_blank">draft NASA authorization bill</a> from the House space subcommittee, which is currently in debate, would cancel the mission and steer the agency toward other projects. That bill will be discussed during a hearing Wednesday, June 19 at 10 a.m. EDT.<br><br><divclass><strong>SEE ALSO: <a href="http://www.space.com/20606-nasa-asteroid-capture-mission-images.html" target="_blank">How It Works: NASA Asteroid-Capture Mission in Pictures</a></strong><br><br>But NASA officials defended the asteroid mission today and said they were confident they\'d win Congress\' support once they explained its benefits further.<br><br>"I think that we really, truly are going to be able to show the value of the mission," NASA Associate Administrator Lori Garver said today. "To me, this is something that what we do in this country — we debate how we spend the public\'s money. This is the beginning of the debate."<br><br>Garver also maintained that sending astronauts to an asteroid would not diminish NASA\'s other science and exploration goals, including another lunar landing.<br><br><divclass><strong>SEE ALSO: <a href="http://www.space.com/20601-animation-of-proposed-asteroid-retrieval-mission-video.html" target="_blank">Animation Of Proposed Asteroid Retrieval Mission</a></strong><br><br>"This initiative takes nothing from the other valuable work," she said. "This is only a small piece of our overall strategy, but it is an integral piece. It takes nothing from the moon."<br><br>Part of NASA\'s plan to win support for the flight is to link it more closely with the larger goal of protecting Earth from asteroid threats.<br><br>If, someday, humanity discovers an asteroid headed for Earth and manages to alter its course, "it will be one of the most important accomplishments in human history," said Tom Kalil, deputy director for technology and innovation at the White House Office of Science and Technology Policy.<br><br><divclass><strong>SEE ALSO: <a href="http://www.space.com/20006-deep-space-missions-private-companies.html" target="_blank">Wildest Private Deep-Space Mission Ideas: A Countdown</a></strong><br><br>The topic of asteroid threats is more timely than ever, after a meteor exploded over the Russian city of <a href="http://www.space.com/19823-russia-meteor-explosion-complete-coverage.html" target="_blank">Chelyabinsk</a> on Feb. 15 — the same day that the football field-sized <a href="http://www.space.com/19646-asteroid-2012-da14-earth-flyby-complete-coverage.html" target="_blank">asteroid 2012 DA14</a> passed within the moon\'s orbit of Earth.<br><br><em>Image courtesy of <a href="http://www.dvidshub.net/image/707596/ida-and-dactyl#.UcHDQvk4uSo" target="_blank">NASA</a></em></br></br></br></br></divclass></br></br></br></br></br></br></br></br></divclass></br></br></br></br></br></br></br></br></divclass></br></br></br></br></br></br></br></br></divclass></p> <ul> <li><a href="http://www.space.com/34406-spacexs-musk-says-sabotage-unlikely-cause-of-sept-1-explosion-but-still-a-worry.html">SpaceX\'s Musk Says Sabotage Unlikely Cause of Sept. 1 Explosion, But Still a Worry</a></li> <li><a href="http://www.space.com/34405-proxima-centauri-starspots-stellar-cycle-habitable-planet-alien-life.html">Proxima Centauri Is Like Our Sun... on Steroids</a></li> <li><a href="http://www.space.com/34404-china-launches-shenzhou-11-astronauts-to-space-lab.html">China Launches Shenzhou-11 Astronauts to Tiangong-2 Space Lab</a></li> <li><a href="http://www.space.com/34403-space-station-mockup-in-houston-astronaut-guided-tour-video.html">Space Station Mockup In Houston - Astronaut Guided Tour | Video</a></li> </ul> <p> This article originally published at Space.com <a href="http://www.space.com/21610-nasa-asteroid-threat-grand-challenge.html?">here</a> </p> </section></article><footer class="article-topics"> Topics: <a href="/category/asteroid/">Asteroid</a>, <a href="/category/asteroids/">Asteroids</a>, <a href="/category/challenge/">challenge</a>, <a href="/category/earth/">Earth</a>, <a href="/category/space/">Space</a>, <a href="/category/us/">U.S.</a>, <a href="/category/world/">World</a> </footer></body></html>'

In [None]:
print(len(pred), len(index))

In [None]:
dict = {'Id': index, 'Popularity': pred}
predict = pd.DataFrame(dict) 
predict.to_csv('prediction.csv', index=False)