# Model the Problem

## Preprocessing the data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data_tau_ta.csv')

In [3]:
df.head()

Unnamed: 0,title,date,days,tokens,stem,lemma,pos_tags,named_entities
0,10 Years of Open Source Machine Learning,7 points by tstonez 19 hours ago | 1 comment,1,"['10', 'years', 'open', 'source', 'machine', '...",10 Years of Open Source Machine Learn,10 Years of Open Source Machine Learning,"[('10', 'CD'), ('Years', 'NNS'), ('of', 'IN'),...",['Open Source Machine']
1,What machines can learn from Apple Watch: dete...,2 points by koukouhappy 7 hours ago | discuss,1,"['machines', 'learn', 'apple', 'watch', 'detec...",What machines can learn from Apple Watch: dete...,What machines can learn from Apple Watch: dete...,"[('What', 'WP'), ('machines', 'NNS'), ('can', ...",['Apple Watch']
2,The Deep Roots of Javascript Fatigue,3 points by nikkielizdemere 13 hours ago | di...,1,"['deep', 'roots', 'javascript', 'fatigue']",The Deep Roots of Javascript Fatigu,The Deep Roots of Javascript Fatigue,"[('The', 'DT'), ('Deep', 'NNP'), ('Roots', 'NN...","['Deep Roots', 'Javascript Fatigue']"
3,Data science intro for math/phys background,9 points by pmigdal 1 day ago | discuss,1,"['data', 'science', 'intro', 'math', 'phys', '...",Data science intro for math/phys background,Data science intro for math/phys background,"[('Data', 'NNP'), ('science', 'NN'), ('intro',...",['Data']
4,"Data Science Pop-Up in Austin, TX",2 points by AnnaOnTheWeb 13 hours ago | discuss,1,"['data', 'science', 'pop', 'austin', 'tx']","Data Science Pop-Up in Austin, TX","Data Science Pop-Up in Austin, TX","[('Data', 'NNP'), ('Science', 'NNP'), ('Pop', ...","['Data Science Pop', 'Austin']"


In [4]:
df.shape

(180, 8)

In [5]:
import nltk

In [6]:
from nltk.corpus import stopwords

In [7]:
stop = stopwords.words('english')

In [8]:
stop.extend(('.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','/','-'))

In [9]:
tokens_list = df['tokens'].tolist()

In [10]:
tokens_list

["['10', 'years', 'open', 'source', 'machine', 'learning']",
 "['machines', 'learn', 'apple', 'watch', 'detecting', 'undiagnosed', 'heart', 'condition']",
 "['deep', 'roots', 'javascript', 'fatigue']",
 "['data', 'science', 'intro', 'math', 'phys', 'background']",
 "['data', 'science', 'pop', 'austin', 'tx']",
 "['data', 'science', 'tools', 'biggest', 'winners', 'losers']",
 "['analyzing', 'golden', 'state', 'warriors', 'passing', 'network', 'using', 'graphframes', 'spark']",
 "['making', 'transparent', 'variations', 'analytical', 'choices', 'affect', 'results']",
 "['neural', 'networks', 'demystified']",
 "['conversion', 'rate', 'changed', 'bayesian', 'timeseries', 'analysis', 'python']",
 "['descriptive', 'statistics', 'sql']",
 "['xgboost4j', 'portable', 'distributed', 'xgboost', 'spark', 'flink', 'dataflow']",
 "['introduction', 'scikit', 'flow', 'simplified', 'interface', 'tensorflow']",
 "['personality', 'space', 'cartoon', 'characters']",
 "['machine', 'learning', 'depth', 'non'

In [11]:
# Let us get the frequency count
frequency_words = {}
for data in tokens_list:
    data = data.replace("[","")
    data = data.replace("]","")
    data = data.replace("'","")
    data_list = data.split(',')
    for token in data_list:
        if token not in stop:
            if token in frequency_words:
                count = frequency_words[token]
                count = count + 1
                frequency_words[token] = count
            else:
                frequency_words[token] = 1

In [12]:
frequency_words

{'': 1,
 ' #': 1,
 ' &': 1,
 ' ***': 1,
 ' +': 2,
 ' ++': 1,
 ' .*:': 1,
 ' 0': 2,
 ' 1': 5,
 ' 101': 1,
 ' 16': 1,
 ' 2': 5,
 ' 2016': 2,
 ' 3': 3,
 ' 4': 1,
 ' 50': 2,
 ' 675': 1,
 ' 88': 1,
 ' 8m': 1,
 ' ??': 1,
 ' ???': 3,
 ' @': 2,
 ' affect': 1,
 ' age': 1,
 ' agree': 1,
 ' aka': 1,
 ' algorithms': 1,
 ' almost': 1,
 ' amazon': 1,
 ' analogies': 1,
 ' analysis': 9,
 ' analytical': 1,
 ' analytics': 1,
 ' analyzer': 1,
 ' analyzing': 1,
 ' ancient': 1,
 ' animated': 1,
 ' anywhere': 1,
 ' apache': 5,
 ' api': 4,
 ' app': 1,
 ' apple': 1,
 ' archive': 1,
 ' arrow': 1,
 ' art': 1,
 ' article': 1,
 ' artificial': 2,
 ' artists': 1,
 ' asked': 1,
 ' austin': 1,
 ' authoring': 1,
 ' automated': 1,
 ' aws': 1,
 ' aylien': 1,
 ' b': 1,
 ' background': 1,
 ' based': 2,
 ' basket': 1,
 ' bay': 1,
 ' bayesian': 4,
 ' beginners': 1,
 ' bengio': 1,
 ' best': 1,
 ' better': 2,
 ' big': 2,
 ' biggest': 1,
 ' blending': 1,
 ' blogs': 1,
 ' boosting': 2,
 ' bootstrap': 1,
 ' bowl': 1,
 ' building

## Term Frequency and Inverse Document Frequency

tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.[1]:8 It is often used as a weighting factor in information retrieval and text mining. The tf-idf value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general.

Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields including text summarization and classification.

Let us start with the **"Term Frequency" - TF**

In [13]:
df_tfidf = pd.DataFrame(data=list(frequency_words.items()),columns=['word','tf'])

In [14]:
df_tfidf.head()

Unnamed: 0,word,tf
0,bootstrap,1
1,,1
2,caffe,1
3,data,8
4,pylearn2,1


In [15]:
df_tfidf.sort_values(ascending=False, by = "tf", inplace=True)

In [16]:
df_tfidf.head()

Unnamed: 0,word,tf
441,data,34
507,learning,28
466,science,21
116,r,11
358,machine,11


Let us get in how many documents (each title) does the word occur

In [23]:
def get_documents_count(row):
    document_counter = 0
    word = row['word']
    for document in df.tokens:
        document = document.replace("'",'')
        document = document.replace("[",'')
        document = document.replace("]",'')
        if word in document:
            document_counter = document_counter + 1
        print(document)
        print(document_counter)
        break
    return document_counter

In [24]:
df_tfidf['document_count'] = df_tfidf.apply(get_documents_count,axis=1)

10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
1
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
1
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
1
10, years, open, source, machine, learning
1
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years, open, source, machine, learning
0
10, years,

In [19]:
df_tfidf.head()

Unnamed: 0,word,tf,document_count
441,data,34,37
507,learning,28,28
466,science,21,21
116,r,11,32
358,machine,11,11


In [20]:
df_tfidf.tail()

Unnamed: 0,word,tf,document_count
252,certified,1,1
253,way,1,1
255,principal,1,1
256,analytical,1,1
695,parsing,1,1


In [None]:
# we already have the count of all the documents
total_docs = df.shape[0]

In [None]:
total_docs

** let us compute the tfidf ** 

**tfidf = tf . idf**

**idf = log(total_docs/number of documents that contain the word)**

In [None]:
import math
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def compute_tfidf(row):
    idf = math.log10(total_docs/row['document_count'])
    return row['tf'] * idf

In [None]:
df_tfidf['tfidf'] = df_tfidf.apply(compute_tfidf,axis=1)

In [None]:
df_tfidf.head()

In [None]:
df_tfidf.tail()

In [None]:
df_tfidf.sort(columns='tfidf',ascending=True,inplace=True)

In [None]:
df_tfidf.head()

In [None]:
df_tfidf.replace(to_replace=0.0,value=0.1,inplace=True)

In [None]:
df_tfidf.head()

In [None]:
df_tfidf.set_index('word', inplace=True)

In [None]:
df_tfidf.head()

** now let us plot a word cloud to see the prominence of the word **

In [None]:
wordcloud = WordCloud()

In [None]:
word_tfidf = df_tfidf['tfidf'].to_dict()

In [None]:
wordcloud.generate_from_frequencies(word_tfidf.items())
plt.figure(figsize=(14,10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Topic modelling


In [None]:
import lda
import numpy as np
import lda.datasets
import sklearn.feature_extraction.text as text

** we are using the pre built reuters data set **

Generating the document term matrix

In [None]:
vectorizer = text.CountVectorizer(input='content', stop_words='english', min_df=1)

In [None]:
dtm = vectorizer.fit_transform(df.title).toarray()

In [None]:
dtm

Loading the vocabulary

In [None]:
vocab = np.array(vectorizer.get_feature_names())

In [None]:
vocab

In [None]:
titles = df.title


In [None]:
model = lda.LDA(n_topics=5, n_iter=500, random_state=1)

In [None]:
model.fit(dtm)

In [None]:
model.topic_word_

In [None]:
topic_word = model.topic_word_ 

In [None]:
topic_word

In [None]:
n_top_words = 8

In [None]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

In [None]:
doc_topic = model.doc_topic_

In [None]:
for n in range(10):
    topic_most_pr = doc_topic[n].argmax()
    print("topic: {} , {}".format(topic_most_pr,titles[n]))

# Sentiment Analysis


In [None]:
from nltk.classify import NaiveBayesClassifier
import math
import collections

In [None]:
pos_features = []
neg_features = []

In [None]:
def make_full_dict(word):
    return dict([(word, True)])

In [None]:
with open('postive_words.txt','r') as posFile:
    lines = posFile.readlines()
    for line in lines:
        pos_features.append([make_full_dict(line.rstrip()),'pos'])
        

In [None]:
pos_features

In [None]:
with open('negative_words.txt','r',encoding='utf-8') as negFile:
    lines = negFile.readlines()
    for line in lines:
        neg_features.append([make_full_dict(line.rstrip()),'neg'])

In [None]:
neg_features

In [None]:
len(pos_features),len(neg_features)

In [None]:
trainFeatures = pos_features + neg_features

In [None]:
trainFeatures

In [None]:
classifier = NaiveBayesClassifier.train(trainFeatures)

In [None]:
referenceSets = collections.defaultdict(set)
testSets = collections.defaultdict(set)

In [None]:
def make_full_dict_sent(words):
    return dict([(word, True) for word in words])

In [None]:
import re

In [None]:
neg_test = 'I hate data science'

In [None]:
title_words = re.findall(r"[\w']+|[.,!?;]", 'I have a love and hate relationship with data science')

In [None]:
title_words

In [None]:
test=[]

In [None]:
test.append([make_full_dict_sent(title_words),''])

In [None]:
test

In [None]:
for i, (features, label) in enumerate(test):
    predicted = classifier.classify(features)
    print(predicted)

In [None]:
for doc in df.title:
    title_words = re.findall(r"[\w']+|[.,!?;]", doc)
    test = []
    test.append([make_full_dict_sent(title_words),''])
    for i, (features, label) in enumerate(test):
        predicted = classifier.classify(features)
        print(predicted,doc)
    