In [1]:
import pandas as pd
import numpy as np
import os

import plotly.graph_objs as go

In [2]:
train_df = pd.read_csv('../train.csv')

In [3]:
train_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


### Looking at the number of duplicates (using plotly 3.0!)

In [4]:
dup_df = train_df.\
         groupby('is_duplicate',
                 as_index = False)\
         ['id'].\
         count()


In [5]:
fig = go.FigureWidget(
    data = [go.Bar(
        x=dup_df['is_duplicate'],
        y=dup_df['id']
    )],
    layout=go.Layout(
        title="Number of records for each 'is_duplicate' class",
        titlefont=dict(
            color='rgb(230,230,230)'
        ),
        xaxis=dict(
            title='is duplicate',
            color='rgb(230,230,230)'
        ),
        yaxis=dict(
            title='Count',
            color='rgb(230,230,230)'
        ),
        paper_bgcolor = 'rgb(44,48,60)',
        plot_bgcolor = 'rgb(44,48,60)'
    )
)


In [6]:
fig

### Let's take a look at how many times each question appears

In [7]:
train_df.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [8]:
questions = pd.concat([
                train_df['qid1'],
                train_df['qid2'].\
                    rename(columns={'qid2' : 'qid1'})
                ],
                axis=0
)

In [9]:
questions = pd.DataFrame(questions).\
    rename(columns={0:'qid'})
# adding dummy indicator for aggregation
questions['ind'] = 1

In [10]:
questions = questions.\
            groupby('qid',
                     as_index = False)\
            ['ind'].\
            count().\
            sort_values('ind',
                         ascending=False
            ).\
            rename(columns={'ind':'count'})

In [11]:
questions.head(3)

Unnamed: 0,qid,count
2558,2559,157
30781,30782,120
4043,4044,111


In [12]:
# set this parameter to be whatever you want.
num_questions = 50

q_fig = go.FigureWidget(
    data = [go.Bar(
        x=questions['qid'].\
          apply(lambda x: 'qid: %i' % x).\
          head(num_questions),
        y=questions['count'].\
          head(num_questions)
    )],
    layout=go.Layout(
        title="Number of questions appearing in the training set for each qid (top %i questions)" % num_questions,
        titlefont=dict(
            color='rgb(230,230,230)'
        ),
        xaxis=dict(
            title='qid',
            color='rgb(230,230,230)'
        ),
        yaxis=dict(
            title='Count',
            color='rgb(230,230,230)'
        ),
        paper_bgcolor = 'rgb(44,48,60)',
        plot_bgcolor = 'rgb(44,48,60)'
    )
)


In [13]:
q_fig

# So some questions appear multiple times in the dataset.

Lets take a look at qid: 2559


In [14]:
train_df[(train_df['qid1'] == 2559)].head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
14712,14712,2559,4506,What are the best ways to lose weight?,What would be a realistic plan to lose weight?,1
38200,38200,2559,2711,What are the best ways to lose weight?,What is the best method of losing weight?,1
56239,56239,2559,10321,What are the best ways to lose weight?,Where do I find a simple to understand solutio...,1
81363,81363,2559,43605,What are the best ways to lose weight?,I'm overweight. How can I begin to lose weight?,1
81973,81973,2559,23803,What are the best ways to lose weight?,What is the fastest possible way to lose weight?,1
82016,82016,2559,7445,What are the best ways to lose weight?,What should I do to reduce weight?,1
86631,86631,2559,5358,What are the best ways to lose weight?,What's the best plan to lose weight?,1
89295,89295,2559,17100,What are the best ways to lose weight?,"I am ugly and fat, how to lose weight?",1
106632,106632,2559,10322,What are the best ways to lose weight?,How can I lose 4kg weight?,1
113625,113625,2559,2712,What are the best ways to lose weight?,What are the best way of loose the weight?,1


These are seemingly weight-loss questions.

Let's now bring this into a corpus.

In [15]:
corpus = pd.concat([
            train_df[['qid1','question1']],
            train_df[['qid2','question2']].\
                rename(columns={'qid2':'qid1',
                                'question2':'question1'}
                )],
            axis=0
        )

corpus = corpus.\
         rename(columns={'qid1':'qid',
                         'question1':'question'}
         ).\
         drop_duplicates()

In [16]:
corpus.head(3)

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,5,How can I increase the speed of my internet co...


## cleaning with nltk

In [17]:
import nltk

In [45]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
import string

In [19]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

### There may be spelling mistakes in the questions, but will try to find these later

In [20]:
def clean(doc):
    doc = str(doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [21]:
corpus['question'] = corpus['question'].apply(clean)

## Looking at some of the words that appear most frequently

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
vectorizer = CountVectorizer(min_df=1)

vectorizer = CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=np.float32, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [93]:
vector_matrix = vectorizer.fit_transform(corpus['question'].tolist()[:10000])

In [94]:
appearences = vector_matrix.sum(axis=0)
feat_names = vectorizer.get_feature_names()

In [107]:
word_counts = [{'word' : feat_names[i],
                'count' : appearences[0,i]} for i in range(np.shape(appearences)[1])]

word_counts = pd.DataFrame(word_counts).\
              sort_values('count', ascending=False)

In [138]:
# set this parameter to be whatever you want.
num_words = 20

w_fig = go.FigureWidget(
    data = [go.Bar(
        y=word_counts.\
          head(num_questions).\
          sort_values('count')\
          ['word'],
        x=word_counts.\
          head(num_questions).\
          sort_values('count')\
          ['count'],
        orientation = 'h'
    )],
    layout=go.Layout(
        title="Number of times each word appears in the corpus for %i words (descending order)" % num_questions,
        titlefont=dict(
            color='rgb(230,230,230)'
        ),
        xaxis=dict(
            title='qid',
            color='rgb(230,230,230)'
        ),
        yaxis=dict(
            title='Count',
            color='rgb(230,230,230)'
        ),
        paper_bgcolor = 'rgb(44,48,60)',
        plot_bgcolor = 'rgb(44,48,60)',
        font=dict(size=8),
        height=700
    )
)

In [139]:
w_fig

## Some observations from the data above:
* some words should be treated in the same foot (e.g. india and indian)
* best is the most dominant word in the corpus, questions are probably something like 'what is the best...'
* questions that are hypothetical may be characterised as having 'would'

...

## Using gensim to run a LDA model

In [22]:
from gensim import corpora, models


detected Windows; aliasing chunkize to chunkize_serial



Taking the first 10,000 records just to see what happens.

In [23]:
dictionary = corpora.Dictionary([x.split() for x in corpus['question'].tolist()[:10000]])

In [24]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc 
                   in [x.split() for x in corpus['question'].tolist()[:10000]]
                  ]

In [41]:
Lda = models.ldamodel.LdaModel
num_topics = 30
# Running and Trainign LDA model on the document term matrix.
# Use defaul priors, no callbacks yet.
ldamodel = Lda(corpus = doc_term_matrix, 
               num_topics=num_topics, 
               id2word = dictionary,
               distributed=False,
               passes=50,
               minimum_probability = 0.05
)

### Printing some topics.

In [42]:
print(ldamodel.print_topics(num_topics=num_topics, num_words=5))

[(0, '0.052*"thing" + 0.042*"know" + 0.035*"year" + 0.033*"take" + 0.033*"new"'), (1, '0.119*"make" + 0.036*"money" + 0.020*"american" + 0.017*"happens" + 0.016*"option"'), (2, '0.160*"best" + 0.091*"way" + 0.033*"online" + 0.024*"company" + 0.019*"much"'), (3, '0.046*"want" + 0.040*"engineering" + 0.037*"woman" + 0.028*"me" + 0.025*"do"'), (4, '0.053*"trump" + 0.035*"2016" + 0.030*"donald" + 0.021*"hillary" + 0.021*"clinton"'), (5, '0.026*"get" + 0.022*"you" + 0.022*"not" + 0.019*"skill" + 0.016*"energy"'), (6, '0.133*"india" + 0.030*"war" + 0.027*"travel" + 0.025*"increase" + 0.016*"world"'), (7, '0.040*"without" + 0.034*"buy" + 0.033*"phone" + 0.025*"data" + 0.018*"android"'), (8, '0.118*"people" + 0.078*"life" + 0.040*"love" + 0.031*"stop" + 0.017*"window"'), (9, '0.052*"find" + 0.033*"system" + 0.032*"old" + 0.028*"facebook" + 0.028*"business"'), (10, '0.074*"difference" + 0.053*"many" + 0.038*"english" + 0.033*"school" + 0.024*"type"'), (11, '0.140*"good" + 0.036*"example" + 0.03