In [1]:
import pandas as pd
import numpy as np
import os

import plotly.graph_objs as go

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
TRAIN_CSV_PATH = '../train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Preview the first three rows of the training frame.
train_df.head(n=3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


### Looking at the number of duplicates (using plotly 3.0!)

In [4]:
# Count the non-null 'id' values for each 'is_duplicate' value.
# as_index=False keeps 'is_duplicate' as a regular column of the result
# instead of turning it into the index.
rows_by_label = train_df.groupby('is_duplicate', as_index=False)
dup_df = rows_by_label['id'].count()


In [32]:
# Light text colour and dark background used by every element of this chart.
dup_text_color = 'rgb(230,230,230)'
dup_bg_color = 'rgb(44,48,60)'

# One bar per 'is_duplicate' value; bar height is the record count in 'id'.
dup_trace = go.Bar(x=dup_df['is_duplicate'], y=dup_df['id'])

dup_layout = go.Layout(
    title="Number of records for each 'is_duplicate' class",
    titlefont={'color': dup_text_color},
    xaxis={'title': 'is duplicate', 'color': dup_text_color},
    yaxis={'title': 'Count', 'color': dup_text_color},
    paper_bgcolor=dup_bg_color,
    plot_bgcolor=dup_bg_color,
)

fig = go.FigureWidget(data=[dup_trace], layout=dup_layout)


In [33]:
# Bare last expression: displays the 'is_duplicate' bar chart widget.
fig

### Let's take a look at how many times each question appears

In [7]:
# List the column labels of the training frame.
train_df.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [8]:
# Stack both question-id columns into one long Series so that every
# appearance of a question, on either side of a pair, contributes one row.
#
# The two inputs carry different names ('qid1' / 'qid2'), so pd.concat returns
# an unnamed Series, which becomes column 0 when wrapped in a DataFrame.
# Do not call Series.rename(columns=...) on the inputs: Series.rename has no
# `columns` keyword and raises TypeError on pandas >= 1.0.
questions = pd.concat(
    [train_df['qid1'], train_df['qid2']],
    axis=0,
)

In [9]:
# Wrap the stacked ids in a DataFrame, label column 0 as 'qid', and attach a
# constant 'ind' column of ones that the aggregation step counts over.
questions = (
    pd.DataFrame(questions)
    .rename(columns={0: 'qid'})
    .assign(ind=1)
)

In [10]:
# Tally how often each qid occurs, order the tallies from most to least
# frequent, and expose them under the column name 'count'.
qid_groups = questions.groupby('qid', as_index=False)
qid_counts = qid_groups['ind'].count()
questions = (
    qid_counts
    .sort_values('ind', ascending=False)
    .rename(columns={'ind': 'count'})
)

In [11]:
# Preview the three most frequent question ids.
questions.head(n=3)

Unnamed: 0,qid,count
2558,2559,157
30781,30782,120
4043,4044,111


In [34]:
# Number of most frequent questions to plot; set this to whatever you want.
num_questions = 50

# Slice the top rows first so the label formatting below runs on only
# `num_questions` values rather than on every qid in the frame.
# Relies on `questions` already being sorted by 'count' in descending order.
top_questions = questions.head(num_questions)

q_fig = go.FigureWidget(
    data = [go.Bar(
        # Text labels such as 'qid: 2559' for the x axis.
        x=top_questions['qid'].apply(lambda qid: 'qid: %i' % qid),
        y=top_questions['count']
    )],
    layout=go.Layout(
        title="Number of questions appearing in the training set for each qid (top %i questions)" % num_questions,
        titlefont=dict(
            color='rgb(230,230,230)'
        ),
        xaxis=dict(
            title='qid',
            color='rgb(230,230,230)'
        ),
        yaxis=dict(
            title='Count',
            color='rgb(230,230,230)'
        ),
        paper_bgcolor = 'rgb(44,48,60)',
        plot_bgcolor = 'rgb(44,48,60)'
    )
)


In [35]:
# Bare last expression: displays the per-question count bar chart widget.
q_fig