In [42]:
import pandas as pd
import plotly.graph_objs as go

from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from plotly.offline import init_notebook_mode, iplot

import nltk
# Fetch the NLTK resources behind the helpers imported above.
# The tokenizer models used by word_tokenize are published as 'punkt';
# 'punct' is not a known resource id, so that download could never succeed.
nltk.download('punkt')
nltk.download('stopwords')

# Enable inline rendering of plotly figures in the notebook.
init_notebook_mode(connected=True)

## Read data

In [9]:
# Load both splits with the same explicit column names (names= is passed, so
# the files' rows are read as data under these three labels).
df_train, df_test = (
    pd.read_csv(csv_path, names=["topic", "title", "article"])
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [10]:
# Peek at two random training rows (unseeded, so the rows differ per run).
df_train.sample(n=2)

Unnamed: 0,topic,title,article
92602,4,Dell and Microsoft collaborate to reduce patch...,Dell and Microsoft are to combine their patch ...
36390,1,"Powell, Libya Foreign Minister to Hold First T...",UNITED NATIONS (Reuters) - In a sign of warmi...


In [11]:
# Peek at two random test rows (unseeded, so the rows differ per run).
df_test.sample(n=2)

Unnamed: 0,topic,title,article
3282,4,Qatar Defense Show Focuses on Videogame Techno...,DOHA (Reuters) - Rick Bracewell is driving th...
5230,1,Militants holding UN workers say talks have be...,Militants threatening to kill three UN hostage...


## EDA

#### Check class balance

In [13]:
# Number of training rows per topic label.
df_train.loc[:, "topic"].value_counts()

3    30000
4    30000
2    30000
1    30000
Name: topic, dtype: int64

In [14]:
# Number of test rows per topic label.
df_test.loc[:, "topic"].value_counts()

3    1900
4    1900
2    1900
1    1900
Name: topic, dtype: int64

#### Check if sets have missing values

In [15]:
# Report, for each set, whether any value is missing in each column:
# isna().any() reduces every column to a single boolean.
print(f"""
\ttrain:
{df_train.isna().any()}
\ttest:
{df_test.isna().any()}
""")


	train:
topic      False
title      False
article    False
dtype: bool
	test:
topic      False
title      False
article    False
dtype: bool



#### Compare word count distribution over sets

In [16]:
# Add <column>_word_count to both frames using a plain whitespace split
# (word_tokenize(x) runs long for EDA). The vectorized .str accessor replaces
# four copy-pasted .apply(lambda ...) lines; columns are created in the same
# order as before (title, then article, for train and then test).
for frame in (df_train, df_test):
    for column in ('title', 'article'):
        frame[f'{column}_word_count'] = frame[column].str.split().str.len()

In [22]:
# Overlay the train and test title-length distributions. histnorm="percent"
# normalizes each trace, so the two differently sized sets share one scale and
# the y-axis shows a percentage rather than a raw count.
trace1 = go.Histogram(x=df_train['title_word_count'], opacity=0.5, histnorm="percent",
                      name="Train title word count")
trace2 = go.Histogram(x=df_test['title_word_count'], opacity=0.5, histnorm="percent",
                      name="Test title word count")


data = [trace1, trace2]
layout = go.Layout(barmode='overlay',
                   title='Word count of titles',
                   xaxis=dict(title='Word count'),
                   # Was 'Numer of titles': misspelled, and the axis is percent-normalized.
                   yaxis=dict(title='Percent of titles'))

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [23]:
# Overlay the train and test article-length distributions. histnorm="percent"
# normalizes each trace, so the two differently sized sets share one scale and
# the y-axis shows a percentage rather than a raw count.
trace3 = go.Histogram(x=df_train['article_word_count'], opacity=0.5, histnorm="percent",
                      name="Train article word count")
trace4 = go.Histogram(x=df_test['article_word_count'], opacity=0.5, histnorm="percent",
                      name="Test article word count")


data = [trace3, trace4]
layout = go.Layout(barmode='overlay',
                   title='Word count of articles',
                   xaxis=dict(title='Word count'),
                   # Was 'Numer of titles': copied from the title plot and not a count.
                   yaxis=dict(title='Percent of articles'))

fig = go.Figure(data=data, layout=layout)
iplot(fig)

#### Compare most common words in articles for each topic

In [80]:
def get_top_words(df, topic, n):
    """Return the ``n`` most frequent non-stop-word tokens in one topic's articles.

    Tokens come from a plain whitespace split of the lowercased text, so
    punctuation attached to a word stays part of the token (``'said.'`` is
    counted separately from ``'said'``). Only stand-alone punctuation tokens
    are filtered out.

    Parameters
    ----------
    df : DataFrame
        Must provide a ``topic`` column and a string ``article`` column.
    topic
        Value compared against ``df['topic']`` to select the rows.
    n : int
        Number of entries to return.

    Returns
    -------
    list of (str, int)
        ``(token, count)`` pairs in ``Counter.most_common`` order; an empty
        list when no row matches ``topic``.
    """
    # A set makes each membership test O(1); the previous list was scanned
    # once per token across the whole corpus.
    sw = set(stopwords.words('english'))
    sw.update(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
               ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@',
               '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
               '#39;s'])  # presumably residue of an HTML-escaped "'s" -- TODO confirm

    text = df[df['topic'] == topic]["article"]

    # Lowercase each token once, then drop stop words while counting.
    lowered = (x.lower() for x in " ".join(text).split())
    ctnr = Counter(w for w in lowered if w not in sw)
    return ctnr.most_common(n)

In [81]:
# Top-10 tokens for topic 1: train list on the first line, test list on the second.
print(get_top_words(df_train, 1, 10), get_top_words(df_test, 1, 10), sep="\n")

[('said', 5408), ('president', 3652), ('two', 2895), ('minister', 2867), ('ap', 2768), ('new', 2603), ('(reuters)', 2478), ('iraq', 2466), ('us', 2414), ('killed', 2268)]
[('said', 322), ('president', 234), ('ap', 193), ('two', 171), ('us', 162), ('(reuters)', 160), ('minister', 160), ('new', 160), ('iraq', 154), ('said.', 141)]


In [82]:
# Top-10 tokens for topic 2: train list on the first line, test list on the second.
print(get_top_words(df_train, 2, 10), get_top_words(df_test, 2, 10), sep="\n")

[('new', 3533), ('first', 3441), ('ap', 3100), ('last', 2622), ('--', 2513), ('world', 2460), ('two', 2409), ('team', 2328), ('one', 2314), ('game', 2303)]
[('first', 231), ('new', 207), ('ap', 195), ('last', 174), ('two', 173), ('one', 169), ('win', 163), ('world', 162), ('--', 162), ('team', 150)]


In [84]:
# Top-10 tokens for topic 3: train list on the first line, test list on the second.
print(get_top_words(df_train, 3, 10), get_top_words(df_test, 3, 10), sep="\n")

[('said', 6607), ('new', 5714), ('(reuters)', 4296), ('oil', 3953), ('us', 3809), ('inc.', 3515), ('company', 3133), ('york', 2986), ('prices', 2782), ('u.s.', 2608)]
[('said', 425), ('new', 352), ('(reuters)', 260), ('us', 254), ('inc.', 240), ('oil', 220), ('company', 191), ('york', 185), ('prices', 164), ('u.s.', 161)]


In [85]:
# Top-10 tokens for topic 4: train list on the first line, test list on the second.
print(get_top_words(df_train, 4, 10), get_top_words(df_test, 4, 10), sep="\n")

[('new', 5222), ('said', 3199), ('microsoft', 2766), ('software', 2646), ('internet', 2324), ('company', 2210), ('--', 1839), ('computer', 1786), ('inc.', 1777), ('first', 1765)]
[('new', 327), ('said', 216), ('microsoft', 192), ('internet', 151), ('software', 149), ('company', 146), ('--', 144), ('inc.', 124), ('first', 120), ('computer', 108)]


In [74]:
# Scratch exploration behind get_top_words. `text` and `sw` were never defined
# at notebook scope (they exist only as locals of get_top_words), so this cell
# raised NameError on a fresh kernel. Rebuild them here.
# NOTE(review): the saved output matches the train set, topic 1, with '#39;s'
# not yet filtered -- confirm that is the intended input.
sw = set(stopwords.words('english'))
sw.update(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
           ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@',
           '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'])
text = df_train[df_train['topic'] == 1]["article"]

words = " ".join(text).split()
words = [x.lower() for x in words if x.lower() not in sw]

In [75]:
# Tally the filtered tokens and show the ten most frequent.
ctnr = Counter()
ctnr.update(words)
ctnr.most_common(10)

[('said', 5408),
 ('#39;s', 5190),
 ('president', 3652),
 ('two', 2895),
 ('minister', 2867),
 ('ap', 2768),
 ('new', 2603),
 ('(reuters)', 2478),
 ('iraq', 2466),
 ('us', 2414)]