In [None]:
import re
import unicodedata

import matplotlib.pyplot as plt
import nltk
import nltk.sentiment
import pandas as pd
import seaborn as sns
# Turn on matplotlib's interactive mode.
plt.ion()
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever name this install provides.
WHITEGRID_STYLE = 'seaborn-v0_8-whitegrid'
if WHITEGRID_STYLE not in plt.style.available:
    WHITEGRID_STYLE = 'seaborn-whitegrid'  # Matplotlib < 3.6
plt.style.use(WHITEGRID_STYLE)

In [None]:
# NLTK's English stopword list, held as a set for the membership test in clean().
english_stopwords = nltk.corpus.stopwords.words('english')
STOPWORDS = set(english_stopwords)

In [None]:
def clean(text: str) -> list:
    '''
    Turn raw text into a list of lemmatized, non-stopword tokens.

    The text is Unicode-normalized, reduced to ASCII, lower-cased, stripped
    of every character that is neither a word character nor whitespace, and
    split on whitespace. Tokens found in STOPWORDS are dropped and the
    remaining ones are lemmatized with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list
        The lemmatized tokens, in their original order.
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    # NFKD splits an accented letter into its base letter plus a combining
    # mark, so the ASCII encode below drops only the mark ("café" -> "cafe")
    # instead of deleting the whole character ("café" -> "caf").
    text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower()
    )
    # NOTE: punctuation (apostrophes included) is removed before the stopword
    # test, so a contraction such as "don't" is compared as "dont" and is only
    # filtered out if STOPWORDS contains that apostrophe-less form.
    words = re.sub(r'[^\w\s]', '', text).split()  # tokenization
    return [wnl.lemmatize(word) for word in words if word not in STOPWORDS]

In [None]:
# Codeup blog articles
blog_csv = 'big_blogs.csv'
df = pd.read_csv(blog_csv)

In [None]:
# Tokenize each article body with clean(); every row gets a list of tokens.
df['words'] = df['content'].map(clean)

In [None]:
# Flatten every row's token list into one space-separated corpus string.
all_words = ' '.join(' '.join(tokens) for tokens in df['words'])
# Count each token across the corpus and show the 15 most frequent.
word_freq = pd.Series(all_words.split()).value_counts()
word_freq.head(15)

In [None]:
# 15 most common bigrams
# Flatten the per-row bigrams with a plain comprehension. The previous
# `.apply(pd.Series).stack()` built one Series per row and a wide NaN-padded
# frame only to flatten it again; this yields the same bigram sequence.
bigram_list = [bigram for words in df.words for bigram in nltk.bigrams(words)]
(
    pd.Series(bigram_list, dtype=object)
    .value_counts()
    .head(15)
    .plot.barh(width=.9, ec='black', color='papayawhip')
)

In [None]:
# Derive one length column per source column: character counts for the
# title and content strings, token count for the cleaned word list.
for source_col, length_col in [
    ('title', 'title_length'),
    ('content', 'content_length'),
    ('words', 'word_count'),
]:
    df[length_col] = df[source_col].map(len)

In [None]:
# Pairwise scatter/histogram grid of the three length features.
length_columns = ['title_length', 'content_length', 'word_count']
sns.pairplot(df[length_columns])

In [None]:
# Score each article body and keep the 'compound' entry of the scores dict.
sia = nltk.sentiment.SentimentIntensityAnalyzer()
df['sentiment'] = df['content'].map(
    lambda body: sia.polarity_scores(body)['compound']
)

In [None]:
# Histogram of the per-article sentiment scores.
df['sentiment'].plot(kind='hist')

In [None]:
# Inshorts news articles
# The first CSV column is dropped by position (presumably a saved index
# column -- confirm against the file).
articles_csv = 'articles.csv'
df = pd.read_csv(articles_csv).iloc[:, 1:]

In [None]:
# Show the first row transposed, so each column is displayed on its own line.
df.head(1).transpose()

In [None]:
# Tokenize each article body with clean(); every row gets a list of tokens.
df['words'] = df['content'].map(clean)

In [None]:
# Flatten every row's token list into one space-separated corpus string.
all_words = ' '.join(' '.join(tokens) for tokens in df['words'])
# Count each token across the corpus and show the 15 most frequent.
word_freq = pd.Series(all_words.split()).value_counts()
word_freq.head(15)

In [None]:
# 15 most common bigrams
# Flatten the per-row bigrams with a plain comprehension. The previous
# `.apply(pd.Series).stack()` built one Series per row and a wide NaN-padded
# frame only to flatten it again; this yields the same bigram sequence.
bigram_list = [bigram for words in df.words for bigram in nltk.bigrams(words)]
pd.Series(bigram_list, dtype=object).value_counts().head(15)

In [None]:
# Derive one length column per source column: character counts for the
# title and content strings, token count for the cleaned word list.
for source_col, length_col in [
    ('title', 'title_length'),
    ('content', 'content_length'),
    ('words', 'word_count'),
]:
    df[length_col] = df[source_col].map(len)

In [None]:
# Pairwise scatter/histogram grid of the three length features.
length_columns = ['title_length', 'content_length', 'word_count']
sns.pairplot(df[length_columns])

In [None]:
# Score each article body and keep the 'compound' entry of the scores dict.
sia = nltk.sentiment.SentimentIntensityAnalyzer()
df['sentiment'] = df['content'].map(
    lambda body: sia.polarity_scores(body)['compound']
)

In [None]:
# Histogram of the per-article sentiment scores.
df['sentiment'].plot(kind='hist')

In [None]:
# Summary statistics per topic, transposed so topics become the columns.
# Author's observation: no difference between topics in anything but sentiment.
df.groupby('topic').describe().transpose()

In [None]:
# Summary statistics of the sentiment score within each topic.
df.groupby('topic')['sentiment'].describe()

In [None]:
# One box of sentiment scores per topic.
sns.boxplot(data=df, x='topic', y='sentiment')

In [None]:
# Order the articles by ascending sentiment once; the first row is then the
# most negative article and the last row the most positive.
by_sentiment = df.sort_values(by='sentiment')
most_negative_article = by_sentiment.head(1)
most_positive_article = by_sentiment.tail(1)

In [None]:
# Print the topic, title and full text of the lowest-sentiment article.
negative_row = most_negative_article.iloc[0]
print('--- Most negative article\n')
print(negative_row['topic'] + ':', negative_row['title'])
print()
print(negative_row['content'])

In [None]:
# Print the topic, title and full text of the highest-sentiment article.
positive_row = most_positive_article.iloc[0]
print('--- Most positive article\n')
print(positive_row['topic'] + ':', positive_row['title'])
print()
print(positive_row['content'])