In [1]:
import warnings
import sys
import os


# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Working directory of the kernel. os.getcwd() is used instead of the
# IPython-only `%pwd` magic so the cell is plain Python.
current_dir = os.getcwd()

# Make the directory two levels above the working directory importable, so the
# `src.main...` imports used later can be resolved
# (presumably this is the project root -- confirm against the repo layout).
parent_dir = os.path.abspath(os.path.join(current_dir, '../..'))
# Guarded so that re-running the cell does not keep appending duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from src.main.utilities.utils import get_dataset
from src.main.pipeline.pipeline import Pipeline
from src.main.pipeline.functions import stop_words_removal, clean_text, remove_contractions, unify_numbers, tfidf_vectorizer
from wordcloud import WordCloud
from collections import Counter
from yellowbrick.text import TSNEVisualizer
import warnings

# Suppress all warnings for the rest of the session (the same 'ignore' filter
# is also installed in the first cell, so this call is a repeat of it).
warnings.filterwarnings('ignore')

In [3]:
#create_report(dataset)

In [10]:
# Class balance: count how many samples carry each label and draw the counts
# as a bar chart (one bar per class, most frequent class first).
inputs, targets = get_dataset()
targets_series = pd.Series(targets)
class_counts = targets_series.value_counts()

fig, ax = plt.subplots()
class_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Class')
ax.set_ylabel('Number of Samples')
ax.set_title('Number of Samples per Class')
# Keep the class names upright instead of the rotated default for bar plots.
plt.setp(ax.get_xticklabels(), rotation='horizontal')
plt.show()


In [86]:
# Text-cleaning pipeline; the steps are applied in the order listed.
cleaning_steps = [
    remove_contractions,
    clean_text,
    stop_words_removal,
    unify_numbers,
]
pipeline = Pipeline(cleaning_steps)

# Reload the raw data and run it through the pipeline.
inputs, targets = get_dataset()
results = pipeline.execute(inputs)

In [98]:
# Plot, for each class, the mean number of words removed by the pipeline.
# Lengths are measured in whitespace-separated words (str.split), not characters.

# Word counts BEFORE cleaning come from the raw inputs and word counts AFTER
# cleaning come from the pipeline output. Both were previously computed from
# `results`, which made the printed difference always 0.
text_lengths = [len(text.split()) for text in inputs.reshape(-1).tolist()]
clean_text_lengths = [len(text.split()) for text in results]

# Overall mean number of words removed per document.
print(np.mean(text_lengths) - np.mean(clean_text_lengths))

# One row per document: raw text, cleaned text, both lengths and their difference.
dataframe = pd.DataFrame({'full_article': inputs.reshape(-1).tolist(), 'class': targets})
dataframe['clean_text'] = results
dataframe['text_length'] = dataframe['full_article'].apply(lambda x: len(x.split()))
dataframe['clean_text_length'] = dataframe['clean_text'].apply(lambda x: len(x.split()))
dataframe['text_length_diff'] = dataframe['text_length'] - dataframe['clean_text_length']

# Average the per-document difference within each class and plot it.
mean_length_diff = dataframe.groupby('class')['text_length_diff'].mean()

mean_length_diff.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Mean difference in text length')
plt.title('Mean difference in text length after cleaning')
plt.show()




In [99]:
# Distribution of document lengths on the raw data (no pipeline applied).
# Length = number of whitespace-separated words in a document.

document_lengths = [len(article.split()) for article in inputs.reshape(-1).tolist()]

# Tally how many documents share each length; Counter keeps first-seen order.
document_lengths_count = Counter(document_lengths)

fig, ax = plt.subplots()
ax.bar(document_lengths_count.keys(), document_lengths_count.values(), width=1)
ax.set_xlabel('Document Length')
ax.set_ylabel('Number of Documents')
ax.set_title('Number of Documents with Each Length without Pipeline')
plt.show()


In [26]:
# Distribution of document lengths after the cleaning pipeline.
# Length = number of whitespace-separated words in a cleaned document.

document_lengths = [len(cleaned.split()) for cleaned in results.reshape(-1).tolist()]

# Tally how many documents share each length; Counter keeps first-seen order.
document_lengths_count = Counter(document_lengths)

fig, ax = plt.subplots()
ax.bar(document_lengths_count.keys(), document_lengths_count.values(), width=1)
ax.set_xlabel('Document Length')
ax.set_ylabel('Number of Documents')
ax.set_title('Number of Documents with Each Length with Pipeline')
plt.show()

In [101]:
# Per-class variance of the raw article length (in whitespace-separated words).
# The label and length columns are (re)written here before aggregating.
dataframe['class'] = targets
dataframe['text_length'] = dataframe['full_article'].apply(
    lambda article: len(article.split())
)

# Aggregate once, then plot the result.
length_variance = dataframe.groupby('class')['text_length'].var()

fig, ax = plt.subplots()
length_variance.plot(kind='bar', ax=ax)
ax.set_xlabel('Class')
ax.set_ylabel('Variance of text length')
ax.set_title('Variance of text length for every class')
plt.show()



In [31]:
# Word cloud of the most common words across all classes (with pipeline).

# Concatenate every cleaned document into one string for the cloud generator.
corpus_text = ' '.join(results.reshape(-1).tolist())

cloud_generator = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=None,  # NOTE(review): per the wordcloud docs, None falls back to its built-in STOPWORDS
    min_font_size=10,
)
wordcloud = cloud_generator.generate(corpus_text)

# Square canvas, no axes, no padding: the image fills the whole figure.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()



In [39]:
# Average document length per class (in whitespace-separated words, after the pipeline).

# Accumulate, per class, the total number of words and the number of documents.
# The document count is tallied here rather than read from `class_counts`
# (defined in a much earlier cell), so this cell only needs `targets` and
# `results`. The accumulators are deliberately NOT named `dict`: that would
# shadow the builtin for the rest of the kernel session.
words_per_class = Counter()
documents_per_class = Counter()
for label, document in zip(targets, results):
    words_per_class[label] += len(document.split())
    documents_per_class[label] += 1

# Mean = total words / number of documents, in first-seen class order.
average_length_per_class = {
    label: words_per_class[label] / documents_per_class[label]
    for label in words_per_class
}

plt.bar(average_length_per_class.keys(), average_length_per_class.values())
plt.xlabel('Class')
plt.ylabel('Average Document Length')
plt.title('Average Document Length per Class')
plt.xticks(rotation='horizontal')
plt.show()


In [None]:
# Same cleaning pipeline as before but WITHOUT `unify_numbers`: per the original
# note, that step generates [NUM] tokens, which are not wanted in the
# word-frequency analysis that follows.
word_analysis_steps = [
    remove_contractions,
    clean_text,
    stop_words_removal,
]
pipeline = Pipeline(word_analysis_steps)

# Reload the raw data and re-run the reduced pipeline.
inputs, targets = get_dataset()
results = pipeline.execute(inputs)

In [56]:
# The three most common words for each class

TOP_N_WORDS = 3

# Build one word-frequency Counter per class from the cleaned documents.
# (The accumulator is not named `dict`, to avoid shadowing the builtin.)
word_counts_by_class = {}
for label, document in zip(targets, results):
    word_counts_by_class.setdefault(label, Counter()).update(document.split())

# For every class keep only its TOP_N_WORDS (word, count) pairs.
top_words_by_class = {
    label: word_counts.most_common(TOP_N_WORDS)
    for label, word_counts in word_counts_by_class.items()
}

# One subplot per class. The number of panels follows the data instead of being
# hard-coded to 5, and squeeze=False keeps `axs` 2-D even for a single class.
n_classes = len(top_words_by_class)
fig, axs = plt.subplots(1, n_classes, figsize=(5 * n_classes, 5), squeeze=False)
fig.text(0.5, 0.0001, 'Word', ha='center')
fig.text(0.09, 0.5, 'Count', va='center', rotation='vertical')
for ax, (label, top_words) in zip(axs[0], top_words_by_class.items()):
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    ax.bar(words, counts)
    ax.set_title(label)
    # Keep the word labels upright; bar() already placed one tick per word.
    ax.tick_params(axis='x', labelrotation=0)
plt.show()




In [58]:
# Plot a word cloud for each class

TOP_N_CLOUD_WORDS = 50

# Build one word-frequency Counter per class from the cleaned documents.
# (The accumulator is not named `dict`, to avoid shadowing the builtin.)
word_counts_by_class = {}
for label, document in zip(targets, results):
    word_counts_by_class.setdefault(label, Counter()).update(document.split())

# One subplot per class. The number of panels follows the data instead of being
# hard-coded to 5, and squeeze=False keeps `axs` 2-D even for a single class.
n_classes = len(word_counts_by_class)
fig, axs = plt.subplots(1, n_classes, figsize=(5 * n_classes, 5), squeeze=False)
for ax, (label, word_counts) in zip(axs[0], word_counts_by_class.items()):
    # Feed the real counts of the top words to the cloud. Joining the top words
    # into a string (each word once) would give every word frequency 1, so the
    # rendered sizes would carry no information.
    # A dict comprehension is used rather than dict(...) because other cells of
    # this notebook rebind the name `dict` in the kernel namespace.
    top_frequencies = {
        word: count for word, count in word_counts.most_common(TOP_N_CLOUD_WORDS)
    }
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    ).generate_from_frequencies(top_frequencies)
    ax.imshow(wordcloud)
    ax.axis("off")
    ax.set_title(label)
plt.show()



In [16]:
# Cleaning pipeline followed by TF-IDF vectorisation (steps run in listed order).
tfidf_steps = [
    remove_contractions,
    clean_text,
    stop_words_removal,
    unify_numbers,
    tfidf_vectorizer,
]
pipeline = Pipeline(tfidf_steps)

inputs, targets = get_dataset()

# Cap on the number of documents kept per class.
MAX_DOCS_PER_CLASS = 2500

# Keep at most MAX_DOCS_PER_CLASS documents per label (first occurrences, in
# original order), then drop the 'Voices' class entirely.
df = (
    pd.DataFrame({'full_article': inputs.reshape(-1), 'label': targets})
    .groupby('label')
    .head(MAX_DOCS_PER_CLASS)
    .loc[lambda frame: frame['label'] != 'Voices']
)
inputs = df['full_article'].values
targets = df['label'].values

# The pipeline output is flattened and its first element taken; per the
# original note that element is the sparse matrix.
vectorized = pipeline.execute(inputs)
results = vectorized.reshape(-1)[0]


In [17]:
# Project the vectorised documents to 2-D with t-SNE, coloured by class label.
# t-SNE is stochastic: without a fixed random_state every run of this cell
# draws a different embedding, so the figure could not be reproduced.
tsne = TSNEVisualizer(colormap='viridis', random_state=42)
tsne.fit(results, targets)
tsne.show()