In [None]:
import gensim
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from raceplotly.plots import barplot
from sklearn.feature_extraction.text import CountVectorizer

## First-time users should uncomment the four nltk.download lines below
    
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

First, the data is read into a dataframe:

In [None]:
### Load the tab-separated blog data and keep only the rows that have a text

df = (
    pd.read_csv("../../data/final/futurice_blog_data.csv", sep="\t")
    .dropna(subset=["text"])
)
df.head(5)

# Graphs of the basic statistics
## Average sentence length, with the problematic results removed

In [None]:
# Distribution of the average sentence length per row.
# Rows with an average sentence length of 300 or more are left out
# (the section heading treats them as problematic results).
plt.hist(df[df["average_sentence_length"] < 300 ]["average_sentence_length"], bins=20)
plt.xlabel('Sentence length')
plt.ylabel('Frequency')
# The plotted column is the average sentence length, so the title says so.
plt.title('Average sentence length histogram')
plt.show()

## Readability scores

In [None]:
# Distribution of the Flesch reading-ease scores, restricted to rows whose
# average sentence length is below 300.
keep = df["average_sentence_length"] < 300
fig, ax = plt.subplots()
ax.hist(df.loc[keep, "flesch"], bins=20)
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
ax.set_title('Flesch reading ease scores histogram')
plt.show()

In [None]:
# Distribution of the Dale-Chall readability scores, restricted to rows whose
# average sentence length is below 300.
keep = df["average_sentence_length"] < 300
fig, ax = plt.subplots()
ax.hist(df.loc[keep, "dale_chall"], bins=20)
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
ax.set_title('Dale-Chall readability score histogram')
plt.show()

In [None]:
# Distribution of the total text length per row, restricted to rows whose
# average sentence length is below 300.
plt.hist(df[df["average_sentence_length"] < 300 ]["text_length"], bins=20)
# The x axis shows the text_length column, not a sentence length.
plt.xlabel('Text length')
plt.ylabel('Frequency')
plt.title('Total text length histogram')
plt.show()

# Some additional statistics
These are statistics that might not be able to fit into the csv file

## Most common word in a period of time
In this part, some rows have `nan` as their date. Those rows are removed completely.

Now we can start doing the real work. But first, let's split the data into intervals of a given number of months.

In [None]:
### Helper functions
# Generate interval based on a date range
def get_date_interval(startDate, endDate, month_interval):
    """Split a date range into consecutive intervals of ``month_interval`` months.

    Interval starts are the month-start dates produced by ``pd.date_range``
    between ``startDate`` and ``endDate`` (``endDate`` itself is excluded as a
    start). Each interval ends one day before the next start; the last
    interval ends at ``endDate``.

    Returns:
        A list of ``(start, end)`` tuples. Starts and all ends except the last
        are ``'%Y-%m-%d'`` strings; the last end is ``endDate`` exactly as it
        was passed in (not reformatted).
    """
    starts = pd.date_range(
        start=startDate,
        end=endDate,
        freq="{}MS".format(month_interval),
        inclusive='left',
    )
    # Every interval but the last closes the day before the next one opens.
    ends = starts[1:] - pd.Timedelta(days=1)

    start_labels = [stamp.strftime('%Y-%m-%d') for stamp in starts]
    end_labels = [stamp.strftime('%Y-%m-%d') for stamp in ends]
    end_labels.append(endDate)
    return list(zip(start_labels, end_labels))

# Preprocessing: tokenization, stopwords removal, lemmatization, and stemming
stemmer = SnowballStemmer("english")
# Built once and shared, instead of constructing a new lemmatizer for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_stem(text):
    """Lemmatize ``text`` as a verb (``pos='v'``) and stem the result with the English Snowball stemmer."""
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens worth keeping.

    Tokens come from gensim's ``simple_preprocess`` (``min_len=3``). A token is
    kept only if it is longer than three characters and is not in gensim's
    stopword list; each kept token is passed through ``lemmatize_stem``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text, min_len=3)
    return [
        lemmatize_stem(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stopwords
    ]


def cooking_2(df_main, period=1):
    """Compute word frequencies per date interval, normalised by the number of texts.

    Rows whose ``time`` is missing are dropped and the remaining rows are
    sorted by ``time``. The date range between the first and last row is split
    by ``get_date_interval`` into intervals of ``period`` months, and every
    row's ``text`` is assigned to the interval that contains its date.

    The intervals start at the first month start on or after the earliest
    date, so rows dated before that are counted in the first interval. If the
    range contains no month start at all, all rows form a single interval.

    Args:
        df_main: DataFrame with a ``time`` and a ``text`` column. It is not modified.
        period: Interval length in months.

    Returns:
        A dict mapping each interval's end date (``'%Y-%m-%d'`` string) to a
        ``Counter`` of preprocessed token -> occurrences divided by the number
        of texts in that interval. Intervals without texts are omitted. An
        empty dict is returned when no row has a valid ``time``.
    """
    # TODO: get_blog_in_range performs a similar bucketing; consider sharing one implementation.
    df_ = df_main.copy()
    df_["time"] = pd.to_datetime(df_["time"])
    df_ = df_[df_["time"].notna()]   # Drop the rows that have no date
    df_ = df_.sort_values(by='time', ascending=True)  # Sort the rows by the date
    if df_.empty:
        return {}

    # Get all available date intervals
    first_time = df_["time"].iloc[0]
    last_time = df_["time"].iloc[-1]
    last_label = last_time.strftime('%Y-%m-%d')
    date_intervals = get_date_interval(first_time, last_time, period)
    if date_intervals:
        # The helper leaves the final end date as the raw timestamp; store it as a string like the others.
        date_intervals[-1] = (date_intervals[-1][0], last_label)
    else:
        # No month start between the first and the last date: one interval covers everything.
        date_intervals = [(first_time.strftime('%Y-%m-%d'), last_label)]

    # Walk the sorted rows and the sorted interval starts together. Binning by
    # interval start (rather than by a [start, end] test) keeps rows that carry
    # a time of day on an interval's last day inside that interval.
    interval_starts = [pd.Timestamp(start) for start, _ in date_intervals]
    texts_per_interval = [[] for _ in date_intervals]
    slot = 0
    for time, text in zip(df_["time"], df_["text"]):
        while slot + 1 < len(interval_starts) and time >= interval_starts[slot + 1]:
            slot += 1
        # Rows dated before the first start stay in slot 0.
        texts_per_interval[slot].append(text)

    ## Final processing to get the results
    blog_in_range = {}
    for (_, end_label), texts in zip(date_intervals, texts_per_interval):
        if not texts:
            continue  # Skip the intervals that do not contain any texts
        word_counts = Counter(preprocess(" ".join(texts)))
        n_texts = len(texts)
        blog_in_range[end_label] = Counter(
            {word: count / n_texts for word, count in word_counts.items()}
        )
    return blog_in_range

# Normalised word frequencies for the loaded blog data, using the default one-month period
test = cooking_2(df)

In [None]:
# Tabular view of the result: one column per interval end date, one row per word
pd.DataFrame(test)

## Word trend throughout the months

Here, I create a new function that builds a trend graph automatically and saves it as an HTML file. The function takes the dataframe, the period length in months (default 1), the number of bars shown at a time (default 10), and the duration of each animation frame (default 1000).

In [None]:
def huffing_rpl_2(df, period=1, nbars=10, plength=1000):
    """Build a bar chart race of word frequencies and save it as an HTML file.

    Args:
        df: DataFrame passed on to ``cooking_2``.
        period: Interval length in months, also used in the title and file name.
        nbars: Number of bars shown per frame (``top_entries``).
        plength: Frame duration passed to the plot (``frame_duration``).

    Returns:
        None. The figure is written to ``../data/figs/<period>months.html``.
    """
    # cooking_2 gives {date: {word: value}}; transpose so that dates become the index.
    by_date = pd.DataFrame(cooking_2(df, period)).fillna(0).T
    # Long format: one row per (date, word) pair.
    long_form = (
        by_date
        .melt(ignore_index=False)
        .reset_index()
        .rename(columns={"index":"date"})
    )
    race = barplot(
        long_form,
        item_column="variable",
        value_column="value",
        time_column="date",
        top_entries=nbars,
    )
    chart_title = "Word popularity by {:d}-month period".format(period)
    fig = race.plot(
        title=chart_title,
        item_label="Words",
        value_label="Count",
        time_label="Time: ",
        frame_duration=plength,
    )
    fig.update_layout(font={'size':17}, plot_bgcolor='black', height=600)
    output_path = "../data/figs/{:d}months.html".format(period)
    fig.write_html(output_path)

# huffing_rpl_2(df, period=1)
# huffing_rpl_2(df, period=3)
# huffing_rpl_2(df, period=6)
# huffing_rpl_2(df, period=12)


# Format 
`{ date : ([(word, count)], total_text) }`

In [None]:
def get_blog_in_range(df_main, period=1):
    """Group the texts of ``df_main`` by date interval.

    Rows whose ``time`` is missing are dropped and the remaining rows are
    sorted by ``time``. The date range between the first and last row is split
    by ``get_date_interval`` into intervals of ``period`` months, and every
    row's ``text`` is assigned to the interval that contains its date.

    The intervals start at the first month start on or after the earliest
    date, so rows dated before that are counted in the first interval. If the
    range contains no month start at all, all rows form a single interval.

    Args:
        df_main: DataFrame with a ``time`` and a ``text`` column. It is not modified.
        period: Interval length in months.

    Returns:
        A dict mapping each interval's end date (``'%Y-%m-%d'`` string) to a
        ``(texts, number_of_texts)`` tuple. Intervals without texts are
        omitted. An empty dict is returned when no row has a valid ``time``.
    """
    df_ = df_main.copy()
    df_["time"] = pd.to_datetime(df_["time"])
    df_ = df_[df_["time"].notna()]   # Drop the rows that have no date
    df_ = df_.sort_values(by='time', ascending=True)  # Sort the rows by the date
    if df_.empty:
        return {}

    # Get all available date intervals
    first_time = df_["time"].iloc[0]
    last_time = df_["time"].iloc[-1]
    last_label = last_time.strftime('%Y-%m-%d')
    date_intervals = get_date_interval(first_time, last_time, period)
    if date_intervals:
        # The helper leaves the final end date as the raw timestamp; store it as a string like the others.
        date_intervals[-1] = (date_intervals[-1][0], last_label)
    else:
        # No month start between the first and the last date: one interval covers everything.
        date_intervals = [(first_time.strftime('%Y-%m-%d'), last_label)]

    # Walk the sorted rows and the sorted interval starts together. Binning by
    # interval start (rather than by a [start, end] test) keeps rows that carry
    # a time of day on an interval's last day inside that interval.
    interval_starts = [pd.Timestamp(start) for start, _ in date_intervals]
    texts_per_interval = [[] for _ in date_intervals]
    slot = 0
    for time, text in zip(df_["time"], df_["text"]):
        while slot + 1 < len(interval_starts) and time >= interval_starts[slot + 1]:
            slot += 1
        # Rows dated before the first start stay in slot 0.
        texts_per_interval[slot].append(text)

    # Keep only the intervals that contain at least one text
    return {
        end_label: (texts, len(texts))
        for (_, end_label), texts in zip(date_intervals, texts_per_interval)
        if texts
    }
    

def grillin(df_main, period=1, ngram_range=(2, 2)):
    """Compute n-gram frequencies per date interval, normalised by the number of texts.

    The texts of each interval (from ``get_blog_in_range``) are joined into one
    document and counted with a ``CountVectorizer`` that removes English stop
    words. N-grams made up only of numeric words are discarded.

    Args:
        df_main: DataFrame passed on to ``get_blog_in_range``.
        period: Interval length in months.
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.

    Returns:
        A dict mapping each interval key to a dict of
        n-gram -> occurrences divided by the number of texts in that interval.
    """
    blog_in_range = get_blog_in_range(df_main, period)
    # Collect the results in a new dict instead of rewriting the one being iterated.
    frequencies = {}
    for interval, (text_list, total_text) in blog_in_range.items():
        combined_text = " ".join(text_list)
        # A fresh vectorizer per interval: each interval has its own vocabulary.
        vectorizer = CountVectorizer(stop_words='english', ngram_range=ngram_range)
        matrix = vectorizer.fit_transform([combined_text])
        counts = matrix.toarray()[0]
        frequencies[interval] = {
            ngram: count / total_text
            for ngram, count in zip(vectorizer.get_feature_names_out(), counts)
            # Feature names of n-grams contain spaces, so str.isnumeric() on the
            # whole n-gram is always False for n > 1; test each word instead.
            if not all(word.isnumeric() for word in ngram.split())
        }
    return frequencies

# test2 = grillin(df)

In [None]:
# test2= pd.DataFrame(test2).fillna(0)

In [None]:
def huffing_rpl_3(df, period=1, nbars=10, plength=1000):
    """Build a bar chart race of the ``grillin`` frequencies and save it as an HTML file.

    Args:
        df: DataFrame passed on to ``grillin``.
        period: Interval length in months, also used in the title and file name.
        nbars: Number of bars shown per frame (``top_entries``).
        plength: Frame duration passed to the plot (``frame_duration``).

    Returns:
        None. The figure is written to ``../data/figs/<period>months_bigrams.html``.
    """
    # grillin gives {date: {ngram: value}}; transpose so that dates become the index.
    by_date = pd.DataFrame(grillin(df, period)).fillna(0).T
    # Long format: one row per (date, n-gram) pair.
    long_form = (
        by_date
        .melt(ignore_index=False, value_name='frequency_count')
        .reset_index()
        .rename(columns={"index":"date"})
    )
    race = barplot(
        long_form,
        item_column="variable",
        value_column="frequency_count",
        time_column="date",
        top_entries=nbars,
    )
    chart_title = "Word popularity by {:d}-month period".format(period)
    fig = race.plot(
        title=chart_title,
        item_label="Words",
        value_label="Count",
        time_label="Time: ",
        frame_duration=plength,
    )
    fig.update_layout(font={'size':17}, plot_bgcolor='black', height=600)
    output_path = "../data/figs/{:d}months_bigrams.html".format(period)
    fig.write_html(output_path)

# huffing_rpl_3(df, period=1)
# huffing_rpl_3(df, period=3)
# huffing_rpl_3(df, period=6)
# huffing_rpl_3(df, period=12)

In [None]:
# vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 2))
# matrix = vectorizer.fit_transform(['technology revolution technological revolutionise'])
# list(zip(vectorizer.get_feature_names_out(), matrix.toarray()[0]))