In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *

from utils.text_analysis_transformers import RemovePunctuation, RemoveNonAscii
from utils.text_analysis_transformers import NltkWordTokenizer, WordLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from utils.TimeBasedCV import TimeBasedCV

from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

from sklearn.svm import SVR

from utils.data_utils import import_reddit_set

# Reddit word count analysis

The aim is to fill in the following table:

|             | Avg Intensity (weighted by the salience of the words in relation to the outcome variable) | Frequency of Words | Consumer Confidence | Saving Ratio |
|-------------|---------------------------------------------------------------------------------------------------|----------------------------|-----------------------------|----------------------|
| Correlation |                                                                                                   |                            |                             |                      |
| 1 Month     |                                                                                                   |                            |                             |                      |
| 3 Months    |                                                                                                   |                            |                             |                      |
| 6 Months    |                                                                                                   |                            |                             |                      |
| 9 Months    |                                                                                                   |                            |                             |                      |

## Outputs

- Does each feature correlate with the target variables?
- Do the targets correlate with each other? (sanity check)
- Do the features have predictive power?


In [9]:
def import_uk_confidence(path='data/consumer_confidence_index.csv', location="GBR"):
    """Load the monthly consumer confidence index for a single location.

    Reads the ``TIME``, ``Value`` and ``LOCATION`` columns of the CSV at
    ``path``, keeps the rows whose ``LOCATION`` equals ``location`` and parses
    the ``TIME`` strings (``YYYY-MM``) into timestamps.

    Parameters
    ----------
    path : str or path-like, default 'data/consumer_confidence_index.csv'
        Location of the consumer confidence CSV file.
    location : str, default "GBR"
        Value of the ``LOCATION`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``date`` (timestamp on the first day of the month) and
        ``value`` (the index value). The row labels of the CSV are kept as
        the index; it is not reset.

    Raises
    ------
    AssertionError
        If a ``TIME`` value occurs more than once for the selected location.
    """
    all_confidence = pd.read_csv(path, usecols=['TIME', 'Value', 'LOCATION'])

    uk_confidence = all_confidence.loc[all_confidence.LOCATION == location]

    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    periods_are_unique = uk_confidence.TIME.value_counts().eq(1).all()
    assert periods_are_unique, "duplicate entries for the same time period"

    date = pd.to_datetime(uk_confidence.TIME, format="%Y-%m")

    # clean dataframe:
    df = pd.DataFrame({'date': date, 'value': uk_confidence.Value})

    return df

In [None]:

# Word-frequency pipeline: the project's cleaning and lemmatising transformers
# run first, then CountVectorizer turns the text into n-gram counts.
word_frequency_pipe = Pipeline(steps=[
    ('remove_non_ascii', RemoveNonAscii()),
    ('remove_punctuation', RemovePunctuation()),
    ('lemmatize', WordLemmatizer()),
    (
        'count_vec',
        CountVectorizer(
            # -- decoding / normalisation --
            encoding='ascii',
            strip_accents=None,
            lowercase=True,
            preprocessor=None,  # cleaning is already handled by the steps above
            tokenizer=None,  # the NLTK word tokenizer also works here; trying the default first
            stop_words='english',
            # -- vocabulary construction --
            ngram_range=(1, 5),  # unigrams up to 5-grams
            max_df=0.9,  # float: ignore terms found in more than 90% of documents
            min_df=0.1,  # float: ignore terms found in fewer than 10% of documents
            max_features=None,
            vocabulary=None,
            binary=False,  # keep counts rather than presence flags
        ),
    ),
])


In [4]:
# Load the Reddit comments through the project helper.
# NOTE(review): `rows` is presumably an upper bound on rows to read (999999
# exceeds the 320,579 rows reported by describe() below) — TODO confirm.
reddit = import_reddit_set(rows=999999)

In [5]:
# Summary statistics for each column of the freshly loaded frame.
reddit.describe()

  """Entry point for launching an IPython kernel.


Unnamed: 0,body,date
count,320579,320579
unique,316721,305597
top,Vaccine,2020-11-09 14:03:30+00:00
freq,81,5
first,,2020-04-01 00:00:03+00:00
last,,2020-11-26 23:59:45+00:00


In [6]:
# Peek at the first rows of the raw data.
reddit.head()

Unnamed: 0,body,date
0,Rassistisch ist die Bill and Melinda Gates Sti...,2020-04-27 21:06:26+00:00
1,Dann sag mir doch bitte schön wie man die clin...,2020-04-22 07:43:36+00:00
2,Das Ding ist aber: Das wissen die meisten gar ...,2020-04-22 06:28:45+00:00
3,"Geil, wie ich wieder gedownvotet werde, genaus...",2020-04-21 19:13:13+00:00
4,Die Regierung in den letzten 2 Monaten: ein Wi...,2020-04-20 08:16:25+00:00


In [7]:
# Parse the timestamps and make them the index.
# Guarded so the cell can be re-run: once 'date' has become the index it is no
# longer a column, and an unguarded second run would fail on `reddit.date`.
if 'date' in reddit.columns:
    reddit = (
        reddit
        .assign(date=pd.to_datetime(reddit['date']))
        .set_index('date')
    )
reddit.head()

Unnamed: 0_level_0,body
date,Unnamed: 1_level_1
2020-04-27 21:06:26+00:00,Rassistisch ist die Bill and Melinda Gates Sti...
2020-04-22 07:43:36+00:00,Dann sag mir doch bitte schön wie man die clin...
2020-04-22 06:28:45+00:00,Das Ding ist aber: Das wissen die meisten gar ...
2020-04-21 19:13:13+00:00,"Geil, wie ich wieder gedownvotet werde, genaus..."
2020-04-20 08:16:25+00:00,Die Regierung in den letzten 2 Monaten: ein Wi...


In [8]:
# Store the calendar month number of each comment, taken from the datetime index.
# NOTE(review): the month number carries no year; fine while the data stays
# within 2020 (see describe() above) — revisit if the date range grows.
reddit['month'] = reddit.index.month
reddit.head()

Unnamed: 0_level_0,body,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-27 21:06:26+00:00,Rassistisch ist die Bill and Melinda Gates Sti...,4
2020-04-22 07:43:36+00:00,Dann sag mir doch bitte schön wie man die clin...,4
2020-04-22 06:28:45+00:00,Das Ding ist aber: Das wissen die meisten gar ...,4
2020-04-21 19:13:13+00:00,"Geil, wie ich wieder gedownvotet werde, genaus...",4
2020-04-20 08:16:25+00:00,Die Regierung in den letzten 2 Monaten: ein Wi...,4


# But this doesn't look quite right: every row shown above is from month 4. Check which months are actually present.

In [12]:
# Which calendar months actually occur in the index?
pd.unique(reddit.index.month)

array([ 4, 11])

In [16]:
# Spot-check a single row by position, far beyond the rows that head() shows.
reddit.iloc[70000, :]

body     Izah Azahari\n\nBrunei Darussalam is guarantee...
month                                                   11
Name: 2020-11-24 00:09:10+00:00, dtype: object

## Outcome variable: UK consumer confidence

In [10]:
# Import the outcome variable (UK consumer confidence index) and preprocess it
# into a two-column frame: `date` and `value`.
uk_confidence = import_uk_confidence()

uk_confidence.head()

Unnamed: 0,date,value
82,2014-01-01,100.396
83,2014-02-01,100.7097
84,2014-03-01,101.002
85,2014-04-01,101.236
86,2014-05-01,101.3725
