## Fresh start

In [None]:
from tqdm import tqdm
import pandas as pd
import os
import json
import numpy as np
import datetime
import pytz
import matplotlib.pyplot as plt
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
import unicodedata
import re
nltk.download('stopwords')
nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import LancasterStemmer,PorterStemmer

In [None]:
def readData(news_dir):
    """Read every news JSON file stored one folder level below ``news_dir``.

    Each file must hold a JSON object with the keys ``published`` (timestamp
    formatted as ``%Y-%m-%dT%H:%M:%S.%f%z``, or an empty string), ``text`` and
    ``thread.site_full``.

    Args:
        news_dir: Directory whose sub-folders contain the JSON files. A
            trailing path separator is optional.

    Returns:
        A DataFrame with one row per file and the columns ``date_time`` (raw
        ``published`` string), ``text`` (lower-cased body), ``day`` and
        ``time`` (publication date and time converted to UTC, as strings) and
        ``site``. Empty fields are replaced by ``"No Value"`` (``"no value"``
        for the text) and ``"unknown"`` for the site. Row order follows
        ``os.scandir`` and is therefore not guaranteed.
    """
    published_raw = []
    texts = []
    days = []
    times = []
    sites = []

    # Context managers close the directory iterators and file handles promptly.
    with os.scandir(news_dir) as folders:
        for folder in folders:
            # Only sub-folders hold articles; skip strays such as ".DS_Store".
            if folder.name == ".DS_Store" or not folder.is_dir():
                continue
            with os.scandir(folder.path) as entries:
                for entry in entries:
                    if entry.name == ".DS_Store" or not entry.is_file():
                        continue
                    with open(entry.path, encoding='utf-8') as handle:
                        data = json.load(handle)

                    if data['published'] != "":
                        published_raw.append(data['published'])
                        # Normalise to UTC so articles from different time
                        # zones are comparable on the same clock.
                        published_utc = datetime.datetime.strptime(
                            data['published'], "%Y-%m-%dT%H:%M:%S.%f%z"
                        ).astimezone(datetime.timezone.utc)
                        days.append(str(published_utc.date()))
                        times.append(str(published_utc.time()))
                    else:
                        published_raw.append("No Value")
                        days.append("No Value")
                        times.append("No Value")

                    if data['text'] != "":
                        texts.append(data['text'].lower())
                    else:
                        texts.append("no value")

                    if data['thread']['site_full'] != '':
                        sites.append(data['thread']['site_full'])
                    else:
                        sites.append("unknown")

    news_df = pd.DataFrame({
        'date_time': published_raw,
        'text': texts,
        'day': days,
        'time': times,
        'site': sites,
    })
    return news_df

In [None]:
# Location of the raw news files, relative to the notebook's working directory.
news_directory = "../../Data/News/"

# Parse every article found under that directory into a single DataFrame.
news_df = readData(news_directory)

In [None]:
# Sort the articles in place by the string columns `day`, then `time`, both ascending.
# The original index labels are kept (no reset).
news_df.sort_values(["day", "time"],axis = 0, ascending=[True, True], inplace=True)

In [None]:
def load_stock_price_dataset(path):
    """Load a header-less price CSV and keep only day, time and price.

    The file is read as seven unnamed columns; the first, second and fourth
    are returned as ``day``, ``time`` and ``price``. Dots in ``day`` are
    replaced by dashes (e.g. ``2018.01.02`` becomes ``2018-01-02``).

    Args:
        path: Path of the CSV file.

    Returns:
        A DataFrame with the columns ``day``, ``time`` and ``price``.
    """
    column_names = ['day', 'time', 'x1', 'price', 'x2', 'x3', 'x4']
    # `usecols` yields a fresh three-column frame instead of a slice of a
    # wider one, so the assignment below never writes to a derived copy.
    stock_df = pd.read_csv(path, names=column_names, usecols=['day', 'time', 'price'])
    # Literal (non-regex) replacement; vectorised and tolerant of missing values.
    stock_df['day'] = stock_df['day'].str.replace('.', '-', regex=False)
    return stock_df

In [None]:
# Amazon price table read from the AMAZON60 chart export
# (presumably 60-minute bars, judging by the file name — confirm).
amazon_stock_price_60 = load_stock_price_dataset('../../Data/CHARTS/AMAZON60.csv')

In [None]:
# Apple price table read from the APPLE60 chart export
# (presumably 60-minute bars, judging by the file name — confirm).
apple_stock_price_60 = load_stock_price_dataset('../../Data/CHARTS/APPLE60.csv')

In [None]:
def get_label(stocks_df, day, time):
    """Return the price direction at the first price row after a timestamp.

    The "next" row is the first row of ``stocks_df`` on the same ``day`` with
    a ``time`` greater than ``time``; if there is none, the first row of any
    later day is used. Its price is compared with the price of the row
    directly before it in ``stocks_df``.

    Rows are located by position, so the result does not depend on the index
    labels of ``stocks_df``. Comparisons use the values as stored (strings in
    this notebook), and the frame is assumed to be in chronological order.

    Args:
        stocks_df: DataFrame with the columns ``day``, ``time`` and ``price``.
        day: Day of the event, in the same format as ``stocks_df.day``.
        time: Time of the event, comparable with ``stocks_df.time``.

    Returns:
        ``1`` if the next price is greater than or equal to the previous one,
        ``0`` if it is lower, and ``np.nan`` when no later row exists.
    """
    day_column = stocks_df['day']

    later_same_day = (day_column == day) & (stocks_df['time'] > time)
    candidates = np.flatnonzero(later_same_day.to_numpy())
    if candidates.size == 0:
        # Nothing left on that day: fall back to the first row of a later day.
        candidates = np.flatnonzero((day_column > day).to_numpy())
        if candidates.size == 0:
            return np.nan

    next_pos = int(candidates[0])
    # The very first row has no predecessor; it is compared with itself,
    # which yields label 1.
    prev_pos = max(next_pos - 1, 0)

    prices = stocks_df['price']
    return 1 if prices.iloc[next_pos] >= prices.iloc[prev_pos] else 0


def assign_labels(news_df, stocks_df_1, stocks_df_2):
    """Label every news row against two price tables.

    For each row of ``news_df`` the label is computed with ``get_label`` from
    the row's ``day`` and ``time``, once per price table.

    Args:
        news_df: DataFrame with the columns ``text``, ``day``, ``time`` and
            ``site``.
        stocks_df_1: Price table used for ``label_1``.
        stocks_df_2: Price table used for ``label_2``.

    Returns:
        A new DataFrame with the columns ``text``, ``label_1``, ``label_2``,
        ``day``, ``time`` and ``site``, one row per input row, in input order
        and with a fresh 0..n-1 index. Column dtypes are inferred by pandas.
    """
    columns = ["text", "label_1", "label_2", "day", "time", "site"]

    # Collect plain rows and build the frame once; growing a DataFrame one
    # row at a time copies it on every insert and is quadratic overall.
    articles = zip(news_df["text"], news_df["day"], news_df["time"], news_df["site"])
    rows = [
        [
            text,
            get_label(stocks_df_1, day, time),
            get_label(stocks_df_2, day, time),
            day,
            time,
            site,
        ]
        for text, day, time, site in tqdm(articles, total=len(news_df))
    ]
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Label every article: label_1 is computed from the Amazon prices, label_2 from the Apple prices.
labels_df = assign_labels(news_df, amazon_stock_price_60, apple_stock_price_60)

In [None]:
def get_sentences(paragraph):
    """Split ``paragraph`` into sentences, then break each one on newlines.

    ``sent_tokenize`` performs the first split; every resulting chunk is
    further divided at ``"\\n"``. Empty pieces are kept.

    Returns:
        A flat list of strings in their original order.
    """
    return [
        line
        for chunk in sent_tokenize(paragraph)
        for line in chunk.split("\n")
    ]

In [None]:
# Store, for each article, the list of sentences extracted from its text.
labels_df['sentences'] = labels_df.text.apply(get_sentences)

In [None]:
# Number of sentences found in each article.
labels_df['n_sentences'] = labels_df['sentences'].apply(len)

In [None]:
# Summary statistics of the per-article sentence counts.
labels_df.n_sentences.describe()

In [None]:
# Go from one row per article to one row per sentence: drop the article-level
# columns, expand each sentence list into rows, and renumber the index.
sentences_df = labels_df.drop(columns=["n_sentences","text"]).explode('sentences').reset_index(drop=True)

In [None]:
# Inspect the (rows, columns) size of the sentence-level table.
sentences_df.shape

In [None]:
# Discard rows without a sentence (explode yields NaN for an empty sentence list).
sentences_df = sentences_df.dropna(subset=['sentences'])

In [None]:
# Preview the first rows of the sentence-level table.
sentences_df.head()

In [None]:
def nonAsciiChar(words):
    """Reduce every word to lower-case ASCII letters.

    Each word is first folded to ASCII (NFKD decomposition, then dropping
    whatever cannot be encoded), so accented letters keep their base letter
    ("café" becomes "cafe"). Only afterwards is everything that is not a
    letter removed; stripping first would delete accented letters outright.

    Args:
        words: Iterable of strings.

    Returns:
        A list with one cleaned string per input word; a word with no
        letters becomes the empty string.
    """
    words_list = []
    for w in words:
        ascii_word = unicodedata.normalize('NFKD', w).encode('ascii', 'ignore').decode('ascii')
        words_list.append(re.sub(r'[^a-z]+', '', ascii_word.lower()))

    return words_list
def stemWordsRemoval(words):
    """Stem every word with a Porter stemmer and drop repeated stems.

    Returns:
        The distinct stems as a list, in order of first appearance.
    """
    stem = PorterStemmer().stem
    # dict keys keep insertion order, giving a first-occurrence de-duplication.
    return list(dict.fromkeys(stem(token) for token in words))

def stopWordsRemoval(words):
    """Return the words that are not in the module-level ``stopwords`` list."""
    return [token for token in words if token not in stopwords]

def removeLinks(words):
    """Drop tokens that look like web addresses, i.e. that start with "www".

    A plain prefix test is used on purpose: the regex character class
    ``[www]`` matches any single "w" and would therefore discard every word
    beginning with that letter ("week", "world", ...).

    Args:
        words: Iterable of strings.

    Returns:
        A list of the words that do not start with "www", in input order.
    """
    return [w for w in words if not w.startswith('www')]

def spaceRemoval(words):
    """Return ``words`` without the entries equal to the empty string."""
    return [token for token in words if token != '']


def dataExtraction(words):
    """Run the full token clean-up pipeline over ``words``.

    The steps are applied in this order: letter-only normalisation, removal
    of empty tokens, stop-word removal, stemming with de-duplication, and
    link removal.

    Returns:
        The list of tokens left after the last step.
    """
    pipeline = (
        nonAsciiChar,
        spaceRemoval,
        stopWordsRemoval,
        stemWordsRemoval,
        removeLinks,
    )
    for step in pipeline:
        words = step(words)
    return words

In [None]:
def process_sentences(sentences_df):
    """Clean the ``sentences`` column of ``sentences_df`` into a ``text`` column.

    Only rows whose sentence starts with a letter, digit or space are kept.
    Every row that occurs more than once is then removed entirely
    (``keep=False``) and the index is renumbered. The remaining sentences are
    tokenised, passed through ``dataExtraction`` and re-joined with single
    spaces.

    Returns:
        A new DataFrame without the ``sentences`` column and with the cleaned
        ``text`` column added.
    """
    starts_alnum = sentences_df['sentences'].str.match('^[A-Z a-z 0-9]+')
    kept = (
        sentences_df[starts_alnum]
        .drop_duplicates(keep=False)
        .reset_index(drop=True)
    )
    kept['words'] = kept['sentences'].apply(word_tokenize).apply(dataExtraction)
    kept['text'] = kept['words'].apply(" ".join)
    return kept.drop(columns=["words", "sentences"])

In [None]:
# Keep the sentences containing any Amazon keyword. The keywords are joined
# into a regex alternation, so this is a substring match anywhere in the sentence.
amazon_words = ['amazon', 'amzn']
amazon_news_df = sentences_df[sentences_df.sentences.str.contains("|".join(amazon_words))]
# Clean the selected sentences into the final `text` column.
amazon_news_df = process_sentences(amazon_news_df)

In [None]:
# Keep the sentences containing any Apple keyword. The keywords are joined
# into a regex alternation, so this is a substring match anywhere in the
# sentence (e.g. "pineapple" also matches).
apple_words = ['apple', 'aapl']
apple_news_df = sentences_df[sentences_df.sentences.str.contains("|".join(apple_words))]
# Clean the selected sentences into the final `text` column.
apple_news_df = process_sentences(apple_news_df)

In [None]:
# Preview the cleaned Amazon sentences.
amazon_news_df.head()

In [None]:
# Inspect the (rows, columns) size of the cleaned Apple sentences.
apple_news_df.shape

In [None]:
# Amazon training table: keep only the Amazon label (label_1) under the name
# "label", drop the rows that could not be labelled, and renumber the index.
amazon_labels = (
    amazon_news_df
    .drop(columns=['label_2'])
    .rename(columns={"label_1": "label"})
    .dropna(subset=['label'])
    .reset_index(drop=True)
)

In [None]:
# Apple training table: keep only the Apple label (label_2) under the name
# "label", drop the rows that could not be labelled, and renumber the index.
apple_labels = (
    apple_news_df
    .drop(columns=['label_1'])
    .rename(columns={"label_2": "label"})
    .dropna(subset=['label'])
    .reset_index(drop=True)
)

In [None]:
# Distinct days of the Apple sentences whose label is missing (label_2 is NaN).
apple_news_df[apple_news_df.label_2.isna()].day.unique()

In [None]:
# Distinct days of the Amazon sentences whose label is missing (label_1 is NaN).
amazon_news_df[amazon_news_df.label_1.isna()].day.unique()

In [None]:
# Write the labelled Apple sentences to CSV without the index column.
apple_labels.to_csv("../data/processed/full/apple_labelled_60_special.csv", index = False)

In [None]:
# Write the labelled Amazon sentences to CSV without the index column.
amazon_labels.to_csv("../data/processed/full/amazon_labelled_60_special.csv", index = False)

In [None]:
# Show the `day` value of the second row of the Amazon price table.
amazon_stock_price_60.iloc[1].day

In [None]:
# Show the article at position 18 of the sorted news table.
news_df.iloc[18]

In [None]:
# Inspect the (rows, columns) size of the labelled Apple table.
apple_labels.shape